LinHES Forums • View topic - Announcement: A Batch IMDB Grabber

jesse · **Posted:** Sat Aug 20, 2005 7:34 am

Here is the help output:

Code:

IMDB Batch Grabber (v1.01) by Tim Harvey, Andrei Rjeousski, Jesse Anderson
Looks through the video held in the videos database and performs queries using the www.imdb.com website.
usage: ./imdbbatchgrabber.pl -hviANHUP [parameters]
       -h           help
       -v           display version
       -i           display info

       -A Get IMDB data for ALL movies in database
       -N Get IMDB data for NEW movies in database (DEFAULT)
       -Host Specify a new host (DEFAULT:127.0.0.1)
       -User Specify a new user (DEFAULT:mythtv)
       -Password Specify a new password (DEFAULT:mythtv)
       -Images Specify a new directory for images (DEFAULT:/myth/movie_posters/)

Program Description:
You have probably used the Utilities/Setup->Video Manager->Search IMDB function to get the data for your movie from IMDB. However, to get the information for each movie, you have to repeat this process many, many times. Batch IMDB Grabber does all of this at once.

Process Description:
First, it gets all of the database entries based on the ALL or NEW selection. Then it queries IMDB to get each movie's IMDB number. If there are multiple search hits, it gives you a list and a prompt to choose the right one. Then it gets the data from IMDB and puts it in the database. Finally, all of the images are downloaded.

I don't have a link where you can download it yet. For now you will have to create your own file and make it executable. Cecil has told me that it will be in the next release of KnoppMyth. Here is the code for imdbbatchgrabber.pl:

Code:

#!/usr/bin/perl

#
# This perl script is intended to perform movie data lookups based on 
# the popular www.imdb.com website
#
# For more information on MythVideo's external movie lookup mechanism, see
# the README file in this directory.
#
# Author: Tim Harvey (tharvey AT alumni.calpoly DOT edu)
# Modified: Andrei Rjeousski
# Modified Again: Jesse Anderson
# v1.1
# - Added amazon.com covers and improved handling for imdb posters
# v1.2
#     - when searching amazon, try searching for main movie name and if nothing is found, search for informal name
#     - better handling for amazon posters, see if movie title is a substring in the search results returned by amazon
#     - fixed redirects for some movies on impawards

use DBI;
use LWP::Simple;      # libwww-perl providing simple HTML get actions
use HTML::Entities;
use URI::Escape;
use XML::Simple;

use Getopt::Long;

# Program identity; read by version() when printing the banner.
($title, $version, $author) = (
   "IMDB Batch Grabber",
   "v1.1",
   "Tim Harvey, Andrei Rjeousski, Jesse Anderson",
);

# Database connection defaults; -Host, -User and -Password override them.
my ($host, $db, $db_user, $db_password) =
   ('127.0.0.1', 'mythconverg', 'mythtv', 'mythtv');

# Query that selects the videos to process; filled in below and read by
# startBatchProcess().
my $sql;

# Directory the posters are saved in; -Images overrides it.
my $imageDirectory = '/myth/movie_posters/';

# Boolean switches, all off until GetOptions turns them on.
my ($hflag, $iflag, $vflag, $Nflag, $Aflag) = ('') x 5;

GetOptions(
   "h"          => \$hflag,
   "i"          => \$iflag,
   "v"          => \$vflag,
   "N"          => \$Nflag,
   "A"          => \$Aflag,
   "Host=s"     => \$host,
   "User=s"     => \$db_user,
   "Password=s" => \$db_password,
   "Images=s"   => \$imageDirectory,
);

# -v and -i print a single line and stop; -v wins when both are given.
if ($vflag) {
   version();
   exit 1;
}
if ($iflag) {
   info();
   exit 1;
}

# help() ends in usage(), which exits, so nothing below runs after -h.
help() if $hflag;

# NOTE(review): usage() advertises -N as the default, but when neither -A
# nor -N is given the script only prints the help text.
if ($Aflag || $Nflag) {
   if ($Aflag) {
      # -A takes precedence over -N: every row, whatever data it holds.
      print "Processing all database entries\n\n";
      $sql = 'select inetref, filename, intId from videometadata order by title';
   }
   else {
      # Only rows that still carry the placeholder inetref and plot.
      print "Processing new database entries\n\n";
      $sql = 'select inetref, filename, intId from videometadata where inetref = 00000000 and plot="None" order by title';
   }
   startBatchProcess();
}
else {
   help();
}

# Print the command-line synopsis and the option list (with the currently
# configured defaults), then terminate the program with exit(-1).
sub usage {
   my @helpLines = (
      "usage: $0 -hviANHUP [parameters]",
      "       -h           help",
      "       -v           display version",
      "       -i           display info",
      "",
      "       -A Get IMDB data for ALL movies in database",
      "       -N Get IMDB data for NEW movies in database (DEFAULT)",
      "       -Host Specify a new host (DEFAULT:$host)",
      "       -User Specify a new user (DEFAULT:$db_user)",
      "       -Password Specify a new password (DEFAULT:$db_password)",
      "       -Images Specify a new directory for images (DEFAULT:$imageDirectory)",
   );

   # One line of output per entry, each newline-terminated.
   print join("\n", @helpLines), "\n";
   exit(-1);
}

# Print the one-line banner "<title> (<version>) by <author>" built from the
# $title, $version and $author globals.
sub version {
   print $title, " (", $version, ") by ", $author, "\n"
}

# Print a one-line description of the kind of query this program performs.
sub info {
   my $summary = "Looks through the video held in the videos database and performs queries using the www.imdb.com website.";
   print $summary, "\n";
}

# Print the full help text: version banner, one-line description, then the
# option list.  usage() exits the program, so help() does not return.
sub help {
   for my $printer (\&version, \&info, \&usage) {
      $printer->();
   }
}

# Database handle and the statement handle for the selection query.  Both are
# assigned in startBatchProcess(); getMovieData() also uses $dbh.
# (The original line read "$dbh, sth;": a no-op with a missing sigil.)
our ($dbh, $sth);

# Turn a video file path into a lower-case title suitable for searching.
#
# Parameters:
#   $moviename - path or file name of the video.
# Returns:
#   The lower-cased base name with the directory part, the last file
#   extension and a trailing multi-part marker ("-cd1", "cd 2", "disc 1",
#   ...) removed.  Returns '' for an undefined argument.
sub getMovieDataFromName {
    my ($moviename) = @_;

    return '' unless defined $moviename;

    $moviename = lc($moviename);

    # Keep only what follows the last '/'.
    $moviename =~ s{\A.*/}{}s;

    # Drop the last extension, if there is one.  Names without a dot are left
    # alone (index arithmetic used to chop their final character).
    $moviename =~ s/\.[^.]*\z//;

    # Remove a "cd N" / "disc N" part marker and everything after it.  The
    # marker has to start a word and be followed by a number, so titles that
    # merely contain the letters ("discovery", "acdc") stay intact, and at
    # least one character of title must remain in front of it.
    if ($moviename =~ /\A(.+?)[\s._-]*(?<![a-z0-9])(?:cd|disc)[\s._-]*\d+/s) {
        $moviename = $1;
    }

    return $moviename;
}

# Ask the user which of $max listed movies is the right one.
# Returns the chosen number (1..$max), or 0 when the user chooses to skip or
# standard input is exhausted.  Anything else is rejected and asked again.
sub _promptForMovieNumber {
   my ($max) = @_;

   while (1) {
      print "\nPlease input corresponding movie number or 0 to skip:";
      my $answer = <STDIN>;

      # End of input: nothing more can be asked, so skip this movie.
      return 0 unless defined $answer;

      if ($answer =~ /\A\s*(\d+)\s*\z/) {
         my $number = $1 + 0;
         return $number if $number <= $max;
      }
      print "Please enter a number between 0 and $max.\n";
   }
}

# Run the whole batch: read the videos selected by $sql, look each one up on
# IMDB, let the user resolve ambiguous matches, store the movie data and
# finally download a poster for every matched movie.
#
# Uses the file-level settings $db, $host, $db_user, $db_password, $sql and
# $imageDirectory, and leaves the connection in the shared $dbh while running.
# Dies when the database cannot be reached or a statement fails.
sub startBatchProcess {
   $dbh = DBI->connect("dbi:mysql:$db:$host", $db_user, $db_password)
      or die "Could not connect to database $db on $host: $DBI::errstr\n";

   $sth = $dbh->prepare($sql)
      or die "Could not prepare SQL statement ... maybe invalid?";

   $sth->execute ||
           die "Could not execute SQL statement ... maybe invalid?";

   my @allmovies;           # per video: array ref of [title, imdb number] hits
   my @originalMovieName;   # per video: the title derived from the file name
   my @originalIntId;       # per video: the intId of its videometadata row
   my @selectedInetref;     # per video: chosen imdb number, or -1 to skip

   #process all database results and get IMDB's list of hits
   print "\n\n\n\nTime to get all database entries and query IMDB...\nWorking";
   while (my @row = $sth->fetchrow_array) {
      my $movieName = getMovieDataFromName( $row[1] );
      my @hits = getMovieList( $movieName );

      if ( @hits ) {
         print ".";
      } else {
         print "\nError: IMDB could not find a match for $movieName.  That movie will be skipped.\n";
      }

      push @allmovies, [ @hits ];
      push @originalMovieName, $movieName;
      push @originalIntId, $row[2];
   }

   #Get user input for whichever movies dont have a direct match.
   print "\n\n\n\nDone getting IMDB movie list.\nTime for you to sort out any movies that had more than one hit...";
   for my $i ( 0 .. $#allmovies ) {
      my @hits = @{ $allmovies[$i] };

      #the movie wasn't found.  all inet refs of -1 are ignored
      if ( !@hits ) {
         $selectedInetref[$i] = -1;
         next;
      }

      # A single hit is taken as is; several hits are listed for the user.
      my $selectedNum = 0;
      if ( @hits > 1 ) {
         print "\n\n\nFor \"$originalMovieName[$i]\" IMDB has the following names:\n";
         for my $h ( 0 .. $#hits ) {
            my $displayNum = $h + 1;
            print "$displayNum-$hits[$h][0]-$hits[$h][1]\n";
         }

         #if user puts 0 then it gets decremented to be -1, which means skip
         $selectedNum = _promptForMovieNumber( scalar @hits ) - 1;
      }

      my $inetref = $selectedNum < 0 ? undef : $hits[$selectedNum][1];
      $selectedInetref[$i] = ( defined $inetref && $inetref ne '' ) ? $inetref : -1;
   }

   #Now get movie data and put in database
   print "\n\n\n\nDone getting IMDB refs.  Time to get data and put it in database...\nWorking";
   for my $i ( 0 .. $#selectedInetref ) {
      next if $selectedInetref[$i] == -1;

      getMovieData( $selectedInetref[$i], $originalIntId[$i] );
      print ".";
   }

   #Now get Images
   print "\n\n\n\nDone getting IMDB data.\nTime to get images...\nWorking";

   # Bound parameters keep odd characters in the path from breaking the SQL.
   my $coverUpdate = "update videometadata set coverfile=? where intid=?";
   my $sth2 = $dbh->prepare($coverUpdate)
      or die "SQL was $coverUpdate\nCould not prepare SQL statement ... maybe invalid?";

   for my $i ( 0 .. $#selectedInetref ) {
      next if $selectedInetref[$i] == -1;
      next unless $originalIntId[$i] && $selectedInetref[$i];

      my $imageURL = getMoviePoster( $selectedInetref[$i] );

      # Only real web addresses are handed to wget; this also keeps scraped
      # text from being read as a wget option.
      next unless defined $imageURL && $imageURL =~ m{\Ahttps?://}i;

      my $coverFile = "$imageDirectory$selectedInetref[$i].jpg";

      # List form: the URL comes from scraped pages and must not reach a shell.
      if ( system( 'wget', $imageURL, '-O', $coverFile, '-q' ) != 0 ) {
         print "\nError: could not download $imageURL.  That poster will be skipped.\n";
         next;
      }

      $sth2->execute( $coverFile, $originalIntId[$i] ) ||
                die "SQL was $coverUpdate\nCould not execute SQL statement ... maybe invalid?";

      print ".";
   }

   print "\n\n\n\nAll Done.\n";

   $dbh->disconnect();
}

###### Start of Code for movie list
###
###
###
###

##
##
##
##
# Return the text of every link found in an HTML fragment: whatever sits
# between '/">' and the next '</a>'.  Returns an empty list for empty input.
sub _extractLinkTexts {
   my ($html) = @_;

   return () unless defined $html && $html ne '';
   return $html =~ m{/">(.*?)</a>}gs;
}

# Fetch the IMDB title page of one movie and store its details in the
# videometadata row of one video.
#
# Parameters:
#   $movieid - IMDB title number (the digits after "tt").
#   $intid   - intid of the videometadata row to update.
# Returns nothing.  Prints a message and leaves the row untouched when the
# id is empty or no title can be read from the page.  Dies when the update
# statement fails.  Uses the shared database handle $dbh.
sub getMovieData {
   my ($movieid, $intid)=@_; # grab parameters

   if (!defined $movieid || $movieid eq '') {
      print ( "Invalid MovieId." );
      return;
   }

   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}

   # get the movie page
   my $request = "http://www.imdb.com/title/tt" . $movieid . "/";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   if (defined $opt_r) { printf("%s", $response); }

   # A failed download is handled like a page without a title.
   $response = "" unless defined $response;

   # parse title and year; only trust the captures when the pattern matched
   my ($title, $year);
   if (parseBetween($response, "<title>", "</title>") =~ m#(.+) \((\d+).*\)#) {
      # Note some years have a /II after them?
      ($title, $year) = ($1, $2);
   }
   if (!defined $title) {
      print ("Could not find name for $movieid\n" );
      return;
   }

   # parse director
   my $director = parseBetween($response, ">Directed by</b>", "/a><br>");
   $director = parseBetween($director, "/\">", "<");

   # parse writer
   # (Note: this takes the 'first' writer, may want to include others)
   my $writer = parseBetween($response, ">Writing credits</b>", "</table>");
   $writer = parseBetween($writer, "/\">", "</");

   # parse plot
   my $plot = parseBetween($response, ">Plot Outline:</b> ", "<a href=\"");
   if (!$plot) {
      $plot = parseBetween($response, ">Plot Summary:</b> ", "<a href=\"");
   }

   # parse user rating
   my $userrating = parseBetween($response, ">User Rating:</b>", "> (");
   $userrating = parseBetween($userrating, "<b>", "/");

   # parse MPAA rating
   my $ratingcountry = "USA";
   my $movierating = parseBetween($response, ">MPAA</a>:</b> ", "<br>");
   if (!$movierating) {
       $movierating = parseBetween($response, ">Certification:</b>", "<br>");
       $movierating = parseBetween($movierating, "certificates=$ratingcountry",
                                   "/a>");
       $movierating = parseBetween($movierating, ">", "<");
   }

   # parse movie length; take the first number so a prefix such as "USA:"
   # does not turn the runtime into 0
   my $runtime = parseBetween($response, ">Runtime:</b>\n", " min");
   $runtime = ($runtime =~ /(\d+)/) ? $1 + 0 : 0;

   # parse cast, genres and countries (link texts of the matching sections)
   #  Note: full cast would be from url:
   #    www.imdb.com/title/<movieid>/fullcredits
   my $cast = join(',', _extractLinkTexts(
      parseBetween($response, "Cast overview, first billed only:", "/table>")));
   my $lgenres = join(',', _extractLinkTexts(
      parseBetween($response, "<b class=\"ch\">Genre:</b>","<a href=\"/rg/title-tease/keywords/title/tt$movieid/keywords\">(more)</a>")));
   my $lcountries = join(',', _extractLinkTexts(
      parseBetween($response, "<b class=\"ch\">Country:</b>","<br>")));

   if (defined $opt_d) {
      print "Title:$title\n";
      print "Year:$year\n";
      print "Director:$director\n";
      print "Plot:$plot\n";
      print "UserRating:$userrating\n";
      print "MovieRating:$movierating\n";
      print "Runtime:$runtime\n";
      print "Writers: $writer\n";
      print "Cast: $cast\n";
      print "Genres: $lgenres\n";
      print "Countries: $lcountries\n";
   }

   # store fields (these column names must match what MythVideo is looking
   # for).  Bound parameters: the values come from a web page and may contain
   # quotes.
   my $update = "update videometadata set title=?, year=?, director=?, plot=?, userrating=?, rating=?, length=?, inetref=? where intid=?";

   my $sth2 = $dbh->prepare($update)
      or die "Query was $update\n\nCould not prepare SQL statement ... maybe invalid?";

   $sth2->execute($title, $year, $director, $plot, $userrating,
                  $movierating, $runtime, $movieid, $intid) ||
              die "Query was $update\n\nCould not execute SQL statement ... maybe invalid?";

   return;
}


# Find a poster image for a movie.
#
# Sources are tried in this order: a poster on impawards.com linked from the
# IMDB posters page, a poster on posters.imdb.com, the low resolution cover on
# the IMDB title page.  When nothing or only the low resolution cover was
# found, a DVD cover is searched on amazon.com and preferred if one turns up.
#
# Parameters:
#   $movieid - IMDB title number (the digits after "tt").
# Returns:
#   The image URL, "" when no image was found, or nothing at all when
#   $movieid is empty.
sub getMoviePoster {
   my ($movieid)=@_; # grab movieid parameter

   if (!defined $movieid || $movieid eq '') {
      print "\$movieid is null, returning\n" if (defined $opt_d);
      return;
   }
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}

   # get the posters page
   my $request = "http://www.imdb.com/title/tt" . $movieid . "/posters";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   if (defined $opt_r) { printf("%s", $response); }
   $response = "" unless defined $response;   # failed download: nothing to parse

   my $uri = "";

   # look for references to impawards.com posters - they are high quality
   my $site = "http://www.impawards.com";
   my $impsite = parseBetween($response, "<a href=\"".$site, "\">".$site);

   # jersey girl fix
   $impsite = parseBetween($response, "<a href=\"http://impawards.com","\">http://impawards.com") if ($impsite eq "");

   if ($impsite) {
      $impsite = $site . $impsite;

      if (defined $opt_d) { print "# Searching for poster at: ".$impsite."\n"; }
      my $impres = get($impsite);
      $impres = "" unless defined $impres;
      if (defined $opt_d) { printf("# got %i bytes\n", length($impres)); }
      if (defined $opt_r) { printf("%s", $impres); }

      # making sure it isnt redirect
      my $redirect = parseBetween($impres, "0;URL=..", "\">");
      if ($redirect ne "") {
         if (defined $opt_d) { printf("# processing redirect to %s\n",$redirect); }
         # this was redirect
         $impsite = $site . $redirect;
         $impres = get($impsite);
         $impres = "" unless defined $impres;
      }

      # do stuff normally
      my $poster = parseBetween($impres, "<img SRC=\"posters/", "\" ALT");

      # Only build a URL when the page really named a poster; otherwise $uri
      # stays empty so that the sources below are still tried.
      if ($poster ne "") {
         # the name is relative... patch it up to make a valid uri
         if (!($poster =~ /http:(.*)/ )) {
            my $path = substr($impsite, 0, rindex($impsite, '/') + 1);
            $poster = $path."posters/".$poster;
         }
         $uri = $poster;
         if (defined $opt_d) { print "# found ipmawards poster: $uri\n"; }
      }
   }

   # if the impawards site attempt didn't give a filename grab it from imdb
   if ($uri eq "") {
       if (defined $opt_d) { print "# looking for imdb posters\n"; }
       my $posterHost = "http://posters.imdb.com/posters/";

       $uri = parseBetween($response, $posterHost, "\"><td><td><a href=\"");
       if ($uri ne "") {
           $uri = $posterHost.$uri;
       } else {
          if (defined $opt_d) { print "# no poster found\n"; }
       }
   }

   my @movie_titles;        # titles to try on amazon: main title, then informal ones
   my $found_low_res = 0;

   # no poster found, take lowres image from imdb
   if ($uri eq "") {
       if (defined $opt_d) { print "# looking for lowres imdb posters\n"; }
       my $titlePage = get("http://www.imdb.com/title/tt" . $movieid . "/");
       $titlePage = "" unless defined $titlePage;

       $uri = parseBetween($titlePage, "alt=\"cover\" src=\"http://ia.imdb.com/media/imdb/", "\"");

       if (defined $opt_d) { print "# starting to look for movie title\n"; }

       # get main title
       if (defined $opt_d) { print "# Getting possible movie titles:\n"; }
       push @movie_titles, parseBetween($titlePage, "<title>", "<\/title>");
       if (defined $opt_d) { print "# Title: ".$movie_titles[-1]."\n"; }

       # now we get all other possible movie titles and store them in the titles array
       while ($titlePage =~ m/>([^>^\(]*)([ ]{0,1}\([^\)]*\)[^\(^\)]*[ ]{0,1}){0,1}\(informal title\)/g) {
          my $informal = $1;
          chomp($informal);
          $informal =~ s/^\s+//;
          $informal =~ s/\s+$//;
          push @movie_titles, $informal;
          if (defined $opt_d) { print "# Title: ".$informal."\n"; }
       }

       if ($uri ne "" ) {
           $uri = "http://ia.imdb.com/media/imdb/".$uri;
           $found_low_res = 1;
       } else {
          if (defined $opt_d) { print "# no poster found\n"; }
       }
   }

   # we found nothing or only a lowres poster on IMDB, lets try looking for
   # dvd cover on amazon.com
   if ($uri eq "" or $found_low_res) {
      if (defined $opt_d) { print "# starting to look for poster on Amazon.com\n"; }

      my $ama_uri = "";
      my $xml_parser = XML::Simple->new();

      TITLE:
      for my $candidate (@movie_titles) {
         my $wanted = defined $candidate ? $candidate : "";

         # get rid of the year
         $wanted =~ s/ ?\([^\)]+\) ?//g;
         if ($wanted =~ /(.*), The$/i && length($1)) { $wanted = $1; }

         $wanted =~ s/[\"\']//g; # if we give amazon quotes they give them back

         # An empty title would "match" every product.
         next TITLE if $wanted eq "";

         if (defined $opt_d) { print "# Searching for: $wanted\n"; }

         # Encode the movie title to be safe
         my $safe_movie_title = $wanted;
         $safe_movie_title =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
         # request XML info from amazon
         my $xml_uri = "http://xml.amazon.com/onca/xml3?t=000&dev-t=000&KeywordSearch=".$safe_movie_title."&mode=dvd&type=lite&page=1&f=xml";
         if (defined $opt_d) { print "# Amazon request string is: $xml_uri\n";}

         # get the response
         my $xml = get($xml_uri);
         if (defined $opt_r) { printf("%s", $xml); }
         next TITLE unless defined $xml && $xml ne "";

         # parse the response; an unparsable answer must not abort the batch
         my $xml_doc = eval { $xml_parser->XMLin($xml) };
         next TITLE unless ref($xml_doc) eq 'HASH';

         # if we only got one result, treat it as a one element list
         my $details = $xml_doc->{Details};
         my @products = ref($details) eq 'ARRAY' ? @{$details} : ($details);

         #only search through first 5 matches
         my $lastIndex = $#products < 4 ? $#products : 4;
         for my $product (@products[0 .. $lastIndex]) {
            next unless ref($product) eq 'HASH';

            my $tmp_movie_title = $product->{ProductName};
            next unless defined $tmp_movie_title && !ref($tmp_movie_title);
            $tmp_movie_title =~ s/[\"\']//g;
            if (defined $opt_d) { print "# Amazon: comparing (" . $tmp_movie_title . ") to (" . $wanted . ")\n"; }

            # Plain substring test: the title is text, not a pattern.
            if (index($tmp_movie_title, $wanted) != -1) {
               $ama_uri = $product->{ImageUrlLarge};
               $ama_uri = "" unless defined $ama_uri && !ref($ama_uri);
               if (defined $opt_d) { print "# Amazon: found poster " . $ama_uri . "\n"; }
               last TITLE;
            }
         }
      }

      if ($ama_uri ne "") {
         # 807 bytes is the size the original code treats as a blank
         # placeholder image; an image that cannot be fetched is no use either.
         my $image = get($ama_uri);
         if (!defined $image || length($image) == 807) {
            if (defined $opt_d) { printf("# this image is blank\n"); }
            $ama_uri = "";
         }
      }

      if ($ama_uri ne "") {
         $uri = $ama_uri;
      }
   }

   return $uri;
}


## Movie List
##
##
##
# Search IMDB for a title and return the candidate movies.
#
# Parameters:
#   $filename - file name or title to search for.
#   $options  - optional IMDB search options, appended to the request; the
#               options tv=no, tv=only, from_year=N, to_year=N and video=no
#               are additionally enforced here on the parsed results.
# Returns:
#   A list of array refs [ "Title (year)", imdb number ], one per match;
#   an empty list when nothing was found or the page could not be fetched.
sub getMovieList {
   my ($filename, $options)=@_; # grab parameters

   #
   # Convert filename into a query string
   # (use same rules that Metadata::guessTitle does)
   my $query = defined $filename ? $filename : "";
   $query = uri_unescape($query);  # in case it was escaped

   # Strip off the file extension.  Only a short alphanumeric suffix counts,
   # so a title that merely contains a period ("mr. deeds") is kept whole.
   $query =~ s/\.[A-Za-z0-9]{1,4}\z//;

   # Strip off anything following '(' - people use this for general comments
   if (rindex($query, '(') != -1) {
      $query = substr($query, 0, rindex($query, '('));
   }
   # Strip off anything following '[' - people use this for general comments
   if (rindex($query, '[') != -1) {
      $query = substr($query, 0, rindex($query, '['));
   }

   # IMDB searches do better if any trailing ,The is left off
   if ($query =~ /(.*), The$/i && length($1)) { $query = $1; }

   # prepare the url
   $query = uri_escape($query);
   if (!$options) { $options = "" ;}
   if (defined $opt_d) {
      printf("# query: '%s', options: '%s'\n", $query, $options);
   }

   # get the search results  page
   #    some known IMDB options are:
   #         type=[fuzy]         looser search
   #         from_year=[int]     limit matches to year (broken at imdb)
   #         to_year=[int]       limit matches to year (broken at imdb)
   #         sort=[smart]        ??
   #         tv=[no|both|only]   limits between tv and movies (broken at imdb)
   my $request = "http://www.imdb.com/find?q=$query;$options";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   if (defined $opt_r) {
      print $response;
      exit(0);
   }

   # No page, no matches.
   return unless defined $response;

   my @movies;
   # check to see if we got a results page or a movie page
   #    looking for 'add=<movieid>" target=' which only exists
   #    in a movie description page
   my $movienum = parseBetween($response, "add=", "\" target=");
   if ($movienum) {
       if (defined $opt_d) { printf("# redirected to movie page\n"); }
       my $movietitle = parseBetween($response, "<title>", "</title>");

       # Drop the "(year)" suffix when present; otherwise keep the page title.
       if ($movietitle =~ m#(.+) \((\d+)\)#) {
          $movietitle = $1;
       }

       push @movies, [ $movietitle, $movienum ];
       return @movies;
   }

   # extract possible matches
   #    possible matches are grouped in several categories:
   #        popular, exact and partial
   my $popular_results = parseBetween($response, "<b>Popular Titles</b>",
                                              "</ol>");
   my $exact_matches = parseBetween($response, "<b>Titles (Exact Matches)</b>",
                                              "</ol>");
   my $partial_matches = parseBetween($response, "<b>Titles (Partial Matches)</b>",
                                              "</ol>");

   my $data = $popular_results.$exact_matches;
   # resort to partial matches if no exact
   if ($data eq "") { $data = $partial_matches; }
   if ($data eq "") {
      if (defined $opt_d) { printf("# no results\n"); }
      return;
   }

   # parse movie list from matches: one <li>...</li> per movie.  An entry
   # without its closing tag simply ends the scan.
   my $link_end = "</a>";
   ENTRY:
   while ($data =~ m{<li>(.*?)</li>}gs) {
      my $entry = $1;

      # split the entry after its first link: the link holds number and
      # title, the rest holds year and type
      my $fl_end = index($entry, $link_end);
      my ($lhs, $rhs) = ("", "");
      if ($fl_end != -1) {
         $fl_end += length($link_end);
         $lhs = substr($entry, 0, $fl_end);
         $rhs = substr($entry, $fl_end);
      }

      my ($movienum, $title);
      if ($lhs =~ m/<a href="\/title\/tt(\d+)\/.*\">(.+)<\/a>/i) {
          $movienum = $1;
          $title = $2;
      } else {
           if (defined $opt_d) {
               print("Unrecognized entry format\n");
           }
           next ENTRY;
      }

      my ($year, $type) = ("", "");
      if ($rhs =~ m/\((\d+)\) \((.+)\)/) {
          $year = $1;
          $type = $2;
      } elsif ($rhs =~ m/\((\d+)\)/) {
          $year = $1;
      }

      my $skip = 0;

      # fix broken 'tv=no' option
      if ($options =~ /tv=no/) {
         if ($type eq "TV") {
            if (defined $opt_d) {printf("# skipping TV program: %s\n", $title);}
            $skip = 1;
         }
      }
      if ($options =~ /tv=only/) {
         if ($type eq "") {
            if (defined $opt_d) {printf("# skipping Movie: %s\n", $title);}
            $skip = 1;
         }
      }
      # fix broken 'from_year=' option
      if ($options =~ /from_year=(\d+)/) {
         if ($year < $1) {
            if (defined $opt_d) {printf("# skipping b/c of yr: %s\n", $title);}
            $skip = 1;
         }
      }
      # fix broken 'to_year=' option
      if ($options =~ /to_year=(\d+)/) {
         if ($year > $1) {
            if (defined $opt_d) {printf("# skipping b/c of yr: %s\n", $title);}
            $skip = 1;
         }
      }

      # option to strip out videos (I think that's what '(V)' means anyway?)
      if ($options =~ /video=no/) {
         if ($type eq "V") {
            if (defined $opt_d) {
                printf("# skipping Video program: %s\n", $title);
            }
            $skip = 1;
         }
      }

      # (always) strip out video game's (why does IMDB give these anyway?)
      if ($type eq "VG") {
         if (defined $opt_d) {printf("# skipping videogame: %s\n", $title);}
         $skip = 1;
      }

      next ENTRY if $skip;

      # add to array
      my $moviename = $title;
      if ($year ne "") {
          $moviename .= " ($year)";
      }
      push @movies, [ $moviename, $movienum ];
   }

   return @movies;
}

# Return the text of $data that lies between the first occurrence of $beg and
# the next occurrence of $end after it.  The markers are matched without
# regard to case; the returned text keeps its original case and has its HTML
# entities decoded.  Returns "" when either marker is missing.
sub parseBetween {
   my ($data, $beg, $end)=@_; # grab parameters

   my $haystack = lc($data);

   my $begAt = index($haystack, lc($beg));
   return "" if $begAt == -1;

   my $from = $begAt + length($beg);
   my $to = index($haystack, lc($end), $from);
   return "" if $to == -1;

   my $found = substr($data, $from, $to - $from);
   # decode character references in place
   # (see http://www.w3.org/TR/html4/charset.html#h-5.3.1)
   decode_entities($found);
   return $found;
}

Announcement: A Batch IMDB Grabber - Updated to v 1.1

Who is online