Scraping Imdb with Hpricot

IMDb doesn’t use semantic HTML (try finding important headings by searching for the b tag), so scraping it with Hpricot, using XPath and CSS selectors, is harder than it should be.

The following script looks up info on IMDb for each filename in a directory. The info is saved to an HTML page.

#!/usr/bin/env ruby

require 'rubygems'
require 'hpricot'
require 'open-uri'
require 'find'
require 'cgi'

# Exactly one command-line argument is expected: the directory to scan.
raise "Usage: ./imdb.rb  DIRECTORY" if ARGV.size != 1

# Looks up film information on IMDb by scraping its HTML pages with Hpricot.
#
# Every public method performs at least one HTTP request. The XPath/CSS
# selectors are hard-coded against the IMDb markup this script was written for.
class ImdbScraper

    def initialize
        # Headings of the search-result sections whose first link is accepted
        # as the match for a title.
        @title_matches = ["Popular Titles", "Titles (Exact Matches)", "Titles (Partial Matches)"].freeze
    end

    # Searches IMDb for +title+.
    #
    # Returns the href (String) of the first link found under one of the
    # accepted headings, or nil when no section yields a link.
    def find_match(title)
        doc = fetch_doc("http://imdb.com/find?s=all&q=#{CGI::escape(title)};more=tt")
        doc.search("/html/body//p").each { |element|
            next unless @title_matches.include?(element.search("/b").inner_html)
            link = element.search("/ol/li/a").first
            # A matching heading without a link is skipped instead of raising on nil.
            return link['href'] if link
        }
        # Explicit nil: without it the method would return the value of #each
        # (the paragraph collection), which callers would mistake for a hit.
        nil
    end

    # Returns the stripped "background" attribute of the first element on the
    # page at +url+ that matches +selector+, or "" when nothing matches.
    def get_poster(url, selector)
        doc = fetch_doc(url)
        node = doc && doc.search(selector).first
        return "" if node.nil?
        node.get_attribute("background").to_s.strip
    end

    # Returns an Array with every text node below the first element on the
    # page at +url+ that matches +selector+, or [""] when nothing matches.
    def get_contents(url, selector)
        doc = fetch_doc(url)
        node = doc && doc.at(selector)
        return [""] if node.nil?
        contents = []
        node.traverse_text { |text| contents << text }
        contents
    end

    # Returns the first text node found by #get_contents as a stripped String.
    def get_first_content(url, selector); get_contents(url, selector).first.to_s.strip; end

    # Looks up +title+ and collects details from its IMDb pages.
    #
    # Returns a Hash with the String keys "title", "summary", "rating" and
    # "image", or nil when the search produced no match.
    def find_info(title)
        p "Looking up #{title}"
        href = find_match(title)
        return nil if href.nil?

        # Drop any query string so the sub-page names can be appended directly.
        url = "http://imdb.com#{href}".split("?").first
        p "Mining #{url}"
        {"title" => get_first_content(url, "/html/body//h1/strong"),
            "summary" => get_first_content(url + "plotsummary", "//p.plotpar"),
            # join, not to_s: Array#to_s returns inspect output since Ruby 1.9.
            "rating" => get_contents(url + "ratings", "/html/body//p").join,
            "image" => get_poster(url + "posters", "//table[@background]")
            }
    end

    private

    # Downloads +url+ and returns the response parsed by Hpricot.
    def fetch_doc(url)
        # Kernel#open no longer fetches URLs on Ruby 3.0+; the #open that
        # open-uri adds to URI objects works on old and new versions alike.
        URI.parse(url).open { |f| Hpricot(f) }
    end
end

# Walks +dir+ recursively and returns the paths of all regular files whose
# basename does not begin with a dot.
def find_files(dir)
    Find.find(dir).select { |path|
        File.file?(path) && !File.basename(path).start_with?(".")
    }
end

# Look up every file under the directory given on the command line (using the
# file name without its extension as the title) and write the results to
# index.html in the current directory.
contents = []
scraper = ImdbScraper.new

find_files(ARGV[0]).each { |file|
    info = scraper.find_info(File.basename(file, File.extname(file)))
    # Titles without a match produce no entry.
    next if info.nil?
    contents << "<h1>#{info['title']}</h1>
        <p>#{info['summary']}</p>
        <p>#{info['rating']}</p>
        "
    }

# Block form closes the file even if writing raises.
File.open("index.html", "w") { |html|
    # join, not plain interpolation: interpolating an Array inserts its
    # inspect output (brackets, quotes, escapes) on Ruby 1.9 and later.
    html.puts "<html>
    <body>#{contents.join}
    </body>
    </html>"
}

admin