Scraping Imdb with Hpricot
11 Jan 2007 21:45 - (0) comments
IMDb doesn't use semantic HTML (try finding important headings by searching for the b tag), so scraping it with Hpricot's XPath and CSS selectors is harder than it should be.
The following script looks up info on IMDb for each filename in a directory.
The info is saved to an HTML page.
#!/usr/bin/env ruby
require 'rubygems'
require 'hpricot'
require 'open-uri'
require 'find'
require 'cgi'

# The script takes exactly one argument: the directory whose filenames are
# looked up. Print the usage line and exit with a failure status otherwise.
abort "Usage: ./imdb.rb DIRECTORY" unless ARGV.size == 1
# Looks a film title up on imdb.com and scrapes its title, plot summary,
# rating text and poster image URL with Hpricot.
class ImdbScraper
  def initialize
    # Text of the <b> heading of each search-result section that lists titles.
    @title_matches = ["Popular Titles", "Titles (Exact Matches)", "Titles (Partial Matches)"]
  end

  # Searches imdb.com for +title+.
  #
  # Returns the href of the first link in the first result section whose
  # heading is one of @title_matches, or nil when the search page could not
  # be fetched or no such section contains a link.
  def find_match(title)
    doc = fetch_doc("http://imdb.com/find?s=all&q=#{CGI.escape(title)};more=tt")
    return nil if doc.nil?

    doc.search("/html/body//p").each do |section|
      next unless @title_matches.include?(section.search("/b").inner_html)

      link = section.search("/ol/li/a").first
      return link['href'] unless link.nil?
    end
    # Explicit nil: falling off the end of +each+ would return the element
    # list, which callers would mistake for a match.
    nil
  end

  # Returns the stripped "background" attribute of the first element matching
  # +selector+ on the page at +url+, or "" when the page or element is missing.
  def get_poster(url, selector)
    doc = fetch_doc(url)
    element = doc && doc.search(selector).first
    return "" if element.nil?

    element.get_attribute("background").to_s.strip
  end

  # Returns the text nodes found under the first element matching +selector+
  # on the page at +url+, in traversal order, or [""] when the page or element
  # is missing.
  def get_contents(url, selector)
    doc = fetch_doc(url)
    node = doc && doc.at(selector)
    return [""] if node.nil?

    contents = []
    node.traverse_text { |text| contents << text }
    contents
  end

  # Returns the first text node from #get_contents as a stripped String.
  def get_first_content(url, selector)
    get_contents(url, selector).first.to_s.strip
  end

  # Looks +title+ up and scrapes the matching film's pages.
  #
  # Returns a Hash with the String keys "title", "summary", "rating" and
  # "image", or nil when the search produced no match.
  def find_info(title)
    p "Looking up #{title}"
    href = find_match(title)
    return nil if href.nil?

    # Drop any query string so the sub-page names can be appended directly.
    url = "http://imdb.com#{href}".split("?").first
    p "Mining #{url}"
    {
      "title" => get_first_content(url, "/html/body//h1/strong"),
      "summary" => get_first_content(url + "plotsummary", "//p.plotpar"),
      # join, not to_s: Array#to_s only concatenates on Ruby 1.8.
      "rating" => get_contents(url + "ratings", "/html/body//p").join,
      "image" => get_poster(url + "posters", "//table[@background]")
    }
  end

  private

  # Downloads +url+ and parses it with Hpricot. Returns nil (after printing a
  # warning) when the server answers with an HTTP error, so one missing
  # sub-page does not abort the whole run.
  # NOTE(review): Kernel#open only accepts URLs through open-uri on older
  # Rubies; newer ones need URI.open -- confirm the target Ruby version.
  def fetch_doc(url)
    open(url) { |f| Hpricot(f) }
  rescue OpenURI::HTTPError => e
    warn "Could not fetch #{url}: #{e.message}"
    nil
  end
end

# Returns the paths of all regular files below +dir+ (searched recursively)
# whose basename does not start with a dot.
def find_files(dir)
  files = []
  Find.find(dir) do |path|
    files << path if File.file?(path) && !File.basename(path).start_with?(".")
  end
  files
end

# HTML fragments, one per film for which info was found.
contents = []
scraper = ImdbScraper.new

# Look every file up by its name without the extension, and collect one HTML
# fragment per film for which info was found.
find_files(ARGV[0]).each do |file|
  info = scraper.find_info(File.basename(file, File.extname(file)))
  next if info.nil?

  # The values come from a remote page, so escape them before embedding.
  # NOTE(review): assumes the scraped values are plain text (entities already
  # decoded) -- confirm, otherwise this double-escapes.
  contents << "<h1>#{CGI.escapeHTML(info['title'].to_s)}</h1>
<p>#{CGI.escapeHTML(info['summary'].to_s)}</p>
<p>#{CGI.escapeHTML(info['rating'].to_s)}</p>
"
end

# The block form closes the file even if writing raises. The fragments are
# joined explicitly because interpolating an Array only concatenates its
# elements on Ruby 1.8.
File.open("index.html", "w") do |html|
  html.puts "<html>
<body>#{contents.join}
</body>
</html>"
end
Comments
No comments allowed.