Quantcast
Channel: devnotcorp » html
Viewing all articles
Browse latest Browse all 2

Extract links using Ruby and Mechanize

0
0

#!/usr/bin/ruby
# Fetches the film list page, reads a title out of every `div.ROW` element,
# and for each title prints the title followed by every search-result link
# (from the freshwap search URL below) whose href contains "movies".
require 'rubygems'
require 'mechanize'
require 'uri'

film_list_url = 'http://fantasyfilmfest.com/pages/filme.html'
# Search URL prefix; the URL-encoded film title is appended as the `story` value.
search_url = 'http://www.freshwap.com/index.php?do=search&subaction=search&full_search=1&catlist[]=5&titleonly=3&story='

# Set up the user agent.
agent = Mechanize.new
agent.history.max_size = 0 # do not keep visited pages in memory
agent.user_agent_alias = 'Linux Firefox'
agent.read_timeout = 3

# Set proxy if the http_proxy environment variable is set. The value is parsed
# with URI instead of a hand-written regex so that embedded credentials
# (http://user:pass@host:port) and a missing port are handled correctly.
proxy_env = ENV['http_proxy'].to_s.strip
unless proxy_env.empty?
  begin
    proxy = URI.parse(proxy_env)
    # A value without a scheme (e.g. "host:8080") parses without a host;
    # such values are ignored, as they were before.
    if proxy.host && proxy.port
      agent.set_proxy(proxy.host, proxy.port, proxy.user, proxy.password)
    end
  rescue URI::InvalidURIError
    warn "Ignoring malformed http_proxy value: #{proxy_env}"
  end
end

page = agent.get(film_list_url)
page.search('div.ROW').each do |row|
  title = row.search('th.filmtitel ul.LIST li.LIST a.LIST').text.strip
  # Rows without a title would otherwise trigger a pointless empty search.
  next if title.empty?

  puts title

  begin
    # Escape the title so characters such as '&', '#' or '+' cannot alter
    # the query string.
    results = agent.get("#{search_url}#{URI.encode_www_form_component(title)}")
  rescue Mechanize::Error, Net::HTTP::Persistent::Error, Timeout::Error, SocketError => e
    # With a 3 second read timeout individual searches may fail; report the
    # failure and carry on with the remaining titles instead of aborting.
    warn "Search failed for #{title.inspect}: #{e.message}"
    next
  end

  results.search('div.title a').each do |link|
    # Use a dedicated local rather than reassigning an outer variable.
    href = link['href'].to_s
    puts href if href.include?('movies')
  end
end


Viewing all articles
Browse latest Browse all 2

Latest Images

Trending Articles





Latest Images