Here’s a small example of using Perl’s XML::LibXML to extract links from an HTML page.
#!/usr/bin/perl
#
# Scrape film titles from the Fantasy Filmfest programme page and, for each
# title, run a search on freshwap.com and print the result links.
#
# Output per film: the (normalised) title, one line per result URL, then a
# separator line of dashes.

use strict;
use warnings;

use LWP::Simple;    # loaded by the original script; nothing below calls it directly
use XML::LibXML;

# Scraped titles may contain non-ASCII characters; emit them as UTF-8 instead
# of triggering "Wide character" warnings.
binmode STDOUT, ':encoding(UTF-8)';

my $url = "http://fantasyfilmfest.com/pages/filme.html";
my $fw  = "http://www.freshwap.com/index.php?do=search&subaction=search&full_search=1&catlist[]=5&titleonly=3&story=";

# Real-world HTML is rarely well-formed: let libxml2 recover from markup
# errors and keep it quiet about them.
my %opts = (
    suppress_errors => 1,
    recover         => 1,
);

my $p = XML::LibXML->new();

# Percent-encode a string for use as a URL query value.
# The text is encoded to UTF-8 bytes first, then every byte outside the
# RFC 3986 "unreserved" set is replaced by %XX.
# NOTE(review): assumes the search site expects UTF-8 in the query string.
sub _escape_query_value {
    my ($text) = @_;

    my $bytes = $text;
    utf8::encode($bytes);
    $bytes =~ s/([^A-Za-z0-9\-._~])/sprintf('%%%02X', ord $1)/ge;

    return $bytes;
}

# parse_html_file() throws on fetch/parse failure, so trap the exception and
# report it with the URL that failed.
my $dom = eval { $p->parse_html_file($url, \%opts) }
    or die "Cannot fetch or parse $url: $@\n";

my $root = $dom->getDocumentElement
    or die "No document element in $url\n";

FILM:
foreach my $node ($root->findnodes("//div[\@class='FilmREITER']")) {
    my $title = $node->findvalue('a');

    # A div without link text gives nothing to search for.
    next FILM if $title !~ /\S/;

    # Catalogue-style titles ("Last Exorcism, The") become "the Last Exorcism".
    # The \b keeps words such as ", then" or ", theatre" from matching.
    $title =~ s/(.*), the\b/the $1/i;

    print $title . "\n";

    # Use separate variables for the search result so the programme page's
    # document is not clobbered, and survive a failed search instead of
    # aborting the whole run.
    my $search_url = $fw . _escape_query_value($title);
    my $search_dom = eval { $p->parse_html_file($search_url, \%opts) };
    if (!defined $search_dom) {
        warn "Search for '$title' failed: $@\n";
        next FILM;
    }

    my $search_root = $search_dom->getDocumentElement;
    if (defined $search_root) {
        foreach my $link ($search_root->findnodes("//div[\@class='title']/a")) {
            my $href = $link->getAttribute('href');
            next unless defined $href;    # anchor without an href
            print $href . "\n";
        }
    }

    print "--------------\n";
}
Image may be NSFW.
Click here to view.

Click here to view.
