Here’s a small example of using Perl’s XML::LibXML to extract links from an HTML page.
#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;
use XML::LibXML;
# Scrape the film list from the festival page and, for every film title,
# run a search on a second site and print the result links found there.
#
# Output per film: the (normalised) title, one line per result link, and a
# separator line. Films whose search page cannot be loaded are reported on
# STDERR and skipped.

# Page listing the films; titles are read from <div class="FilmREITER"> elements.
my $url = "http://fantasyfilmfest.com/pages/filme.html";
# Search endpoint; the percent-encoded title is appended as the "story" value.
my $fw = "http://www.freshwap.com/index.php?do=search&subaction=search&full_search=1&catlist[]=5&titleonly=3&story=";
my $p = XML::LibXML->new();
# Real-world HTML is rarely well-formed: recover from markup errors and keep
# the parser quiet about them.
my %opts = (
    suppress_errors => 1,
    recover         => 1,
);

# Percent-encode a string so it can be used safely as a URL query value.
# Everything except the RFC 3986 "unreserved" characters is escaped.
# NOTE(review): the text is encoded as UTF-8 first; confirm the search site
# expects UTF-8 query strings.
sub escape_query_value {
    my ($text) = @_;
    my $bytes = $text;
    utf8::encode($bytes);    # work on bytes, not characters
    $bytes =~ s/([^A-Za-z0-9\-._~])/sprintf('%%%02X', ord $1)/ge;
    return $bytes;
}

my $dom  = $p->parse_html_file($url, \%opts);
my $root = $dom->getDocumentElement
    or die "No document element found in $url\n";

foreach my $node ($root->findnodes("//div[\@class='FilmREITER']")) {
    my $title = $node->findvalue('a');

    # Move a trailing article to the front: "Host, The" -> "the Host".
    # The \b keeps words that merely start with "the" ("Then", "Theory")
    # from being chopped up.
    $title =~ s/(.*), the\b/the $1/i;
    print $title . "\n";

    # Use separate variables for the search result document so the listing
    # page's $dom/$root are not clobbered while we iterate over its nodes.
    # The parser may die on I/O failures, so trap that and move on to the
    # next film instead of aborting the whole run.
    my $search_url = $fw . escape_query_value($title);
    my $search_dom = eval { $p->parse_html_file($search_url, \%opts) };
    if (!defined $search_dom) {
        warn "Search for '$title' failed: " . ($@ || "unknown error\n");
        next;
    }

    my $search_root = $search_dom->getDocumentElement;
    next if !defined $search_root;

    foreach my $link ($search_root->findnodes("//div[\@class='title']/a")) {
        my $href = $link->getAttribute('href');
        print $href . "\n" if defined $href;
    }
    print "--------------\n";
}