Mirroring Single Pages
% ./MirrorOne.pl
cats.html: Not Modified
dogs.html: OK
gillie_fish.html: Not Modified
----------------------Script I.3.1 mirrorOne.pl--------------------
#!/usr/local/bin/perl
# mirrorOne.pl
use LWP::Simple;
use HTTP::Status;
use constant DIRECTORY => '/local/web/price_lists';
%DOCUMENTS = (
'dogs.html' => 'http://www.pets.com/dogs/price_list.html',
'cats.html' => 'http://www.pets.com/cats/price_list.html',
'gillie_fish.html' => 'http://aquaria.com/prices.html'
);
chdir DIRECTORY;
foreach (sort keys %DOCUMENTS) {
my $status = mirror($DOCUMENTS{$_},$_);
warn "$_: ",status_message($status),"\n";
}
-------------------------------------------------------------------
Mirroring a Document Tree
With a little more work, you can recursively mirror an entire set of linked pages. Script I.3.2 mirrors the requested document and all subdocuments, using the LWP HTML::LinkExtor module to extract all the HTML links.----------------------Script I.3.2 mirrorTree.pl--------------------
#!/usr/local/bin/perl
# File: mirrorTree.pl
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
use File::Path;
use File::Basename;
%DONE = ();
my $URL = shift;
$UA = new LWP::UserAgent;
$PARSER = HTML::LinkExtor->new();
$TOP = $UA->request(HTTP::Request->new(HEAD => $URL));
$BASE = $TOP->base;
mirror(URI::URL->new($TOP->request->url));
sub mirror {
my $url = shift;
# get rid of query string "?" and fragments "#"
my $path = $url->path;
my $fixed_url = URI::URL->new ($url->scheme . '://' . $url->netloc . $path);
# make the URL relative
my $rel = $fixed_url->rel($BASE);
$rel .= 'index.html' if $rel=~m!/$! || length($rel) == 0;
# skip it if we've already done it
return if $DONE{$rel}++;
# create the directory if it doesn't exist already
my $dir = dirname($rel);
mkpath([$dir]) unless -d $dir;
# mirror the document
my $doc = $UA->mirror($fixed_url,$rel);
print STDERR "$rel: ",$doc->message,"\n";
return if $doc->is_error;
# Follow HTML documents
return unless $rel=~/\.html?$/i;
my $base = $doc->base;
# pull out the links and call us recursively
my @links = $PARSER->parse_file("$rel")->links;
my @hrefs = map { url($_->[2],$base)->abs } @links;
foreach (@hrefs) {
next unless is_child($BASE,$_);
mirror($_);
}
}
sub is_child {
my ($base,$url) = @_;
my $rel = $url->rel($base);
return ($rel ne $url) && ($rel !~ m!^[/.]!);
}
No comments:
Post a Comment