% find_bad_links http://prego/apache-1.2/
checking http://prego/apache-1.2/...
checking http://prego/apache-1.2/manual/...
checking http://prego/apache-1.2/manual/misc/footer.html...
checking http://prego/apache-1.2/manual/misc/header.html...
checking http://prego/apache-1.2/manual/misc/nopgp.html...
checking http://www.yahoo.com/Science/Mathematics/Security_and_Encryption/...
checking http://www.eff.org/pub/EFF/Policy/Crypto/...
checking http://www.quadralay.com/www/Crypt/Crypt.html...
checking http://www.law.indiana.edu/law/iclu.html...
checking http://bong.com/~brian...
checking http://prego/apache-1.2/manual/cgi_path.html...
checking http://www.ics.uci.edu/pub/ietf/http/...
.
.
.
BAD LINKS:
manual/misc/known_bugs.html : http://www.apache.org/dist/patches/apply_to_1.2b6/
manual/misc/fin_wait_2.html : http://www.freebsd.org/
manual/misc/fin_wait_2.html : http://www.ncr.com/
manual/misc/compat_notes.html : http://www.eit.com/
manual/misc/howto.html : http://www.zyzzyva.com/robots/alert/
manual/misc/perf.html : http://www.software.hp.com/internet/perf/tuning.html
manual/misc/perf.html : http://www.qosina.com/~awm/apache/linux-tcp.html
manual/misc/perf.html : http://www.sun.com/sun-on-net/Sun.Internet.Solutions/performance/
manual/misc/perf.html : http://www.sun.com/solaris/products/siss/
manual/misc/nopgp.html : http://www.yahoo.com/Science/Mathematics/Security_and_Encryption/
152 documents checked
11 bad links
----------------------Script I.3.2 mirrorTree.pl--------------------
#!/usr/local/bin/perl
# File: find_bad_links.pl
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
%CAN_HANDLE = ('http'=>1,
'gopher'=>1,
# 'ftp'=>1, # timeout problems?
);
%OUTCOME = ();
$CHECKED = $BAD = 0;
@BAD = ();
my $URL = shift;
$UA = new LWP::UserAgent;
$PARSER = HTML::LinkExtor->new();
$TOP = $UA->request(HTTP::Request->new(HEAD => $URL));
$BASE = $TOP->base;
check_links(URI::URL->new($TOP->request->url));
if (@BAD) {
print "\nBAD LINKS:\n";
print join("\n",@BAD),"\n\n";
}
print "$CHECKED documents checked\n",scalar(@BAD)," bad links\n";
sub check_links {
my $url = shift;
my $fixed_url = $url;
$fixed_url =~ s/\#.+$//;
return 1 unless $CAN_HANDLE{$url->scheme};
# check cached outcomes
return $OUTCOME{$fixed_url} if exists $OUTCOME{$fixed_url};
print STDERR "checking $fixed_url...\n";
$CHECKED++;
my $rel = $url->rel($BASE) || 'index.html';
my $child = is_child($BASE,$url);
$UA->timeout(5);
my $doc = $d = $UA->request(HTTP::Request->new(($child ? 'GET' : 'HEAD' )=>$url));
$OUTCOME{$fixed_url} = $doc->is_success;
return $OUTCOME{$fixed_url}
unless $child && $doc->header('Content-type') eq 'text/html';
# Follow HTML documents
my $base = $doc->base;
# pull out the links and call us recursively
my @links = $PARSER->parse($doc->content)->links;
my @hrefs = map { url($_->[2],$base)->abs } @links;
foreach (@hrefs) {
next if check_links($_);
push (@BAD,"$rel : $_");
}
1;
}
sub is_child {
my ($base,$url) = @_;
my $rel = $url->rel($base);
return ($rel ne $url) && ($rel !~ m!^[/.]!);
}
No comments:
Post a Comment