#!/usr/bin/perl
use 5.005;
use strict;
use Getopt::Std;
use LWP::UserAgent;
use HTML::Parser;
use Pod::Usage;
use URI;
package HTML::Parser::Links;
use base qw(HTML::Parser);
sub new
{
my($class, $base) = @_;
my $parser = new HTML::Parser;
$parser->{base } = $base;
$parser->{links} = [];
$parser->{fragment} = {};
bless $parser, $class
}
sub start
{
my($parser, $tag, $attr, $attrseq, $origtext) = @_;
$tag eq 'base' and
$parser->{base} = $attr->{href};
$tag eq 'a' and $attr->{href} and do
{
my $base = $parser->{base};
my $href = $attr->{href};
my $uri = new_abs URI $href, $base;
push @{$parser->{links}}, $uri;
};
$tag eq 'a' and $attr->{name} and do
{
my $name = $attr->{name};
$parser->{fragment}{$name} = 1;
};
}
sub links
{
my $parser = shift;
$parser->{links}
}
sub check_fragment
{
my($parser, $fragment) = @_;
$parser->{fragment}{$fragment}
}
package Page;
sub new
{
my($package, $uri) = @_;
my $page = { uri => $uri,
base => $uri };
bless $page, $package
}
sub uri { shift->{'uri' } }
sub base { shift->{'base'} }
sub get
{
my $page = shift;
my $uri = $page->{uri};
exists $Page::Content{$uri} and
return $Page::Content{$uri};
my $ua = new LWP::UserAgent;
my $request = new HTTP::Request GET => $uri;
my $response = $ua->request($request);
$response->is_success or
return undef;
$page->{base} = $response->request->uri;
$Page::Content{$uri} = $response->content;
$response->content
}
sub parse
{
my $page = shift;
my $uri = $page->{uri};
exists $Page::Parser{$uri} and
return $Page::Parser{$uri};
my $content = $page->get;
defined $content or
return undef;
my $parser = new HTML::Parser::Links $page->base;
$parser->parse($content);
$parser->eof;
$Page::Parser{$uri} = $parser;
$parser
}
sub links
{
my $page = shift;
my $parser = $page->parse;
defined $parser or
return undef;
$parser->links
}
package Link;
sub new
{
my($package, $uri) = @_;
my $base = $uri ->clone;
my $fragment = $base->fragment(undef);
my $link = { uri => $uri,
base => $base,
fragment => $fragment };
bless $link, $package
}
sub check
{
my $link = shift;
my $uri = $link->{uri};
exists $Link::Check{$uri} and
return $Link::Check{$uri};
my $fragment = $link->{fragment};
my $check = defined $fragment ? 'check_fragment' :
'check_base';
my $success = $link->$check();
$Link::Check{$uri} = $success;
$success
}
sub check_fragment
{
my $link = shift;
my $base = $link->{base};
my $fragment = $link->{fragment};
my $page = new Page $base;
my $parser = $page->parse;
defined $parser or return '';
$parser->check_fragment($fragment)
}
sub check_base
{
my $link = shift;
my $base = $link->{base};
my $ua = new LWP::UserAgent;
my $request = new HTTP::Request HEAD => $base;
my $response = $ua->request($request);
# Some servers don't like HEAD requests
$response->code==500 and do
{
$request = new HTTP::Request GET => $base;
$response = $ua->request($request);
};
$response->is_success;
}
package Spinner;
use vars qw($N @Spin);
@Spin = ('|', '/', '-', '\\');
sub Spin
{
print STDERR $Spin[$N++], "\r";
$N==4 and $N=0;
}
package main;
my %Options;
my %Checked;
my($Scheme, $Authority);
my($Pages, $Links, $Broken) = (0, 0, 0);
getopt('vt', \%Options);
Help();
CheckPages(@ARGV);
Summary();
sub Help
{
$Options{H} and pod2usage(VERBOSE=>1);
$Options{M} and pod2usage(VERBOSE=>2);
@ARGV or pod2usage();
}
sub CheckPages
{
my @pages = @_;
my @URIs = map { new URI $_ } @pages;
for my $uri (@URIs)
{
$Scheme = $uri->scheme;
$Authority = $uri->authority;
CheckPage($uri);
}
}
sub CheckPage
{
my $uri = shift;
$Checked{$uri} and return;
$Checked{$uri} = 1;
$Pages++;
Twiddle();
print "PAGE $uri\n" if $Options{v} > 1;
my $page = new Page $uri;
my $links = $page->links;
defined $links or
die "Can't get $uri\n";
CheckLinks($page, $links);
}
sub CheckLinks
{
my($page, $links) = @_;
my @links;
for my $link (@$links)
{
$link->scheme eq 'http' or next;
my $on_site = $link->authority eq $Authority;
$on_site or $Options{o} or next;
$Links++;
Twiddle();
print "LINK $link\n" if $Options{v} > 2;
Link->new($link)->check or do
{
Report($page, $link);
next;
};
$on_site or next;
$link->fragment(undef);
push @links, $link;
}
$Options{r} or return;
for my $link (@links)
{
CheckPage($link);
}
}
sub Report
{
my($page, $link) = @_;
my $uri = $page->uri->as_string;
$link = $link ->as_string;
$Options{a} and do
{
$uri =~ s($Scheme://$Authority)();
$link =~ s($Scheme://$Authority)();
};
$Broken++;
print "BROKEN $uri -> $link\n" if $Options{v} > 0;
}
sub Twiddle
{
$Options{t}==1 and Spinner::Spin();
$Options{t}==2 and Progress();
}
sub Progress
{
print STDERR "$Pages pages, $Links links, $Broken broken\r";
}
sub Summary
{
print STDERR "Checked $Pages pages, $Links links \n";
print STDERR "Found $Broken broken links\n";
}
__END__
=head1 NAME
B<linkcheck> - check the links on an HTML page
=head1 SYNOPSIS
B<linkcheck> [-a] [-o] [-r] [B<-t> I<level>] [B<-v> I<level>] I<URI> ...
=head1 DESCRIPTION
B<linkcheck> reads the web pages at I<URI> ...,
and checks the existence of any links that it finds there.
=head1 OPTIONS
=over 4
=item B<-a>
Omit the scheme://authority part when reporting broken links.
=item B<-o>
Check off-site links.
=item B<-r>
Recursively check pages that I<URI> links to.
Doesn't recurse to off-site pages.
=item B<-t> I<level>
Indicate activity with a twiddle: 0, 1, 2
=item B<-v> I<level>
Verbosity level: 0, 1, 2, 3
=back
=head1 BUGS
=over 4
=item *
The definition of I<off-site> is too simple.
There should be a way to restrict recursion to a directory
tree within a web site.
=back
=head1 AUTHOR
Steven McDougall, swmcd@world.std.com
=head1 COPYRIGHT
Copyright 2000 by Steven McDougall. This program is free
software; you can redistribute it and/or modify it under the same
terms as Perl.