#!/usr/bin/perl

use 5.005;
use strict;
use Getopt::Std;
use LWP::UserAgent;
use HTML::Parser;
use Pod::Usage;
use URI;

package HTML::Parser::Links;

use base qw(HTML::Parser);

sub new
{
    my($class, $base) = @_;

    my $parser = new HTML::Parser;
    $parser->{base }    = $base;
    $parser->{links}    = [];
    $parser->{fragment} = {};

    bless $parser, $class
}


sub start
{
    my($parser, $tag, $attr, $attrseq, $origtext) = @_;

    $tag eq 'base' and 
	$parser->{base} = $attr->{href};

    $tag eq 'a' and $attr->{href} and do
    {
	my $base = $parser->{base};
	my $href = $attr->{href};
	my $uri  = new_abs URI $href, $base;
	push @{$parser->{links}}, $uri;
    };

    $tag eq 'a' and $attr->{name} and do
    {
	my $name = $attr->{name};
	$parser->{fragment}{$name} = 1;
    };
}


sub links
{
    my $parser = shift;
    $parser->{links}
}


sub check_fragment
{
    my($parser, $fragment) = @_;
    $parser->{fragment}{$fragment}
}


package Page;


sub new
{
    my($package, $uri) = @_;

    my $page = { uri  => $uri,
	         base => $uri };

    bless $page, $package
}


sub uri  { shift->{'uri' } }
sub base { shift->{'base'} }


sub get
{
    my $page = shift;
    my $uri  = $page->{uri};

    exists $Page::Content{$uri} and 
	return $Page::Content{$uri};

    my $ua = new LWP::UserAgent;

    # WEBY hack
    my $weby_METHOD;
    if($uri !~ /\.(jpg|jpeg|png|gif|avi|mpeg|mpg|swf|mp3|wave|gz|tar|zip|rar|iso)$/i) { $weby_METHOD='GET'; }
    else { $weby_METHOD='HEAD'; }
    #print "# method: $weby_METHOD...$uri\n" $Options{v} > 2;
    # end of weby hack
 
    #return 1 if $uri =~ /\#$/;
   
    my $request  = new HTTP::Request $weby_METHOD => $uri;
    my $response = $ua->request($request);
       $response->is_success or
	   return undef;

    $page->{base} = $response->request->uri;

    $Page::Content{$uri} = $response->content;
    $response->content;
}


sub parse
{
    my $page = shift;
    my $uri  = $page->{uri};
    exists $Page::Parser{$uri} and 
	return $Page::Parser{$uri};

    my $content = $page->get;
    defined $content or 
	return undef;

    my $parser = new HTML::Parser::Links $page->base;
       $parser->parse($content);
       $parser->eof;

    $Page::Parser{$uri} = $parser;
    $parser
}


sub links
{
    my $page   = shift;
    my $parser = $page->parse;
    defined $parser or 
	return undef;

    $parser->links
}


package Link;

sub new
{
    my($package, $uri) = @_;

    my $base     = $uri ->clone;
    my $fragment = $base->fragment(undef);
    
    my $link = { uri      => $uri,
	         base     => $base,
	         fragment => $fragment };

    bless $link, $package
}


sub check
{
    my $link = shift;
    my $uri  = $link->{uri};
    
    
    exists $Link::Check{$uri} and 
	return $Link::Check{$uri};
    
    my $fragment = $link->{fragment};
    my $check    = defined $fragment ? 'check_fragment' : 
				       'check_base';
    my $success  = $link->$check();

    $Link::Check{$uri} = $success;
    $success
}


sub check_fragment
{
    my $link     = shift;
    my $base     = $link->{base};
    my $fragment = $link->{fragment};

    my $page     = new Page $base;
    my $parser   = $page->parse;
    defined $parser or return '';

    $parser->check_fragment($fragment)
}


sub check_base
{
    my $link = shift;
    my $base = $link->{base};

    my $ua       = new LWP::UserAgent;
    my $request  = new HTTP::Request HEAD => $base;
    my $response = $ua->request($request);

    # Some servers don't like HEAD requests
    $response->code==500 and do
    {
	$request  = new HTTP::Request GET => $base;
	$response = $ua->request($request);
    };

    $response->is_success;
}


package Spinner;

use vars qw($N @Spin);

@Spin = ('|', '/', '-', '\\');

sub Spin
{
    print STDERR $Spin[$N++], "\r";
    $N==4 and $N=0;
}



package main;

my %Options;
my %Checked;
my($Scheme, $Authority);
my($Pages, $Links, $Broken) = (0, 0, 0);

getopt('vt', \%Options);
Help();
CheckPages(@ARGV);
Summary();


sub Help
{
    $Options{H} and pod2usage(VERBOSE=>1);
    $Options{M} and pod2usage(VERBOSE=>2);
    @ARGV       or  pod2usage();
}


sub CheckPages
{
    my @pages = @_;
    my @URIs  = map { new URI $_ } @pages;

    for my $uri (@URIs)
    {
	$Scheme    = $uri->scheme;
	$Authority = $uri->authority;
	CheckPage($uri);
    }
}


sub CheckPage
{
    my $uri = shift;
    
    $Checked{$uri} and return;
    $Checked{$uri} = 1;
    $Pages++;
    
    Twiddle();

    print "PAGE $uri\n" if $Options{v} > 1;

    my $page  = new Page $uri;
    my $links = $page->links;
    defined $links or 
	die "Can't get $uri\n";

    CheckLinks($page, $links);
}


sub CheckLinks
{
    my($page, $links) = @_;
    my @links;

    for my $link (@$links)
    {
	# skip ANCHORS! (weby's hack)
	next if $link =~ /\#/;

	$link->scheme eq 'http' or next;
	my $on_site = $link->authority eq $Authority;
	$on_site or $Options{o} or next;

	$Links++;
	Twiddle();
	print "LINK $link\n" if $Options{v} > 2;
	Link->new($link)->check or do
	{
	    Report($page, $link);
	    next;
	};

	$on_site or next;
	$link->fragment(undef);
	push @links, $link;
    }

    $Options{r} or return;

    for my $link (@links)
    {
	CheckPage($link);
    }
}


sub Report
{
    my($page, $link) = @_;

    my $uri  = $page->uri->as_string;
       $link = $link     ->as_string;

    $Options{a} and do
    {
	$uri  =~ s($Scheme://$Authority)();
	$link =~ s($Scheme://$Authority)();
    };

    $Broken++;
    print "[BROKEN] $link\n" if $Options{v} > 0;
}


sub Twiddle
{
    $Options{t}==1 and Spinner::Spin();
    $Options{t}==2 and Progress();
}

sub Progress
{
    print STDERR "$Pages pages, $Links links, $Broken broken\r";
}

sub Summary
{
    print STDERR "Checked $Pages pages, $Links links          \n";
    print STDERR "Found $Broken broken links\n";
}

__END__

=head1 NAME

B<linkcheck> - check the links on an HTML page

=head1 SYNOPSIS

B<linkcheck> [-a] [-o] [-r] [B<-t> I<level>] [B<-v> I<level>] I<URI> ...

=head1 DESCRIPTION

B<linkcheck> reads the web pages at I<URI> ...,
and checks the existence of any links that it finds there.

=head1 OPTIONS

=over 4

=item B<-a>

Omit the scheme://authority part when reporting broken links.

=item B<-o>

Check off-site links.

=item B<-r>

Recursively check pages that I<URI> links to.
Doesn't recurse to off-site pages.

=item B<-t> I<level>

Indicate activity with a twiddle: 0, 1, 2

=item B<-v> I<level>

Verbosity level: 0, 1, 2, 3

=back

=head1 BUGS

=over 4

=item *

The definition of I<off-site> is too simple.
There should be a way to restrict recursion to a directory
tree within a web site.

=back

=head1 AUTHOR

Steven McDougall, swmcd@world.std.com

=head1 COPYRIGHT

Copyright 2000 by Steven McDougall. This program is free
software; you can redistribute it and/or modify it under the same
terms as Perl.