lfinder.pl

#!/usr/bin/perl

# #############################################################################
# weby rulez
# #############################################################################

use strict;
use File::Path;
use LWP::UserAgent;
use Getopt::Long;

my $url;
my $depth = 255;
my $save;
my $help;

my $result=GetOptions ( "url=s"   => \$url,     # string
			"depth=i" => \$depth,   # digit
			"save"    => \$save,    # flag
			"h|help"  => \$help);

&usage if $help;
&usage unless $url;

my $URL   = $url;
my $DEPTH = $depth;
my $SAVE  = $save;

mkdir($URL);
chdir($URL);

my @html     = get_hmtl($URL);
my $link_ref = extract_links(\@html);

# store index page
save(\@html, "index.html") if $SAVE;

# free array html
@html = undef;
my %visited;

fuck_url($link_ref);

sub usage
{
	print "
USAGE:
	
$0 [url] [save] [depth]
	
";
	exit;
}
sub fuck_url 
{
	my $link_ref = shift;
	for my $link (@{ $link_ref} ) {
	
		# skip already visited pages
		next if exists $visited{ $link };
	
		my $link_depth = link_depth($link);
	
		# skip this link if dpth is defined
		next if $link_depth > $DEPTH;
		
		# processing the alowed links here 
		print "link [$link_depth]: $link \n";
		
		# save the file if we have this command
		my @link_html = get_hmtl($link);
		my $l_ref     = extract_links(\@link_html);

		# Save page if needed
		&save(\@link_html, $link) if $SAVE;
		
		# save the page to visited
		$visited{ $link } = 1;
		
		# recurse 
		fuck_url($l_ref);
	}
}


sub link_depth
{
	my $link = shift;

	# remove http if we have in link
	if ( $link =~ /^http:\/\// ) { $link =~ s/^http:\/\/$URL//; }
	
	# make a split by SLASH
	my @arr = split '/', $link;
	
	# remove first element because it will be empty
	shift @arr;
	
	return scalar @arr || 1;
}
sub extract_links
{
	# get params here in reference
	my $arr_ref = shift;
	
	# init the empty array to save the links and return it
	my @links;
	
	# processing the HTML array
	foreach my $line ( @{ $arr_ref } ) {

		# pass trough regexpression using REGEXP CLASS to extract the link
		if ( $line =~ /href=(\"|\')([^\s\"\']+)/i ) {
			my $link = $2; 
			
			# eliminate other site links
			next if $link =~ /^mailto/i;

			if ( $link =~ /^http:\/\//i) {
				next unless $link =~ $URL;
			}
			
			# remove HTML anchors if exists
			if ( $link =~ /#/) {
				$link =~ /^([^\#]+)/;
				$link = $1;
			}
			
			# add domain if it is relative path
			if ( $link =~ /^\//) {
				$link = "http://$URL$link";
			}
			
			push @links, $link;
		}
	}

	# return array with saved links
	return \@links;
}
sub get_hmtl 
{
	my $url = shift;
	
	# prepare URL
	$url = "http://$url" unless $url =~ /^http:\/\//i;
	
	# init our USER AGENT
	my $ua = LWP::UserAgent->new;
	$ua->agent("MyApp/0.1 ");
	
	# make HTTP request
	my $req = HTTP::Request->new(GET => $url);
	my $res = $ua->request($req);
	
	# return data in array or empty array to avoid the errors
	return ($res->is_success) ? split "\n", $res->content : ();
}
sub save
{
	my $arr_ref = shift;
	my $fname   = shift;

	if ( $fname =~ /^http:\/\// ) { $fname =~ s/^http:\/\///; }
	if ( $fname =~ /$URL/)        { $fname =~ s/[^\/]+//;     }
	
	$fname =~ s/^\/// if $fname =~ /^\//;
	
	# we have no extension
	if ( $fname =~ /.*?\.\w{3,4}$/) {
		# we have sub dirs to make
		if ($fname =~ /\//) {
			my @arr = split '/', $fname;
			pop @arr;
			mkpath(join '/', @arr) unless -d join '/', @arr;
		}
	}
	else {
		mkdir($fname) unless -d $fname;
	}
	
	$fname = $fname . "index.html"  unless $fname;
	$fname = $fname . "index.html"  if $fname =~ /\/$/;
	$fname = $fname . "index.html"  if $fname =~ /\/(\.)$/;
	$fname = $fname . "/index.html" if -d $fname;
	
	
	print "\tsaving $fname\n\n";
	
	# we already have this file
	return if -f $fname;
	
	# open file for writing as new
	open  FH, "> $fname" or die "Saving to $fname error: $!\n";
	print FH join '', @{ $arr_ref };
	close FH;
	return 1;
}