#!/usr/bin/perl
# #############################################################################
# weby rulez
# #############################################################################
use strict;
use File::Path;
use LWP::UserAgent;
use Getopt::Long;
my $url;
my $depth = 255;
my $save;
my $help;
my $result=GetOptions ( "url=s" => \$url, # string
"depth=i" => \$depth, # digit
"save" => \$save, # flag
"h|help" => \$help);
&usage if $help;
&usage unless $url;
my $URL = $url;
my $DEPTH = $depth;
my $SAVE = $save;
mkdir($URL);
chdir($URL);
my @html = get_hmtl($URL);
my $link_ref = extract_links(\@html);
# store index page
save(\@html, "index.html") if $SAVE;
# free array html
@html = undef;
my %visited;
fuck_url($link_ref);
sub usage
{
print "
USAGE:
$0 [url] [save] [depth]
";
exit;
}
sub fuck_url
{
my $link_ref = shift;
for my $link (@{ $link_ref} ) {
# skip already visited pages
next if exists $visited{ $link };
my $link_depth = link_depth($link);
# skip this link if dpth is defined
next if $link_depth > $DEPTH;
# processing the alowed links here
print "link [$link_depth]: $link \n";
# save the file if we have this command
my @link_html = get_hmtl($link);
my $l_ref = extract_links(\@link_html);
# Save page if needed
&save(\@link_html, $link) if $SAVE;
# save the page to visited
$visited{ $link } = 1;
# recurse
fuck_url($l_ref);
}
}
sub link_depth
{
my $link = shift;
# remove http if we have in link
if ( $link =~ /^http:\/\// ) { $link =~ s/^http:\/\/$URL//; }
# make a split by SLASH
my @arr = split '/', $link;
# remove first element because it will be empty
shift @arr;
return scalar @arr || 1;
}
sub extract_links
{
# get params here in reference
my $arr_ref = shift;
# init the empty array to save the links and return it
my @links;
# processing the HTML array
foreach my $line ( @{ $arr_ref } ) {
# pass trough regexpression using REGEXP CLASS to extract the link
if ( $line =~ /href=(\"|\')([^\s\"\']+)/i ) {
my $link = $2;
# eliminate other site links
next if $link =~ /^mailto/i;
if ( $link =~ /^http:\/\//i) {
next unless $link =~ $URL;
}
# remove HTML anchors if exists
if ( $link =~ /#/) {
$link =~ /^([^\#]+)/;
$link = $1;
}
# add domain if it is relative path
if ( $link =~ /^\//) {
$link = "http://$URL$link";
}
push @links, $link;
}
}
# return array with saved links
return \@links;
}
sub get_hmtl
{
my $url = shift;
# prepare URL
$url = "http://$url" unless $url =~ /^http:\/\//i;
# init our USER AGENT
my $ua = LWP::UserAgent->new;
$ua->agent("MyApp/0.1 ");
# make HTTP request
my $req = HTTP::Request->new(GET => $url);
my $res = $ua->request($req);
# return data in array or empty array to avoid the errors
return ($res->is_success) ? split "\n", $res->content : ();
}
sub save
{
my $arr_ref = shift;
my $fname = shift;
if ( $fname =~ /^http:\/\// ) { $fname =~ s/^http:\/\///; }
if ( $fname =~ /$URL/) { $fname =~ s/[^\/]+//; }
$fname =~ s/^\/// if $fname =~ /^\//;
# we have no extension
if ( $fname =~ /.*?\.\w{3,4}$/) {
# we have sub dirs to make
if ($fname =~ /\//) {
my @arr = split '/', $fname;
pop @arr;
mkpath(join '/', @arr) unless -d join '/', @arr;
}
}
else {
mkdir($fname) unless -d $fname;
}
$fname = $fname . "index.html" unless $fname;
$fname = $fname . "index.html" if $fname =~ /\/$/;
$fname = $fname . "index.html" if $fname =~ /\/(\.)$/;
$fname = $fname . "/index.html" if -d $fname;
print "\tsaving $fname\n\n";
# we already have this file
return if -f $fname;
# open file for writing as new
open FH, "> $fname" or die "Saving to $fname error: $!\n";
print FH join '', @{ $arr_ref };
close FH;
return 1;
}