#!/usr/bin/perl # # Web mirroring program - copy a remote website (possibly recursively) to a # local directory. # # Copyright 1997 Daniel V. Klein, dan@klein.com All Rights Reserved # # This program may be freely copied and distributed, and used for any and # all purposes, provided that the above copyright notice is left intact # and unaltered, and that any modifications to this program are as freely # distributable and copyable. # # # These are the tags & attributes which are significant to webmirror. # my (%collect) = ( a => "href", area => "href", img => "src", fig => "src", body => "background", form => "action", frame => "src", ); # # Don't try and collect from any of these directories, on the assumption # that they are CGI directories # my (%cgi_dir) = map { ($_, 1) } qw(cgi-bin htbin cgi usr-cgi bin); my $tmpfile = "mirror-$$.tmp"; sub help { print <<"==END=="; webmirror [-switches] URL -allow-cgi Allow traversing of what we guess are CGI dirs -auth acct:passwd Specify basic authorization -help -index_html filename Use this name instead of index.html when a directory (not a specific file) is fetched. -no-get Ignore URLs with a '?' in the path -recurse Recursively descend the file tree -start-local Use the local version of the file to start (useful when you want to mirror a site starting at a page, but you want to edit the page locally first to eliminate some links) -talkative Be very verbose -verbose Be slightly verbose -wait seconds Wait N seconds between requests (defaults to 0 seconds - hammer the remote server, 'cuz it should be able to take it). Unique abbreviations are allowed for switches. URL is usually http://... but can use any protocol or port. ==END== exit; } require 5.001; use Time::Local; use Getopt::Long; use FileHandle; use URI::URL; use HTML::LinkExtor; use HTTP::Request; use HTTP::Response; use LWP::UserAgent; STDOUT->autoflush(1); $Getopt::Long::autoabbrev = 1; GetOptions( "allow-cgi" => \$allow_cgi, "auth=s" => \$auth, "help" => \$help, "index_html=s" => \$index_html, "no-get" => \$no_GET, "recurse" => \$recurse, "start-local" => \$start_local, "talkative" => \$talkative, "verbose" => \$verbose, "wait=i" => \$wait, ); help() if $Getopt::Long::error || $help || @ARGV != 1;; $verbose ||= $talkative; $index_html ||= "index.html"; delete $collect{a} && delete $collect{area} unless $recurse; # # Routine used as callback to pull out the specified attribute from a tag # sub extract { my ($tag, %attr) = @_; return unless $attr{$collect{$tag}}; push (@xref, $attr{$collect{$tag}}); print "Found <$tag $collect{$tag}=$attr{$collect{$tag}}>\n" if $talkative; } $ua = new LWP::UserAgent; $ua->agent("WebMirror/1.0"); # # Set up the request, possibly with authorization acct/passwd # $url = new URI::URL shift; $req = new HTTP::Request(GET => $url); $req->authorization_basic(split /:/, $auth) if $auth; # # Make the parser/extractor # $extrator = new HTML::LinkExtor (\&extract); # # Request document and parse it as it arrives. Stick the incoming file in # the appropriate place when it arrives. # @xref = %done = (); print "Starting at ", $url->as_string, " " if $verbose; $response = $ua->request($req, $tmpfile); $url = new URI::URL $response->request->url->as_string; $url->path($url->path . $index_html) if substr ($url->as_string, -1) eq "/"; $done{$url->as_string}++; # And save it as done # # Find the root of the tree above which we will not look (we use the # URL in the response, because there might have been redirections). # @path = $url->path_components; $path[-1] = ""; $root = new URI::URL $url->as_string; $root->path(join "/", @path); # # Either put the just-read HTML into a file, or use the local copy to start # ($file = $url) =~ s#.*/##; if ($start_local) { unlink $tmpfile; die "No local copy of $file to start with\n" unless -e $file; } else { rename ($tmpfile, $file) or die "Cannot rename '$tmpfile' to '$file': $!\n"; print $response->code, " ", $response->message, "\n" if $verbose; } $extrator->parse_file ($file); # # Expand all image URLs to absolute ones # $base = $response->base; @xref = grep {/^$root/o} map { $_ = url($_, $base)->abs; } @xref; do_mirror (@xref); sub do_mirror { my (@list) = @_; my ($url, $request); local(@xref); # Needs to be up-level addressable by &extract FILE: for (@list) { next FILE if /\?/ && $no_GET; # No GET scripts s/#.*$//; # Eliminate in-page tag referents next FILE if $done{$_}++; # No dups next FILE if /\.cgi$/; # No CGI $url = new URI::URL $_; ($file = $url->as_string) =~ s/$root//o; next FILE unless $file; # Gotta have a file name... $file .= $index_html if substr ($file, -1) eq "/"; undef $path; while ($file =~ m#(.*?)/#g) { next FILE if $cgi_dir{$1}; $path .= $1; unless (-d $path) { print "mkdir $path\n" if $verbose; mkdir ($path, 0755) || die "Cannot mkdir $path\n"; } $path .= "/"; } # # Request document and parse it as it arrives # print $url->as_string, " " if $verbose; $request = new HTTP::Request(GET => $url); $request->authorization_basic(split /:/, $auth) if $auth; $request->url($url); if (-e $file) { my ($mtime) = (stat($file))[9]; if ($mtime) { $request->header ('If-Modified-Since' => HTTP::Date::time2str($mtime)); } } $response = $ua->request($request, $tmpfile); if ($response->is_success) { my $file_length = (stat($tmpfile))[7]; my ($content_length) = $response->header('Content-length'); if (defined $content_length and $file_length < $content_length) { warn "Transfer truncated: " . "only $file_length out of $content_length bytes received\n"; } elsif (defined $content_length and $file_length > $content_length) { warn "Content-length mismatch: " . "expected $content_length bytes, got $file_length\n"; } $target = new URI::URL $response->request->url->as_string; if (substr ($target->as_string, -1) eq "/") { mkdir ($file, 0755); # Errors will sort out later... $file = $tmpfile; # For parse_file, below } else { rename ($tmpfile, $file) or die "Cannot rename '$tmpfile' to '$file': $!\n"; } } else { unlink ($tmpfile); } print $response->code, " ", $response->message, "\n" if $verbose; # # Hunt down any recursive links (if asked to) # @xref = (); $extrator->parse_file ($file) if $recurse && !$response->is_error && $response->header('Content-type') eq "text/html"; # # Expand all image URLs to absolute ones # $base = $response->base; @xref = grep {/^$root/o} map { $_ = url($_, $base)->abs; } @xref; do_mirror (@xref); } }