#!/usr/bin/perl
#
# Web mirroring program - copy a remote website (possibly recursively) to a
# local directory.
#
# Copyright 1997 Daniel V. Klein, dan@klein.com  All Rights Reserved
#
# This program may be freely copied and distributed, and used for any and
# all purposes, provided that the above copyright notice is left intact
# and unaltered, and that any modifications to this program are as freely
# distributable and copyable.
#

#
# These are the tags & attributes which are significant to webmirror.
#
#
# Map each HTML tag we care about to the one attribute whose value is a
# URL worth mirroring.  &extract consults this table for every tag seen.
#
my %collect = (
	a	=> "href",
	area	=> "href",
	img	=> "src",
	fig	=> "src",
	body	=> "background",
	form	=> "action",
	frame	=> "src",
);
#
# Don't try and collect from any of these directories, on the assumption
# that they are CGI directories
#
my %cgi_dir;
$cgi_dir{$_} = 1 for qw(cgi-bin htbin cgi usr-cgi bin);

# Scratch file for each in-flight download, made unique by our PID.
my $tmpfile = "mirror-$$.tmp";

#
# Print the usage message and exit.  Called for -help, on option parse
# errors, and when the single URL argument is missing.  Never returns.
#
sub help {
    print <<"==END==";

webmirror [-switches] URL
	-allow-cgi		Allow traversing of what we guess are CGI dirs

	-auth acct:passwd	Specify basic authorization

	-help

	-index_html filename	Use this name instead of index.html when a
				directory (not a specific file) is fetched.

	-no-get			Ignore URLs with a '?' in the path

	-recurse		Recursively descend the file tree

	-start-local		Use the local version of the file to start
				(useful when you want to mirror a site
				starting at a page, but you want to edit the
				page locally first to eliminate some links)

	-talkative		Be very verbose

	-verbose		Be slightly verbose

	-wait seconds		Wait N seconds between requests (defaults to
				0 seconds - hammer the remote server, 'cuz it
				should be able to take it).

    Unique abbreviations are allowed for switches.  URL is usually http://...
    but can use any protocol or port.

==END==
    exit;
}

require 5.001;
use Time::Local;
use Getopt::Long;
use FileHandle;
use URI::URL;
use HTML::LinkExtor;
use HTTP::Request;
use HTTP::Response;
use LWP::UserAgent;

# Unbuffered progress output; allow unique switch abbreviations.
STDOUT->autoflush(1);
$Getopt::Long::autoabbrev = 1;

GetOptions(
	"allow-cgi"	=> \$allow_cgi,
	"auth=s"	=> \$auth,
	"help"		=> \$help,
	"index_html=s"	=> \$index_html,
	"no-get"	=> \$no_GET,
	"recurse"	=> \$recurse,
	"start-local"	=> \$start_local,
	"talkative"	=> \$talkative,
	"verbose"	=> \$verbose,
	"wait=i"	=> \$wait,
    );

# Usage (and exit) on parse error, explicit -help, or wrong argument count.
help()	if $Getopt::Long::error || $help || @ARGV != 1;
$verbose ||= $talkative;		# -talkative implies -verbose
$index_html ||= "index.html";

# Without -recurse we only fetch embedded objects (images, backgrounds...),
# not links to other pages.  Delete both keys in one slice so neither delete
# depends on the other's return value.
delete @collect{qw(a area)}	unless $recurse;
#
# Routine used as callback to pull out the specified attribute from a tag
#
#
# Callback handed to HTML::LinkExtor: given a tag name and its attributes,
# save the value of the one attribute %collect says matters for that tag
# (if present) onto @xref for later mirroring.
#
sub extract {
    my ($tag, %attrs) = @_;
    my $want = $collect{$tag};
    my $value = $attrs{$want};
    return unless $value;
    push (@xref, $value);
    print "Found <$tag $want=$value>\n"	if $talkative;
    }

$ua = new LWP::UserAgent;
$ua->agent("WebMirror/1.0");
#
# Set up the request, possibly with authorization acct/passwd
#
$url = new URI::URL shift;
$req = new HTTP::Request(GET => $url);
$req->authorization_basic(split /:/, $auth)	if $auth;
#
# Make the parser/extractor
#
$extrator = new HTML::LinkExtor (\&extract);
#
# Request document and parse it as it arrives.  Stick the incoming file in
# the appropriate place when it arrives.
#
@xref = %done = ();
print "Starting at ", $url->as_string, " "	if $verbose;
$response = $ua->request($req, $tmpfile);
# Use the URL from the response, not the one we asked for, in case the
# server redirected us.  A trailing "/" means a directory index was fetched,
# so give it the configured index filename.
$url = new URI::URL $response->request->url->as_string;
$url->path($url->path . $index_html)	if substr ($url->as_string, -1) eq "/";
$done{$url->as_string}++;			# And save it as done
#
# Find the root of the tree above which we will not look (we use the
# URL in the response, because there might have been redirections).
#
@path = $url->path_components;
$path[-1] = "";		# Drop the filename, keeping only the directory part
$root = new URI::URL $url->as_string;
$root->path(join "/", @path);
#
# Either put the just-read HTML into a file, or use the local copy to start
#
($file = $url) =~ s#.*/##;	# Local name is the URL's last path component
if ($start_local) {
    unlink $tmpfile;
    die "No local copy of $file to start with\n"	unless -e $file;
    }
else {
    rename ($tmpfile, $file) or
		    die "Cannot rename '$tmpfile' to '$file': $!\n";
    print $response->code, " ", $response->message, "\n"	if $verbose;
    }
# Parse the starting page; &extract pushes interesting URLs onto @xref.
$extrator->parse_file ($file);
#
# Expand all image URLs to absolute ones
#
$base = $response->base;
# Make each reference absolute relative to the response base, then keep
# only those at or below $root.
@xref = grep {/^$root/o} map { $_ = url($_, $base)->abs; } @xref;

do_mirror (@xref);

#
# Mirror a list of absolute URLs (already restricted to those under $root).
# For each URL: map it to a local path, create intermediate directories,
# fetch it (conditionally, via If-Modified-Since when a local copy exists),
# and - when -recurse was given and the result is HTML - extract its links
# and mirror those too, recursively.
#
sub do_mirror {
    my (@list) = @_;
    my ($url, $request, $target);
    local(@xref);	# Needs to be up-level addressable by &extract

    FILE: for (@list) {
	next FILE	if /\?/ && $no_GET;	# No GET scripts
	s/#.*$//;			# Eliminate in-page tag referents
	next FILE	if $done{$_}++;	# No dups
	next FILE	if /\.cgi$/;	# No CGI
	$url = new URI::URL $_;
	# Local file path is the URL path relative to the mirror root
	($file = $url->as_string) =~ s/$root//o;
	next FILE	unless $file;	# Gotta have a file name...
	$file .= $index_html	if substr ($file, -1) eq "/";
	#
	# Walk the directory components of the relative path, skipping
	# anything that looks like a CGI directory and creating local
	# directories as needed.
	#
	my $path = "";
	while ($file =~ m#(.*?)/#g) {
	    next FILE if $cgi_dir{$1};
	    $path .= $1;
	    unless (-d $path) {
		print "mkdir $path\n"	if $verbose;
		mkdir ($path, 0755) || die "Cannot mkdir $path\n";
		}
	    $path .= "/";
	    }
	#
	# Request document and parse it as it arrives
	#
	print $url->as_string, " "		if $verbose;
	$request = new HTTP::Request(GET => $url);
	$request->authorization_basic(split /:/, $auth)	if $auth;
	$request->url($url);
	# If we already hold a copy, only transfer it if it has changed
	if (-e $file) {
	    my ($mtime) = (stat($file))[9];
	    if ($mtime) {
		$request->header ('If-Modified-Since' =>
				 HTTP::Date::time2str($mtime));
		}
	    }
	$response = $ua->request($request, $tmpfile);
	if ($response->is_success) {
	    #
	    # Sanity-check what landed on disk against the Content-length
	    # header, when the server supplied one.
	    #
	    my $file_length = (stat($tmpfile))[7];
	    my ($content_length) = $response->header('Content-length');

	    if (defined $content_length and $file_length < $content_length) {
		warn "Transfer truncated: " .
		    "only $file_length out of $content_length bytes received\n";
		}
	    elsif (defined $content_length and $file_length > $content_length) {
		warn "Content-length mismatch: " .
		    "expected $content_length bytes, got $file_length\n";
		}
	    $target = new URI::URL $response->request->url->as_string;
	    if (substr ($target->as_string, -1) eq "/") {
		mkdir ($file, 0755);	# Errors will sort out later...
		$file = $tmpfile;		# For parse_file, below
		}
	    else {
		rename ($tmpfile, $file) or
		    die "Cannot rename '$tmpfile' to '$file': $!\n";
		}
	    }
	else {
	    # Failure (or 304 Not Modified) - discard the scratch file and
	    # leave any existing local copy in place for parsing below.
	    unlink ($tmpfile);
	    }
	print $response->code, " ", $response->message, "\n"	if $verbose;
	#
	# Hunt down any recursive links (if asked to).  Test content_type
	# (the media type with parameters stripped) rather than the raw
	# Content-type header, which may carry "; charset=..." and would
	# never compare equal to plain "text/html".
	#
	@xref = ();
	$extrator->parse_file ($file)
		if $recurse && !$response->is_error &&
		    ($response->content_type || '') eq "text/html";
	#
	# Expand all image URLs to absolute ones, keeping only those below
	# the mirror root, then recurse on whatever we found.
	#
	$base = $response->base;
	@xref = grep {/^$root/o} map { $_ = url($_, $base)->abs; } @xref;
	do_mirror (@xref);
	}
    }