#!/usr/bin/perl -w
#
# xurl - tchrist@perl.com
#
# extract urls, fixing up bases (badly).
# see lwpcook.pod for a better way to do this.
#
# for a MUCH MUCH faster version that doesn't do much
# thinking, see qxurl instead.
#
# retrieve Perl source from
#   http://www.perl.com/CPAN/src/5.0/latest.tar.gz
# retrieve the LWP library from
#   http://www.perl.com/cgi-bin/cpan_mod?module=LWP

require 5.002;

BEGIN { die "usage: $0 URL ...\n" unless @ARGV }

use strict;
use LWP 5.00;
use URI::URL;
use HTML::Parse qw(parse_html);

my $ua = new LWP::UserAgent;
my($url, %saw, @urls);

foreach $url ( @ARGV ) {
    my $res = $ua->request(HTTP::Request->new(GET => $url));
    unless ($res->is_success) {
        warn "$0: Bad URL: $url\n";
        next;
    }
    unless ($res->content_type eq 'text/html') {
        warn "$0: Not HTML: $url\n";
        next;
    }
    my $ht_tree = parse_html($res->content);
    my $base    = $res->base;
    my($linkpair, $fqurl);
    # pull link attributes out of <a> and <img> elements
    foreach $linkpair (@{$ht_tree->extract_links(qw(a img))}) {
        my($link, $elem) = @$linkpair;  # $elem (the HTML::Element) is unused here
        # resolve $link against the response base, then skip duplicates
        push(@urls, $fqurl)
            unless $saw{ $fqurl = url($link, $base)->abs->as_string }++;
    }
}

print join("\n", @urls), "\n";

#####
# use next line for uniq and sorted urls
#print join("\n", sort keys %saw), "\n";
#####
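
#####
# a rough sketch of the lwpcook.pod approach mentioned above: stream
# the document through HTML::LinkExtor instead of building a whole
# parse tree with parse_html(). HTML::LinkExtor ships with the
# LWP/HTML-Parser distribution; the variable names below are
# illustrative, not taken from lwpcook itself.
#
# use HTML::LinkExtor;
#
# my @links;
# my $parser = HTML::LinkExtor->new(
#     sub {
#         my($tag, %attr) = @_;        # e.g. ("a", href => "...")
#         push @links, values %attr;   # keep every link attribute
#     }
# );
# $parser->parse($res->content);
# $parser->eof;
# my @absolute = map { url($_, $res->base)->abs->as_string } @links;
#####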