#!/usr/bin/perl -w # amarank -- rank amazon books # tchrist@perl.com # version 1.1, Mon Aug 31 06:48:06 MDT 1998 use strict; use Getopt::Std; use Text::Wrap qw($columns &wrap); use HTML::FormatText; use HTML::Parse; use HTML::TreeBuilder; use HTTP::Request; use HTTP::Request::Common qw(GET POST); use HTTP::Response; use LWP::UserAgent; use URI::URL; $columns = 75; sub UNKNOWN { 999999999 } sub usage { print STDERR "$0: @_\n" if @_; die < 1 ) { usage("only one of opts s, t, and s allowed."); } $which_search = $opts{"t"} ? 'title' : $opts{"s"} ? 'subject' : $opts{"a"} ? 'author' : 'title'; $search_string = "@ARGV" || "perl"; $url = "http://www.amazon.com/"; $browser = LWP::UserAgent->new(); $browser->agent("amarank/1.0"); $curreq = GET($url); $curreq->referer("http://wizard.yellowbrick.oz"); if (($response = $browser->request($curreq))->is_error()) { fdie "Failed to lookup $url: %s\n", $response->status_line; } $his_base = $response->base; dprint "His base is $his_base\n"; unless ($response->content =~ /Full search: /) { die "couldn't find full search\n"; } $url = url($1, $his_base); dprint "New url is $url\n"; $curreq = GET($url); $curreq->referer($his_base->as_string); if (($response = $browser->request($curreq))->is_error()) { fdie "Failed to lookup $url: %s\n", $response->status_line; } $his_base = $response->base; dprint "base is $his_base"; die "no title search: $content" unless $response->content() =~ m#Enter\s*Author.*?Title.*?(]*?action\s*=\s*"([^"]+)".*?)#is; $form = $1; $url = url($2, $his_base); dprint "Search is at $url\n"; $curreq = POST $url, [ "author" => $which_search eq 'author' && $search_string, "author-mode" => "full", "title" => $which_search eq 'title' && $search_string, "title-mode" => "word", "subject" => $which_search eq 'subject' && $search_string, "subject-mode" => "word", "submit" => "Search Now", ]; $curreq->referer($his_base->as_string); if (($response = $browser->request($curreq))->is_error()) { fdie "Failed to lookup $url: %s\n", $response->status_line; } $hitlist = $response->content(); dprint "base is $his_base"; while ($hitlist =~ m{ \s* (.*?) (.*) }xig ) { my($bookurl, $isbn, $title, $text) = ($1,$2,$3,$4); next if $Seen_ISBN{$isbn}++; # top few are dups $bookcount++; last if $opts{'n'} && $bookcount > $opts{'n'}; for ($title) { s/&/&/g; s/<//g; s/"/"/g; } $ISBN_Title{$isbn} = $title; print STDERR "[ISBN $isbn: $title]\n" if $opts{'v'} || !$opts{'r'}; $url = url($bookurl, $his_base)->abs . "/t"; $curreq = GET $url; $response = $browser->request($curreq); if ($response->is_error()) { printf "Failed to lookup $url: %s\n", $response->status_line; exit(1); } my $data = $response->content; $data =~ s/<\/?t[rhd].*?>//isg; # they have bad html in here my $html = parse_html($data); my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 500); my $ascii = $formatter->format($html); for ($ascii) { my($rank) = /Amazon\.com\s+Sales\s+Rank:\s*([\d,]+)/; ($ISBN_Rank{$isbn} = $rank || UNKNOWN) =~ s/,//g; s/.*\|\s*\n//s; s/\[(TABLE|FORM) NOT SHOWN\]//gs; s/Learn more about.*?ordering//si; s/-----.*$//s; s/\r//g; s/^ +$//g; s/^Our Price.*//m; s/^You Save.*//m; s/(?=List Price)/\n/; s/\n{2,}/\n/g; unless ($opts{'r'}) { $Desc{$isbn} = $_; } else { print map { wrap("", " ", $_) . "\n" } split /\n/; print "\n"; } } } exit if $opts{'r'}; for my $isbn ( sort { $ISBN_Rank{$a} <=> $ISBN_Rank{$b} } keys %ISBN_Rank ) { $_ = $Desc{$isbn}; s/(Amazon\.com\s+Sales\s+Rank:.*\n)//; print "Rank: ", ($ISBN_Rank{$isbn} == UNKNOWN) ? "Unknown" : commify($ISBN_Rank{$isbn}), "\n"; print map { wrap("", " ", $_) . "\n" } split /\n/; print "\n"; } sub commify { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text; }