#!/usr/bin/perl
#
# pathinfo -- Scan news headers, record and print various bits of information
#
# Author: Brad Knowles
# Date:   Mon Jun 14 19:45:53 MET DST 1999
#
# Inspired by a conversation with Keni Lorber
# Additional ideas from Mark Filteau and Miquel van Smoorenburg
#
#============================================================
#
# Copyright (c) 1998 by Belgacom Skynet, all rights reserved.
#
# This program is provided as-is, with absolutely no
# warranty or suitability for any purpose implied.
#
# This program is not even guaranteed to run or compile, and
# you use it entirely at your own risk!
#
# No support will be provided, although comments and
# suggestions are appreciated, and attempts will be made to
# fold in improvements and make them available under the
# same terms as the rest of the program.
#
# This program is provided as freely available and
# distributable Perl source code only, and you are free
# to make whatever modifications you like, so long as you
# contribute them back to me and do not make the modified
# versions available to other persons without express prior
# consent.
#
#============================================================
#
# NOTE: This program is a *major* memory hog!  I use multi-
# dimensional hashes, and this program is assumed to be
# processing some fairly large log files.  A log file of
# around 95,000 lines can result in over 13MB virtual RAM
# used by the process, and 10MB actually resident.
#
# With ~500,000-800,000 articles per day on a full feed, I
# can only imagine how much RAM would be required to process
# a "Path:" header file that covers 24 hours.  And the mind
# boggles at the potential RAM requirements to process an
# entire week.
#
# It would be nice to be able to tie this hash to a dbm file.
# Although this would greatly increase the amount of disk
# I/O necessary to process the data, this would at least let
# the program run in a lot less memory.
#
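# As a rough illustration of that idea, tying %path through MLDBM
# (an untested sketch -- it assumes the MLDBM and DB_File modules
# are installed, and the database path below is only a suggestion)
# might look something like this:
#
#     use MLDBM qw(DB_File);
#     use Fcntl;
#     tie %path, 'MLDBM', '/usr/local/news/log/path/pathinfo.db',
#         O_CREAT|O_RDWR, 0644
#             or die "Can't tie %path: $!";
#
# Note that MLDBM serializes an entire second-level hash on every
# store, so updates like $path{$site}{count}++ would have to be
# rewritten to fetch the sub-hash, modify it, and store it back.
#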
# However, until then, USE AT YOUR OWN RISK!!!
#
#============================================================

############################################################
############################################################
###                                                      ###
###      DEFAULT SETTINGS FOR IMPORTANT VARIABLES        ###
###                                                      ###
############################################################
############################################################

$default_min_report ||= 35;
#
# $min_report is the minimum number of times a particular news site
# has to be seen before it can potentially show up on the "Top ##"
# reports.  We do this because we do things like calculate the mean
# and standard deviation of how many news peers it is away from us,
# and statistics like these are not meaningful if the sample size is
# not large enough.  Typically, at least thirty-two samples are
# required before the data is likely to be statistically significant,
# but we choose to use thirty-five for simplicity's sake.
#

$default_top ||= 30;
#
# $top is the maximum number of sites we'll display in our various
# reports.  If set to thirty, then we'll display the "Top 30" and
# "Bottom 30" in each category.
#

$default_headers ||= "/usr/local/news/log/path/today";
#
# $headers is assumed to be a file that contains all the "Path:"
# headers that this site has seen in the previous reporting
# period (usually a day).  If this file exists, then we use it
# to save a *BOATLOAD* of I/O that would be required to read in
# all the various news articles, parse their headers, and pull
# out the "Path:" headers ourselves.
#
# If you're using INN 2.x, you can create this file by specifying
# the following line in your "etc/newsfeeds" file (typically
# already present, merely commented out):
#
#     path!\
#         :*,!junk,!control,!control.*\
#         :Tc,WH\
#         :/usr/local/news/bin/pathgrep path.log
#
# The "pathgrep" script looks something like this:
#
#     #!/bin/sh
#     # =()<. @<_PATH_SHELLVARS>@>()=
#     . /usr/local/news/lib/innshellvars
#     PATH_LOG="$MOST_LOGS/$1"
#     $EGREP '^Path:' >> "$PATH_LOG"
#
# Or, you could create it as a direct file feed, like so:
#
#     path!:*:Tf,Ap,WP:/usr/local/news/log/path/today
#

$default_dir ||= "/var/news/spool/articles/";
#
# $dir is scanned for news articles to be parsed if the file
# named in $headers does not exist.  If $dir is a file instead,
# it is assumed to contain a list of directories/files to be
# scanned, rather than being read and parsed itself.
#

$default_file ||= "";
#$default_file ||= "/usr/local/news/etc/search.hosts";
#
# $file is assumed to contain a list of hosts for which we are
# looking for information.  This could be a list of your local
# news peers, so you can see if any of them are getting screwed
# up and you're getting news from them that is not coming in
# directly to you.  Or, it could be a list of Freenix Top 1000
# hosts that you're interested in checking out, so you can see
# which ones are "close" to you and which ones are not, and
# therefore which ones you might be interested in setting up a
# peering arrangement with.
#
# If this value is empty (the default), then we assume that you
# want to consider all sites.
#

$default_sleep ||= 0;
#
# $sleep is the amount of time we want to sleep between processing
# the files listed in $file.  Since this processing involves a huge
# amount of I/O, we may want to spread it out over a longer period
# of time and create a lower impact per unit of time on the news
# server.  We would *very* much prefer that an archive file of
# collected headers be fed to this program instead, so by default
# we assume that there shouldn't be any sleeping between files.

############################################################
############################################################
###                                                      ###
### NO MODIFICATIONS BELOW THIS LINE SHOULD BE NECESSARY ###
###                                                      ###
############################################################
############################################################

use Getopt::Std;
use Sys::Hostname;
use POSIX;
use BSD::Resource;

if (!getopts('DIVS:d:f:h:m:s:t:'))
{
    usage();
    exit 1;
}

if (defined $opt_D)
{
    $validate_dns = $opt_D;
    $validate     = $opt_D;    # -D implies -V
}
if (defined $opt_I)
{
    $skip_ip = $opt_I;
}
if (defined $opt_V)
{
    $validate = $opt_V;
}

$sleep ||= $default_sleep;
if (defined $opt_S)
{
    $sleep = $opt_S;
    if ($sleep < 0)
    {
        $sleep = 1;
    }
}

$dir ||= $default_dir;
if (defined $opt_d)
{
    $dir = $opt_d;
}

if ($default_file ne "")
{
    $file ||= $default_file;
}
if (defined $opt_f)
{
    # Honor -f even when $default_file is empty; the original guard
    # around this block silently discarded the option in the default
    # configuration.
    $file = $opt_f;
}
# Per the usage text, a missing <file> means "consider all sites",
# so drop the filter entirely unless the file actually exists.
undef $file unless (defined $file && -f $file);

$headers ||= $default_headers;
if (defined $opt_h)
{
    $headers = $opt_h;
}

$min_report ||= $default_min_report;
if (defined $opt_m)
{
    $min_report = $opt_m;
    if ($min_report < 0)
    {
        $min_report = $default_min_report;
    }
}

if (defined $opt_s)
{
    $skip = $opt_s;
}

$top ||= $default_top;
if (defined $opt_t)
{
    $top = $opt_t;
    if ($top < 0)
    {
        $top = $default_top;
    }
}

$host = hostname();
$now  = POSIX::strftime("%a %b %d %T %Z %Y", localtime());

# Run at the lowest scheduling priority; "$$self" in the original
# was a typo for $$, the PID of this process.
setpriority(PRIO_PROCESS, $$, PRIO_MAX);
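# For reference, the search-hosts file read below (via -f or
# $default_file) is nothing more than whitespace-separated hostnames,
# one or more per line; names are lowercased before matching.
# A hypothetical example:
#
#     news.example.com    feed1.example.net
#     newsfeed.example.org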
if (defined $file)
{
    open(INPUT, "< $file");
    while (<INPUT>)
    {
        tr/A-Z/a-z/;
        chop;
        @hosts = split;
        for my $i (0 .. $#hosts)
        {
            my $j = $hosts[$i];
            next if (defined $hosts{$j}{name});
            $hosts{$j}{name} = $j;
        }
    }
    close INPUT;
}

if (-f $headers)
{
    snarf($headers);
}
elsif (-f $dir)
{
    # $dir is a file containing a list of directories/files to scan.
    # (The original opened "$input" before it was ever assigned; it
    # clearly meant to open $dir.)
    open(INPUT, "< $dir");
    while (<INPUT>)
    {
        chop;
        my $myfile = $_;
        next if ((! -d $myfile) && (! -f $myfile));
        recurse($myfile);
    }
    close INPUT;
}
elsif (-d $dir)
{
    recurse($dir);
}
else
{
    print "ERROR: $headers doesn't exist and $dir is not readable, not a directory, or not a normal file.\n";
    usage();
    exit 2;
}

foreach $site (keys %path)
{
    if ($path{$site}{count} != 0)
    {
        $path{$site}{mean} = $path{$site}{distance} / $path{$site}{count};
    }
    else
    {
        $path{$site}{mean} = 0;
    }
}

print_data($top, $min_report, "Mean", sort bymean keys %path);
print_data($top, $min_report, "Reverse Mean", reverse sort bymean keys %path);
print_data($top, $min_report, "Min", sort bymin keys %path);
print_data($top, $min_report, "Reverse Min", reverse sort bymin keys %path);
print_data($top, $min_report, "Max", sort bymax keys %path);
print_data($top, $min_report, "Reverse Max", reverse sort bymax keys %path);
print_data($top, $min_report, "Count", sort bycount keys %path);
print_data($top, $min_report, "Reverse Count", reverse sort bycount keys %path);
print_data($top, $min_report, "Number of Articles Sourced", sort bysource keys %path);
print_data($top, $min_report, "Reverse Number of Articles Sourced",
    reverse sort bysource keys %path);
print_data($top, $min_report, "Total Distance", sort bydist keys %path);
print_data($top, $min_report, "Reverse Total Distance", reverse sort bydist keys %path);

sub usage
{
    my @prog = split(/\//, $0);
    my $prog = $prog[$#prog];

    print "Usage: $prog [-DIV] [-S <sleep>] [-d <dir>] [-f <file>] [-h <headers>] [-m <min>] [-s <skip>] [-t <top>]\n";
    print "\n";
    print "Where:\n";
    print "  <sleep>   = # of seconds to sleep between processing each file in <dir>\n";
    print "              (default=$default_sleep)\n";
    print "  <dir>     = /path/to/news/spool/or/subdirectory/to/be/processed\n";
    print "              or file containing list of directories or files\n";
    print "              (default=$default_dir)\n";
    print "  <file>    = File containing names of sites to search for\n";
    print "              If <file> does not exist, consider all sites\n";
    print "              (default=$default_file)\n";
    print "  <headers> = File containing archive of 'Path:' headers to be processed\n";
    print "              (default=$default_headers)\n";
    print "  <min>     = Minimum # of references required for site to show up on reports\n";
    print "              (default=$default_min_report)\n";
    print "  <skip>    = Site names to skip (substring)\n";
    print "              (default=)\n";
    print "  <top>     = Show only the top # of sites\n";
    print "              (default=$default_top)\n";
    print "\n";
    print "Use '-I' if you want to ignore raw IP addresses, including those of the format\n";
    print "  192.168.0.1.MISMATCH or 10.0.0.1.POSTED\n";
    print "Use '-V' if you want a fast but simple validation scheme\n";
    print "Use '-D' if you want to validate with DNS only those that pass the simpler test\n";
    print "\n";
    print "Notes:\n";
    print "-D implies -V, but no harm is done if you use -DV\n";
    print "-I is independent of both -D and -V\n";
    print "-h is preferred over -d.  If both are specified and exist,\n";
    print "\tthe argument to -d will be silently ignored.\n";
}

sub recurse
{
    my $dir = shift;

    if (-f $dir)
    {
        snarf($dir);
    }
    elsif (-d $dir)
    {
        opendir DIR, $dir or return;
        # These must be lexical: recurse() calls itself, and with
        # package globals each recursion would clobber the caller's
        # directory listing and loop counter.
        my @allfiles = readdir DIR;
        closedir(DIR);
        for my $i (0 .. $#allfiles)
        {
            my $name = $allfiles[$i];
            # Anchor the pattern: the original /\.\.?/ skipped any
            # name merely *containing* a dot, not just the "." and
            # ".." directory entries.
            next if $name =~ /^\.\.?$/;
            my $fullname = $dir . "/" . $name;
            if (-f $fullname)
            {
                snarf($fullname);
            }
            elsif (-d $fullname)
            {
                recurse($fullname);
            }
            if (defined $sleep)
            {
                sleep $sleep;
            }
        }
    }
    else
    {
        print "WARNING: $dir is not readable, not a directory, or not a normal file.\n";
    }
    return;
}
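#
# To make the bookkeeping in snarf() concrete: given a (hypothetical)
# header such as
#
#     Path: news.example.com!feed1.example.net!remote.example.org!not-for-mail
#
# the line is split on "!" and each element's index is taken as its
# distance from us, so news.example.com is at distance 0,
# feed1.example.net at distance 1, and so on.  A site "sources" an
# article when it is the final element of the path; the increment/
# decrement pair below counts exactly that (modulo any elements
# skipped by the filters) without having to look ahead.
#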
sub snarf
{
    my $input = shift;

    open(INPUT, "< $input");
    while (<INPUT>)
    {
        chop;
        next if !((/^[A-Za-z0-9]+[A-Za-z0-9\.\!\-]*[A-Za-z0-9]+$/o) || (/^Path: /o));
        s/^Path: //;
        s/\.(POSTED|MISMATCH)//;
        tr/A-Z/a-z/;
        @route = split(/!/);
        for $i (0 .. $#route)
        {
            $site = $route[$i];
            # Grab only those sites we're told to look for
            next if ((defined $file) && (!defined $hosts{$site}{name}));
            # Skip all of our own servers
            next if ((defined $skip) && ($site =~ $skip));
            # Skip if numeric IP
            next if ((defined $skip_ip) && ($site =~ /^[0-9\.]*$/));
            if (defined $validate)
            {
                my @tld = split(/\./, $site);
                my $j = $#tld;
                next if ($j < 1);    # Require at least one dot
                my $k = $tld[$j];
                my $l = length $k;
                next if ($l > 3 || $l < 2);    # Pretty lame check to see if the TLD is legit
            }
            if (defined $validate_dns)    # Actually look up in DNS
            {
                if (!defined $addr{$site})
                {
                    $addr{$site} = gethostbyname $site;
                }
                next if ($addr{$site} =~ /^$/);
            }
            if (!defined $path{$site})
            {
                $path{$site}{name} = $site;
            }
            $path{$site}{count}++;
            $path{$site}{source}++;
            if ($i > 0)
            {
                $path{$route[$i - 1]}{source}--;
            }
            $path{$site}{distance} += $i;
            $path{$site}{distsq}   += ($i * $i);
            # Track the minimum by definedness; the original
            # "min == 0" test would clobber a legitimate minimum
            # distance of zero.
            if (!defined $path{$site}{min} || $path{$site}{min} > $i)
            {
                $path{$site}{min} = $i;
            }
            if ($path{$site}{max} < $i)
            {
                $path{$site}{max} = $i;
            }
        }
    }
    close INPUT;
}

sub bymean   { $path{$a}{mean}     <=> $path{$b}{mean} }
sub bymin    { $path{$a}{min}      <=> $path{$b}{min} }
sub bymax    { $path{$a}{max}      <=> $path{$b}{max} }
sub bycount  { $path{$a}{count}    <=> $path{$b}{count} }
sub bydist   { $path{$a}{distance} <=> $path{$b}{distance} }
sub bysource { $path{$a}{source}   <=> $path{$b}{source} }

sub print_data
{
    my ($top, $min_report, @path, $label, $top_label, $site, $current, $sd);

    ($top, $min_report, $label, @path) = @_;

format HEADER1 =
@||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
$top_label
Site: @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Date: @>>>>>>>>>>>>>>>>>>>>>>>>>>
$host, $now
.

format SPOOL =
Spool: @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$dir
.

format HEADERS =
Header file: @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$headers
.

format FILE =
Searching for sites in: @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$file
.

format HEADER2 =
                               Articles    Site    Art.   Mean Distance
Site                            Sourced    Dist /  Count =  Dist (s.d.)  Min  Max
-------------------------------------------------------------------------------
.

format BODY =
@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @###### @###### @###### @##.## @##.## @### @###
$path{$site}{name}, $path{$site}{source}, $path{$site}{distance}, $path{$site}{count}, $path{$site}{mean}, $sd, $path{$site}{min}, $path{$site}{max}
.

format TRAILER =
.

    $~ = "HEADER1";
    $top_label = "Top $top Sites Sorted by $label";
    write;

    if ((defined $headers) && (-f $headers))
    {
        $~ = "HEADERS";
        write;
    }
    elsif ((defined $dir) && (-d $dir))
    {
        $~ = "SPOOL";
        write;
    }
    if ((defined $file) && (-f $file))
    {
        $~ = "FILE";
        write;
    }

    $~ = "HEADER2";
    write;

    $current = 0;
    $~ = "BODY";
    foreach $site (@path)
    {
        if ($path{$site}{count} >= $min_report)
        {
            if ($path{$site}{count} > 1)
            {
                # The original subtracted ($md * $md), but $md is
                # never set anywhere; the variance formula calls for
                # this site's mean distance.
                my $mean = $path{$site}{mean};
                my $var  = $path{$site}{distsq} / $path{$site}{count} - ($mean * $mean);
                $var = 0 if ($var < 0);    # guard against floating-point jitter
                $sd = sqrt($var);
            }
            else
            {
                $sd = 0;
            }
            write;
            $current++;
        }
        # ">=" rather than ">": the original printed $top + 1 sites.
        last if $current >= $top;
    }

    $~ = "TRAILER";
    write;
}
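#============================================================
#
# Example invocations (illustrative only -- the paths shown are
# just the defaults configured at the top of this script):
#
#     # Report on the day's collected "Path:" headers, showing
#     # only the top 10 sites in each category:
#     pathinfo -h /usr/local/news/log/path/today -t 10
#
#     # Walk the spool directly, skipping raw IP addresses and
#     # validating surviving names against DNS:
#     pathinfo -D -I -d /var/news/spool/articles/
#
# For the record, the per-site standard deviation printed by
# print_data() is derived from running sums as
# sqrt(E[d^2] - (E[d])^2), which is why snarf() accumulates both
# {distance} and {distsq} rather than keeping every sample.
#
#============================================================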