#!/usr/bin/perl -w ###################################################################### # # Program: fetch_unanswered.pl # # Retrieve articles from to which no reply has yet been posted. # Assumes that arguments passed to program are newsgroup names. # Articles are all printed to the standard output. # # options: # -j Turn off threading of articles by subject. # -m Look back at most headers/nov. # -n Fetch at most NOV records # with one request to server. # -s override default news server # -x exclude articles with header # matching pattern. # # Please send any comments to: RonaldWS@software-path.com # # A version with a reply feature exists. The reply feature is not included # here since it requires about 200 lines of additional unrelated code and # belongs in a separate script. CPAN script submission currently requires # that "It must be a single file ...". Contact the author if interested in # the reply feature. # # Written by: Ronald Schmidt, The Software Path # ###################################################################### use strict; use News::NNTPClient; use Getopt::Std; my $VERSION = '0.25'; use vars qw($opt_m $opt_j $opt_n $opt_s $opt_x $VERSION); # server will be set to (in order of decreasing priority) # -s command line parameter # NNTPSERVER environment variable # /etc/nntpserver # default set here my $server; my $default_server = '"set default_server or use -s parameter"'; my $xover_batch_size = 500; my $exclude_regex; ###################################################################### # "Nice to have" enhancements: # support for newnews # time estimation # FAQ filtering option/kill file. ###################################################################### my $news_client; my %unanswered; my %record_dup_subj; ###################################################################### # Print a status message to STDERR. If caller does not provide # line termination then terminate line with time stamp and LF("\n"). ###################################################################### sub post_console_message { print STDERR @_; print STDERR " (", scalar(localtime()), ")\n" unless ( $_[$#_] =~ /\n/ # Last parm has LF. ); } ###################################################################### # Here we remove messages with subjects that look like replies and # begin to track groups of messages with the same subject. # User may request no filter by subject. ###################################################################### sub FilterSubject { my $msg_id = shift; my $subj = lc(shift); my $has_ref = shift; $subj =~ s/^\s*//; $subj =~ s/\s*$//; # if subject filtering remove msgs with subject that looks like reply delete $unanswered{$msg_id} if ( ($subj =~ s/^re(\:?)\s+//) && (! $has_ref) ); # List of message id's by subject. Advanced technique - sorry! push @{$record_dup_subj{$subj}}, $msg_id; } ###################################################################### # Look through duplicate subject hash for cases where multiple messages # had the same subject and remove their message id's from the unanswered # list. ###################################################################### sub RemoveDuplicateSubject { foreach my $msg_id_lh (values %record_dup_subj) { if (scalar(@$msg_id_lh) > 1) { foreach my $dup_msg_id (@$msg_id_lh) { delete $unanswered{$dup_msg_id}; } } } } ###################################################################### # Use NNTP XOVER request to fetch header information needed to # determine which articles have not yet received a response. # This is one of the more efficient approaches. ###################################################################### sub SetUnansweredXover { my ($news_client, $first_num, $last_num, $batch_size) = @_; my ($batch_first, $batch_last); my $overview_fmt; my ($i, %overview_fields, $id_field, $ref_field, $subject_field); my @all_ref; $overview_fmt = $news_client->list('overview.fmt'); die $news_client->message() unless ($news_client->ok()); %overview_fields = map((uc($_), $i++), grep(s/\s*$//, @$overview_fmt)); $id_field = $overview_fields{'MESSAGE-ID:'}; $ref_field = $overview_fields{'REFERENCES:'}; $subject_field = $overview_fields{'SUBJECT:'}; for ( $batch_first = $first_num, $batch_last = $first_num + $batch_size -1; $batch_first < $last_num; $batch_first = $batch_last + 1, $batch_last = $batch_first + $batch_size -1 ) { $batch_last = $last_num if ($batch_last > $last_num); foreach my $xover_line ($news_client->xover("${batch_first}-${batch_last}")) { my ($msg_num, $msg_id, $ref, $subject) = (split /\t/, $xover_line) [0, $id_field +1, $ref_field +1, $subject_field +1]; my $has_ref = (defined($ref) && $ref); if ($has_ref) { foreach my $ref_id (split(' ', $ref)) { delete $unanswered{$ref_id}; } } else { $unanswered{$msg_id} = $msg_num; } FilterSubject($msg_id, $subject, $has_ref) unless ($opt_j); } post_console_message 'Processed requests for ', $batch_last - $first_num +1, " NOV records of ", $last_num - $first_num +1, '.'; } } ###################################################################### # Fetch each article header, one at a time, to determine which # articles have not yet received any response. # This is a very inefficient approach but does not require any # NNTP extension services. ###################################################################### sub SetUnansweredHead { my ($news_client, $first_num, $last_num) = @_; my ($article_num, $err_count); my $i = 0; for ( $article_num = $first_num; $article_num <= $last_num; $article_num++) { my $head; my ($msg_id, $ref_id); post_console_message("counting heads: $i") if ((++$i % 100)==0) ; $head = $news_client->head($article_num); unless ($news_client->ok()) { $err_count++ if ( $news_client->message() !~ /bad article number/i ); next; } ($msg_id) = grep(/Message\-ID\:/i, @$head); ($msg_id) = ($msg_id =~ /Message\-ID\: (\<.*?\>)/i); ($ref_id) = grep(/References\:/i, @$head); if (defined $ref_id) { ($ref_id) = ($ref_id =~ /References\: (\<.*?\>)/i); delete $unanswered{$ref_id}; } else { $unanswered{$msg_id} = $article_num; } unless ($opt_j) { my ($subject) = grep(/Subject\:/i, @$head); ($subject) = ($subject =~ /Subject: (.*)/i); FilterSubject($msg_id, $subject, defined($ref_id)); } } post_console_message("counting heads: $i") unless (($i % 100)==0); post_console_message("*Warning* errors: $err_count.") if ($err_count); } ###################################################################### # Here we expend too much effort to be platform independent. # We really should `cat ...` ###################################################################### sub read_etc_nntpserver { my $rc; open(FH, '); close(FH); $rc =~ s/\s*$//; return $rc || undef; } ###################################################################### # Decide whether to exclude messages. ###################################################################### sub excluded { my $msg = shift; if ($exclude_regex) { my $header = ''; for ( my $i = 0; $i < scalar(@$msg) && $msg->[$i] !~ /^\s*$/; $i++) { $header .= $msg->[$i]; } return ($header =~ $exclude_regex); } else { return 0; } } ###################################################################### # Fetch unanswered articles for one news group. ###################################################################### sub fetch_group_unanswered { my $group = shift; # counts for articles that will not be printed my ($exclude_count, $unavail_count) = (0, 0); # get news article number range my ($first_num, $last_num) = $news_client->group($group); die $news_client->message() unless ($news_client->ok()); $first_num = $last_num - $opt_m +1 if ($opt_m && ($opt_m =~ /^\d+$/)); # Test scaffolding. Under Linux this forces overview analysis to fail. # $news_client->quit(); # $news_client = new News::NNTPClient($server); # $news_client->debug(0); post_console_message('Finding unanswered articles.'); %unanswered = (); %record_dup_subj = (); ###################################################################### # The actual work of deciding which articles for the group are # unanswered is done here. ###################################################################### eval { SetUnansweredXover( $news_client, $first_num, $last_num, $xover_batch_size ); }; if ($@) { post_console_message 'Xover failed; trying one message at a ', 'time. This may take a while.', "\n"; # more test scaffolding # $news_client->mode_reader(); # $news_client->group($group); SetUnansweredHead($news_client, $first_num, $last_num); } unless ($opt_j) { RemoveDuplicateSubject(); %record_dup_subj = (); # free what may be substantial memory } ###################################################################### # End of "find unanswered" code block. ###################################################################### post_console_message('Done finding unanswered articles.'); post_console_message('Fetching ', scalar(keys %unanswered), ' unanswered articles.'); # Fetch each unanswered article from the news server # and print it to the standard output. foreach my $article_id ( sort {$unanswered{$b} <=> $unanswered{$a}} keys(%unanswered) ) { my $msg = $news_client->article($article_id); if (! $msg) { $unavail_count++; } elsif (excluded($msg)) { $exclude_count++; } else { print @$msg; } } post_console_message( "Excluded $exclude_count messages based on pattern." ) if ($exclude_count); post_console_message("$unavail_count messages were unavailable.") if ($unavail_count); post_console_message('A total of ', scalar(keys %unanswered) - $exclude_count - $unavail_count, ' available matching messages printed.') if ($exclude_count || $unavail_count); } ###################################################################### # Start of program. ###################################################################### # process command line options getopts("jm:n:rs:x:"); unless (@ARGV) { print <ok()) { $news_client->quit(); die $news_client->message(); } $news_client->debug(0); $news_client->mode_reader(); foreach my $news_group (@ARGV) { eval{fetch_group_unanswered($news_group);}; print STDERR $@ if($@); } post_console_message('Done.'); $news_client->quit(); =head1 NAME fetch_unanswered.pl - Retrieve news articles that do not have a reply. =head1 README Retrieve usenet news articles to which no reply has yet been posted. =head1 DESCRIPTION Retrieve articles from newsgroups to which no reply has yet been posted. Newsgroup names are passed as command line arguments to the program. Articles are all printed to the standard output and status messages are printed to STDERR. Usage: fetch_unanswered.pl [options] newsgroup [ngroup ...] =head1 COMMAND LINE OPTIONS =over 4 =item -j Turn off threading of articles by subject. Turning this off also saves (some) time and memory. Article threading eliminates articles starting with 'Re:' and groups of articles with the same subject. =item -m EMax headers to look back.E Look back at most -m headers/nov records. =item -n ENOV record batch sizeE Limit number of NOV records we read from server with one request. A small number will result in more frequent feedback to the user. =item -s Enews server nameE Override default news server. Default is: (in order of decreasing priority) value of NNTPSERVER environment variable value from /etc/nntpserver file value set at start of fetch_unanswered.pl source code. =item -x EpatternE Exclude messages with header matching pattern provided. Pattern is interpreted as a Perl regex with multiline and case insensitive options turned on. Exclusion is done on unanswered messages after they have been identified. E.G. to remove comp.lang.perl.misc FAQ postings try -x'from: perlfaq server' =back =head1 PREREQUISITES This script requires C and C. =pod OSNAMES any =pod SCRIPT CATEGORIES News =cut