package Proxy::HTML; # I guess this is a bad name.... # # $Id: HTML.pm,v 1.3 1997/10/15 00:24:11 abigail Exp abigail $ # # $Log: HTML.pm,v $ # Revision 1.3 1997/10/15 00:24:11 abigail # On the fly configuration. # Host dependent image selection. # # Revision 1.2 1997/08/24 05:15:11 abigail # Configurable ignoring tags/attributes. Either unconditionally, # or by looking at the attribute value. # Rewriting tags if images are not wanted. # # Revision 1.1 1997/08/23 09:32:05 abigail # Initial revision # # # # Strip nasty tags/attributes from HTML text. use strict; use Carp; use vars qw /@ISA/; use vars qw /$IMG %NO_TAGS %NO_ATTR %NO_TAG_BY_ATTR/; use HTML::Parser; @ISA = qw /HTML::Parser/; sub deal_with_img ($$); my $INLINE = "[INLINE]"; # Text of an inline image with no alt text. # We mask new to be able to get the inline status. sub new ($$) { my $class = shift; my $self = new HTML::Parser; bless $self, $class; } # To do interesting things, we keep track of the parsed and modified # part of html text in an instance variable "mod_text". It is set to '' # by masking parse (). sub parse ($@) { my $self = shift; $self -> {mod_text} = ''; $self -> {response} = shift; $self -> {host} = $self -> {response} -> base -> host; $self -> {content} = $self -> {response} -> content; $self -> {last_tag} = 'NIL'; $self -> {src_matches} = []; foreach my $host (keys %$IMG) { if ($self -> {host} =~ $host) { $self -> {src_matches} = $IMG -> {$host}; last; } } $self -> netscape_buggy_comment (1); # Silly netscape. $self -> SUPER::parse ($self -> {content}); } # Return the modified text. sub parsed_text ($) { my $self = shift; $self -> {mod_text}; } # Now we mask the various "call back" functions. # Declarations are just copied. (But what about comments?) sub declaration ($$) { my $self = shift; my $decl = shift; $self -> {mod_text} .= ""; } # This is the interesting function. # For now, just strip font tags. sub start ($$$$$) { my $self = shift; my ($tag, $attributes, $attrseq, $origtext) = @_; # Skip multiple
s. return if ($tag eq 'br' && $self -> {last_tag} eq 'br'); $self -> {last_tag} = $tag; if ($NO_TAG_BY_ATTR {$tag}) { foreach my $attribute (keys %$attributes) { my $sub = $NO_TAG_BY_ATTR {$tag} -> {$attribute} or next; return if $sub -> ($attributes -> {$attribute}, $self); } } if ($NO_ATTR {$tag}) { foreach my $attribute (keys %$attributes) { my $sub = $NO_ATTR {$tag} -> {$attribute} or next; if ($sub -> ($attributes -> {$attribute}, $self)) { delete $attributes -> {$attribute}; } } $self -> {mod_text} .= $self -> build_tag ($tag, $attributes); return; } $self -> {mod_text} .= $origtext unless $NO_TAGS {$tag}; } sub build_tag ($$$) { my $self = shift; my ($tag, $attributes) = @_; my $line = "<$tag "; foreach my $attr (keys %$attributes) { # We lost the info whether it was single or double quote delimited. # So we take the "safe" approach and use double quote delimited, # but first we need to eliminate the existing double quotes. $attributes -> {$attr} =~ s/"/"/g; $line .= qq ($attr = "$attributes->{$attr}" ); } $line .= ">"; } # Copy end tags, except for those being filtered out. sub end ($$) { my $self = shift; my $tag = shift; do {$self -> {mod_text} .= ""; $self -> {last_tag} = 'NIL';} unless $NO_TAGS {$tag}; } # Text is just copied. sub text ($$) { my $self = shift; my $text = shift; $self -> {last_tag} = 'NIL' if $text =~ /\S/; $text =~ s/(?: ?| ?)(?: ?| ?|\s)+/ /g; $self -> {mod_text} .= $text; } # Comments are just copied. # Hmm, 'man HTML::Parser' is unclear. Comments *are* part of a document # declaration. And you can have multiple comments in a document declarion. # Hope this will work.... sub comment ($$) { my $self = shift; my $comment = shift; $self -> {mod_text} .= ""; } # Replace the image with the alt text. sub img_to_alt ($$) { my $self = shift; my $attributes = shift; defined $attributes -> {alt} ? $attributes -> {alt} : $INLINE; } sub deal_with_img ($$) { my ($src, $self) = @_; foreach my $s (@{$self -> {src_matches}}, @{$IMG -> {ALL}}) { return 1 if $src =~ /$s/; } } 1;