package Proxy::HTML; # I guess this is a bad name....
#
# $Id: HTML.pm,v 1.3 1997/10/15 00:24:11 abigail Exp abigail $
#
# $Log: HTML.pm,v $
# Revision 1.3 1997/10/15 00:24:11 abigail
# On the fly configuration.
# Host dependent image selection.
#
# Revision 1.2 1997/08/24 05:15:11 abigail
# Configurable ignoring tags/attributes. Either unconditionally,
# or by looking at the attribute value.
# Rewriting tags if images are not wanted.
#
# Revision 1.1 1997/08/23 09:32:05 abigail
# Initial revision
#
#
#
# Strip nasty tags/attributes from HTML text.
use strict;
use Carp;
use vars qw /@ISA/;
use vars qw /$IMG %NO_TAGS %NO_ATTR %NO_TAG_BY_ATTR/;
use HTML::Parser;
@ISA = qw /HTML::Parser/;
sub deal_with_img ($$);
my $INLINE = "[INLINE]"; # Text of an inline image with no alt text.
# We mask new to be able to get the inline status.
sub new ($$) {
my $class = shift;
my $self = new HTML::Parser;
bless $self, $class;
}
# To do interesting things, we keep track of the parsed and modified
# part of html text in an instance variable "mod_text". It is set to ''
# by masking parse ().
sub parse ($@) {
my $self = shift;
$self -> {mod_text} = '';
$self -> {response} = shift;
$self -> {host} = $self -> {response} -> base -> host;
$self -> {content} = $self -> {response} -> content;
$self -> {last_tag} = 'NIL';
$self -> {src_matches} = [];
foreach my $host (keys %$IMG) {
if ($self -> {host} =~ $host) {
$self -> {src_matches} = $IMG -> {$host};
last;
}
}
$self -> netscape_buggy_comment (1); # Silly netscape.
$self -> SUPER::parse ($self -> {content});
}
# Return the modified text.
sub parsed_text ($) {
my $self = shift;
$self -> {mod_text};
}
# Now we mask the various "call back" functions.
# Declarations are just copied. (But what about comments?)
sub declaration ($$) {
my $self = shift;
my $decl = shift;
$self -> {mod_text} .= "";
}
# This is the interesting function.
# For now, just strip font tags.
sub start ($$$$$) {
my $self = shift;
my ($tag, $attributes, $attrseq, $origtext) = @_;
# Skip multiple
s.
return if ($tag eq 'br' && $self -> {last_tag} eq 'br');
$self -> {last_tag} = $tag;
if ($NO_TAG_BY_ATTR {$tag}) {
foreach my $attribute (keys %$attributes) {
my $sub = $NO_TAG_BY_ATTR {$tag} -> {$attribute} or next;
return if $sub -> ($attributes -> {$attribute}, $self);
}
}
if ($NO_ATTR {$tag}) {
foreach my $attribute (keys %$attributes) {
my $sub = $NO_ATTR {$tag} -> {$attribute} or next;
if ($sub -> ($attributes -> {$attribute}, $self)) {
delete $attributes -> {$attribute};
}
}
$self -> {mod_text} .= $self -> build_tag ($tag, $attributes);
return;
}
$self -> {mod_text} .= $origtext unless $NO_TAGS {$tag};
}
sub build_tag ($$$) {
my $self = shift;
my ($tag, $attributes) = @_;
my $line = "<$tag ";
foreach my $attr (keys %$attributes) {
# We lost the info whether it was single or double quote delimited.
# So we take the "safe" approach and use double quote delimited,
# but first we need to eliminate the existing double quotes.
$attributes -> {$attr} =~ s/"/"/g;
$line .= qq ($attr = "$attributes->{$attr}" );
}
$line .= ">";
}
# Copy end tags, except for those being filtered out.
sub end ($$) {
my $self = shift;
my $tag = shift;
do {$self -> {mod_text} .= "$tag>";
$self -> {last_tag} = 'NIL';} unless $NO_TAGS {$tag};
}
# Text is just copied.
sub text ($$) {
my $self = shift;
my $text = shift;
$self -> {last_tag} = 'NIL' if $text =~ /\S/;
$text =~ s/(?: ?| ?)(?: ?| ?|\s)+/ /g;
$self -> {mod_text} .= $text;
}
# Comments are just copied.
# Hmm, 'man HTML::Parser' is unclear. Comments *are* part of a document
# declaration. And you can have multiple comments in a document declarion.
# Hope this will work....
sub comment ($$) {
my $self = shift;
my $comment = shift;
$self -> {mod_text} .= "";
}
# Replace the image with the alt text.
sub img_to_alt ($$) {
my $self = shift;
my $attributes = shift;
defined $attributes -> {alt} ? $attributes -> {alt} : $INLINE;
}
sub deal_with_img ($$) {
my ($src, $self) = @_;
foreach my $s (@{$self -> {src_matches}}, @{$IMG -> {ALL}}) {
return 1 if $src =~ /$s/;
}
}
1;