#!/usr/bin/perl -p0777 # ######################################################### # striphtml ("striff tummel") # tchrist@perl.com # version 1.0: Thu 01 Feb 1996 1:53:31pm MST # version 1.1: Sat Feb 3 06:23:50 MST 1996 # (fix up comments in annoying places) ######################################################### # # how to strip out html comments and tags and transform # entities in just three -- count 'em three -- substitutions; # sed and awk eat your heart out. :-) # # as always, translations from this nacré rendition into # more characteristically marine, herpetoid, titillative, # or indonesian idioms are welcome for the furthering of # comparitive cyberlinguistic studies. # ######################################################### require 5.002; # for nifty embedded regexp comments ######################################################### # first we'll shoot all the ######################################################### s{ # up to a `>' }{ if ($1 || $3) { # this silliness for embedded comments in tags ""; } }gesx; # mutate into nada, nothing, and niente ######################################################### # next we'll remove all the ######################################################### s{ < # opening angle bracket (?: # Non-backreffing grouping paren [^>'"] * # 0 or more things that are neither > nor ' nor " | # or else ".*?" # a section between double quotes (stingy match) | # or else '.*?' # a section between single quotes (stingy match) ) + # repetire ad libitum # hm.... are null tags <> legal? XXX > # closing angle bracket }{}gsx; # mutate into nada, nothing, and niente ######################################################### # finally we'll translate all &valid; HTML 2.0 entities ######################################################### s{ ( & # an entity starts with a semicolon ( \x23\d+ # and is either a pound (#) and numbers | # or else \w+ # has alphanumunders up to a semi ) ;? # a semi terminates AS DOES ANYTHING ELSE (XXX) ) } { $entity{$2} # if it's a known entity use that || # but otherwise $1 # leave what we'd found; NO WARNINGS (XXX) }gex; # execute replacement -- that's code not a string ######################################################### # but wait! load up the %entity mappings enwrapped in # a BEGIN that the last might be first, and only execute # once, since we're in a -p "loop"; awk is kinda nice after all. ######################################################### BEGIN { %entity = ( lt => '<', #a less-than gt => '>', #a greater-than amp => '&', #a nampersand quot => '"', #a (verticle) double-quote nbsp => chr 160, #no-break space iexcl => chr 161, #inverted exclamation mark cent => chr 162, #cent sign pound => chr 163, #pound sterling sign CURRENCY NOT WEIGHT curren => chr 164, #general currency sign yen => chr 165, #yen sign brvbar => chr 166, #broken (vertical) bar sect => chr 167, #section sign uml => chr 168, #umlaut (dieresis) copy => chr 169, #copyright sign ordf => chr 170, #ordinal indicator, feminine laquo => chr 171, #angle quotation mark, left not => chr 172, #not sign shy => chr 173, #soft hyphen reg => chr 174, #registered sign macr => chr 175, #macron deg => chr 176, #degree sign plusmn => chr 177, #plus-or-minus sign sup2 => chr 178, #superscript two sup3 => chr 179, #superscript three acute => chr 180, #acute accent micro => chr 181, #micro sign para => chr 182, #pilcrow (paragraph sign) middot => chr 183, #middle dot cedil => chr 184, #cedilla sup1 => chr 185, #superscript one ordm => chr 186, #ordinal indicator, masculine raquo => chr 187, #angle quotation mark, right frac14 => chr 188, #fraction one-quarter frac12 => chr 189, #fraction one-half frac34 => chr 190, #fraction three-quarters iquest => chr 191, #inverted question mark Agrave => chr 192, #capital A, grave accent Aacute => chr 193, #capital A, acute accent Acirc => chr 194, #capital A, circumflex accent Atilde => chr 195, #capital A, tilde Auml => chr 196, #capital A, dieresis or umlaut mark Aring => chr 197, #capital A, ring AElig => chr 198, #capital AE diphthong (ligature) Ccedil => chr 199, #capital C, cedilla Egrave => chr 200, #capital E, grave accent Eacute => chr 201, #capital E, acute accent Ecirc => chr 202, #capital E, circumflex accent Euml => chr 203, #capital E, dieresis or umlaut mark Igrave => chr 204, #capital I, grave accent Iacute => chr 205, #capital I, acute accent Icirc => chr 206, #capital I, circumflex accent Iuml => chr 207, #capital I, dieresis or umlaut mark ETH => chr 208, #capital Eth, Icelandic Ntilde => chr 209, #capital N, tilde Ograve => chr 210, #capital O, grave accent Oacute => chr 211, #capital O, acute accent Ocirc => chr 212, #capital O, circumflex accent Otilde => chr 213, #capital O, tilde Ouml => chr 214, #capital O, dieresis or umlaut mark times => chr 215, #multiply sign Oslash => chr 216, #capital O, slash Ugrave => chr 217, #capital U, grave accent Uacute => chr 218, #capital U, acute accent Ucirc => chr 219, #capital U, circumflex accent Uuml => chr 220, #capital U, dieresis or umlaut mark Yacute => chr 221, #capital Y, acute accent THORN => chr 222, #capital THORN, Icelandic szlig => chr 223, #small sharp s, German (sz ligature) agrave => chr 224, #small a, grave accent aacute => chr 225, #small a, acute accent acirc => chr 226, #small a, circumflex accent atilde => chr 227, #small a, tilde auml => chr 228, #small a, dieresis or umlaut mark aring => chr 229, #small a, ring aelig => chr 230, #small ae diphthong (ligature) ccedil => chr 231, #small c, cedilla egrave => chr 232, #small e, grave accent eacute => chr 233, #small e, acute accent ecirc => chr 234, #small e, circumflex accent euml => chr 235, #small e, dieresis or umlaut mark igrave => chr 236, #small i, grave accent iacute => chr 237, #small i, acute accent icirc => chr 238, #small i, circumflex accent iuml => chr 239, #small i, dieresis or umlaut mark eth => chr 240, #small eth, Icelandic ntilde => chr 241, #small n, tilde ograve => chr 242, #small o, grave accent oacute => chr 243, #small o, acute accent ocirc => chr 244, #small o, circumflex accent otilde => chr 245, #small o, tilde ouml => chr 246, #small o, dieresis or umlaut mark divide => chr 247, #divide sign oslash => chr 248, #small o, slash ugrave => chr 249, #small u, grave accent uacute => chr 250, #small u, acute accent ucirc => chr 251, #small u, circumflex accent uuml => chr 252, #small u, dieresis or umlaut mark yacute => chr 253, #small y, acute accent thorn => chr 254, #small thorn, Icelandic yuml => chr 255, #small y, dieresis or umlaut mark ); #################################################### # now fill in all the numbers to match themselves #################################################### for $chr ( 0 .. 255 ) { $entity{ '#' . $chr } = chr $chr; } } ######################################################### # premature finish lest someone clip my signature ######################################################### __END__ Tom Christiansen's Mox.Perl.COM Home Page

perl / magic / usenix / boulder


The word of the day is nidificate.
Testing: E Ê Ä

Perl

DOCTYPE START1 -- > END1 DOCTYPE START2 END2
A ship then new they built for him
of mithril and of elven glass...