Generated: Tue Feb 2 17:54:57 2010 from stripms3.pl 2005/05/04 8.6 KB.
#!/Perl use HTML::Parser (); use Data::Dump (); my $program = "stripms"; ## user feature variables ## my $dodebug = 1; ##my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm"; my $definp = "C:/HOMEPAGE/P26/compgr.htm"; my $defout = "C:/HOMEPAGE/P26/temphtml.htm"; my $deflog = "temphtml.txt"; # output log file ... more if $dodebug = 1! my $defskip = "tempskip.txt"; # view what has been REJECTED, DELETED, CHOPPED my $WEBVERS = "P26.2005.05.04"; my $addcode = "<!-- $WEBVERS - geoffmclane.com - stripms.pl -->"; my $clearhtml = 1; # clear HTML attributes my $clearop = 1; # clear MS o:p paragraph thingy my $clearpatts = 1; # clear paragraph attributes my $clearhstyl = 1; # no SYTLE statment in head - include through file, if required ... *TBD* my $cleartdsty = 0; # clear TD attributes my $fiximg = 1; # modify the IMG tag my $clearhlink = 1; # clear a LINK REL statement my $clearspan = 1; # remove all SPAN tags my $cleardiv = 1; # remove all DIV tags my $clearmeta = 1; # remove META (head) tag ## BODY actions my $clearbsyle = 0; # no BODY attributes my $fixblstyle = 1; # modify body language, if given my $deflang = 'EN-AU'; # use English (Austrlian) ## program variables ## my $WHITE_PATTERN2 = "^[ \t\n\r]*\$"; # spacey if ($var ~= /$WHITE_PATTERN2/o ) { ...} my ($FH, $HH, $CH); # run log, html and strip log ... my $doout = 1; # do the OUTPUT, but can be off'ed ... my $inpfil = ""; my $subok = 0; my $inhtml = 0; # in document my $inpara = 0; # in paragraph tag my $inhead = 0; # processing header my $inbody = 0; # body processing my $instyle = 0; # style processing my $start_time = time(); open $HH, ">$defout" or die "No HTML output file ... [$defout]!\n"; open $FH, ">$deflog" or die "No OUT LOG file ...\n"; open $CH, ">$defskip" or die "No SKIP file ...\n"; my $p = HTML::Parser->new(api_version => 3); $p->handler(default => \&hand, "event, line, column, text, tagname, attr"); # $p->parse_file(@ARGV ? shift : die "No input given ....\n"); parse_args(@ARGV); # if we did NOT get an INPUT file, what to DO ... if ( !length($inpfil) ) { if ($dodebug) { $inpfil = $definp; } else { die "No input file given ...\n"; } } print "$program: Started on " . localtime($start_time) ; ### . " in $cwdir ...\n" if $shwtm; $p->parse_file($inpfil); close $FH; # log file output close $HH; # ouput HTML file close $CH; # log of discarded items system $defout; # run the HTML file if ($subok) { open $HH, "<$defout" or die "No HTML input file ... [$defout]!\n"; open $FH, "<$inpfil" or die "No re-open of the source ... [$inpfil]!\n"; my @infil = <$HH>; # slurp the file, from the disk my @outfil = <$FH>; # slurp the original source close $FH; # log file output close $HH; # ouput HTML file open $HH, ">$defout" or die "No HTML input file ... [$defout]!\n"; open $FH, ">$inpfil" or die "No re-open of the source ... [$inpfil]!\n"; print $HH @outfil; print $FH @infil; close $FH; # log file output close $HH; # ouput HTML file } ## Event table ## ["S", $tag, $attr, $attrseq, $text] ## ["E", $tag, $text] ## ["T", $text, $is_data] ## ["C", $text] ## ["D", $text] ## ["PI", $token0, $text] sub hand { my($event, $line, $column, $text, $tagname, $attr) = @_; my $typ = uc(substr($event,0,1)); ## get TYPE my @d = "$typ L$line C$column"; #substr($text, 40) = "..." if length($text) > 40; push(@d, $text); push(@d, $tagname) if defined $tagname; push(@d, $attr) if $attr; my $otxt = Data::Dump::dump(@d); #print $FH Data::Dump::dump(@d), "\n"; #print Data::Dump::dump(@d), "\n"; print "$otxt\n"; # now process the data ... my $locout = 1; # one time only output flag my $i; my $tag = '*NO_TAG*'; if (defined $tagname) { $tag = uc($tagname); } # Event table ######################################################################## if ($typ eq 'S') { # START OF TAG, and possible ATTRIBUTES ## ["S", $tag, $attr, $attrseq, $text] if ($tag eq 'HTML') { $inhtml = 1; if ($clearhtml) { $text = '<html>'; } if (defined $addcode) { ## = "<!-- $WEBVERS - geoffmclane.com - stripms.pl -->"; $text .= "\n"; $text .= "$addcode"; } } elsif ($tag eq 'P') { $inpara = 1; if ($clearpatts) { print $FH "Paragraph from $text to <p> ...\n"; $text = "<p>"; } } elsif ($tag eq 'HEAD') { $inhead = 1; } elsif ($tag eq 'BODY') { $inbody = 1; if ($clearbsyle) { # no BODY attributes $text = '<body>'; } elsif ($fixblstyle) { # modify body language, if given # use my $deflang = 'EN-AU'; # use English (Austrlian) my %att = %$attr; # copy the HASH, to do modifications $i = 0; prt( "Checking BODY attrib ...\n" ); foreach $key (keys %att) { prt ( "Checking attrib $key ...\n" ); if ($key eq 'lang') { prt ( "Found lang=$key ...\n" ); if ($att{$key} ne $deflang) { prt ( "Modifying 'lang=$att{$key} to [$deflang] ...\n" ); $att{$key} = $deflang; } $i++; } } if ($i) { # ok, change output, re-run to build new HTML $text = "<$tag"; # start tag again while (($key,$value) = each %att) { $text .= " $key=$value"; } $text .= '>'; # close tag } } } elsif ($tag eq 'STYLE') { $instyle =1; if ($clearhstyl) { # in head - close out S style to E sytle if ($inhead) { $doout = 0; # CLOSE output } } } elsif ($tag eq 'SPAN') { if ($clearspan) { # remove all SPAN tags $locout = 0; } } elsif ($tag eq 'O:P') { if ($clearop) { # clear MS o:p paragraph thingy $locout = 0; } } elsif ($tag eq 'LINK') { if ($clearhlink) { $locout = 0; } } elsif ($tag eq 'DIV') { if ($cleardiv) { $locout = 0; } } elsif ($tag eq 'TD') { if ($cleartdsty) { $text = '<td>'; } } elsif ($tag eq 'IMG') { if ($fiximg) { # modify the IMG tag my %att = %$attr; # copy the HASH, to do modifications $i = 0; foreach $key (keys %att) { if ($key eq 'v:shapes') { print $FH "Deleting attrib $key ...\n"; delete $att{$key}; # remove this MS reference $i++; } } if ($i) { $text = '<IMG'; # start IMG tag again while (($key,$value) = each %att) { $text .= " $key=$value"; } $text .= '>'; # close IMG tag } } } elsif ($tag eq 'META') { if ($clearmeta) { # remove META (head) tag $locout = 0; } } ######################################################################## } elsif ($typ eq 'E') { ## ["E", $tag, $text] if ($tag eq 'HTML') { $inhtml = 0; } elsif ($tag eq 'P') { $inpara = 0; } elsif ($tag eq 'HEAD') { $inhead = 0; } elsif ($tag eq 'BODY') { $inbody = 0; } elsif ($tag eq 'STYLE') { $instyle = 0; if ($clearhstyl) { # in head - close out S style to E sytle if ($inhead) { $doout = 1; # OPEN output $locout = 0; # but NOT for this style one } } } elsif ($tag eq 'SPAN') { if ($clearspan) { # remove all SPAN tags $locout = 0; } } elsif ($tag eq 'O:P') { if ($clearop) { # clear MS o:p paragraph thingy $locout = 0; } } elsif ($tag eq 'LINK') { if ($clearhlink) { $locout = 0; } } elsif ($tag eq 'DIV') { if ($cleardiv) { $locout = 0; } } } elsif ($typ eq 'T') { ## ["T", $text, $is_data] } elsif ($typ eq 'C') { ## ["C", $text] $locout = 0; # toss all CODE } elsif ($typ eq 'D') { ## ["D", $text] } elsif ($typ eq 'P') { ## ["PI", $token0, $text] } ### end event table ########################################################### if ($text =~ /$WHITE_PATTERN2/o) { print $CH "ws[$otxt]\n"; print $CH "ws[$text]\n"; } else { if ($doout && $locout) { print $FH "$otxt\n"; print $HH "$text\n"; } else { print $CH "$otxt\n"; print $CH "$text\n"; } } } sub parse_args { my (@av) = @_; # get stack while (@av) { my $arg = uc($av[0]); if ($arg =~ /^-/) { if ($arg eq '-V') { print "Version: 0.0.1 - May 2005\n"; } elsif (($arg eq '-H') || ($arg eq '-?')) { die "stripms infile [options]\n"; } else { die "ERROR: Unknown option [$arg]\n"; } } else { if (length($inpfil)) { die "ERROR: Can not handle two input files ...\n"; } $inpfil = $arg; if ( !(-f $inpfil) ) { die "ERROR: Can NOT locate file [$inpfil] ...\n"; } } shift @av; } } sub prt { if ($dodebug) { print $FH @_; } } # EOF