Generated: Tue Feb 2 17:54:42 2010 from html02.pl 2006/10/26 8.1 KB.
#!/Perl -w # html02.pl # AIM: Use HTML::Parser to CLEAN a WORD HTML output # This is an EVENT parser, so a sub is provided to be # called on each 'event' ... # ################################################## use HTML::Parser (); use Data::Dump (); my ($FH, $HH); my $instyle = 0; my $WHITE_PATTERN2 = "^[ \t\n\r]*\$"; # spacey if ($var ~= /$WHITE_PATTERN2/o ) { ...} my $stripC = 1; my $def_folder = 'C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/'; my $definp = $def_folder . 'moon-01.htm'; my $defout = $def_folder . "tempmoon.htm"; my $inpfil; my $inhtml = 0; # in document my $inpara = 0; # in paragraph tag my $inhead = 0; # processing header my $inbody = 0; # body processing my $instyle = 0; # style processing my $clearhtml = 1; # clear HTML attributes my $clearop = 1; # clear MS o:p paragraph thingy my $clearpatts = 1; # clear paragraph attributes my $clearhstyl = 1; # no SYTLE statment in head - include through file, if required my $clearbsyle = 1; # no BODY attributes my $cleartdsty = 1; # clear TD attributes my $fiximg = 1; # modify the IMG tag my $clearhlink = 1; # clear a LINK REL statement my $clearspan = 1; # remove all SPAN tags my $cleardiv = 1; # remove all DIV tags my $clearmeta = 1; # remove META (head) tag my $doout = 1; # do the OUTPUT, but can be off'ed ... open $HH, ">$defout" or die "No HTML file ... [$defout]!\n"; open $FH, ">temphtml.txt" or die "No OUT file ...\n"; open $CH, ">tempskip.txt" or die "No SKIP file ...\n"; ## Event table ## ["S", $tag, $attr, $attrseq, $text] ## ["E", $tag, $text] ## ["T", $text, $is_data] ## ["C", $text] ## ["D", $text] ## ["PI", $token0, $text] sub addh { local ($typ,$txt,$atr) = @_; my $outtext = $txt; my $logtext = ''; my $outon = 1; $outtext .= "\n"; if ($txt =~ /$WHITE_PATTERN2/o) { # skip just space $logtext .= "WT[$txt]\n"; } else { $logtext .= "TT[$txt]\n"; } if ($atr) { local ($k, $v, $i); $logtext .= "Showing attributes ... \n"; # print $FH %$atr, "\n"; $i = 0; #while (($key, $value) = each %atr) { # print $FH $key, "\n"; # # delete $hash{$key}; # This is safe #} #foreach $k (keys %atr) { # $i++; # print $FH "$i [$k] = [$atr{$k}]\n"; #} #foreach $k (keys %atr) { # $i++; # $v = $atr{$k}; # #print $FH "$i $k = $atr{$k}\n"; # print $FH "$i $k = $v\n"; #} while (($k,$v) = each %$atr){ $i++; # print $FH "$i key[$k] = val[$v]\n"; #print $FH "$i $k=$v\n"; $logtext .= "$i $k=$v\n"; } if ($i == 0) { $logtext .= "NULL LIST!\n"; } } else { $logtext .= "NO attributes ... \n"; } if ($typ eq 'C') { if ($stripC) { $outon = 0; # switch OFF output } } if ($outon) { print $HH $outtext; print $FH $logtext; } else { print $CH $outtext; print $CH $logtext; } } sub h { my($event, $line, $column, $text, $tagname, $attr) = @_; my $typ = uc(substr($event,0,1)); ## get TYPE my @d = "$typ L$line C$column"; #substr($text, 40) = "..." if length($text) > 40; push(@d, $text); push(@d, $tagname) if defined $tagname; push(@d, $attr) if $attr; my $otxt = Data::Dump::dump(@d); #print $FH Data::Dump::dump(@d), "\n"; #print Data::Dump::dump(@d), "\n"; print "$otxt\n"; # now process the data ... my $tag = '*NO_TAG*'; my $locout = 1; # one time only output flag my $i; if (defined $tagname) { $tag = uc($tagname); } # Event table ######################################################################## if ($typ eq 'S') { ## ["S", $tag, $attr, $attrseq, $text] if ($tag eq 'HTML') { $inhtml = 1; if ($clearhtml) { $text = '<html>'; } } elsif ($tag eq 'P') { $inpara = 1; if ($clearpatts) { print $FH "Paragraph from $text to <p> ...\n"; $text = "<p>"; } } elsif ($tag eq 'HEAD') { $inhead = 1; } elsif ($tag eq 'BODY') { $inbody = 1; if ($clearbsyle) { # no BODY attributes $text = '<BODY>'; } } elsif ($tag eq 'STYLE') { $instyle =1; if ($clearhstyl) { # in head - close out S style to E sytle if ($inhead) { $doout = 0; # CLOSE output } } } elsif ($tag eq 'SPAN') { if ($clearspan) { # remove all SPAN tags $locout = 0; } } elsif ($tag eq 'O:P') { if ($clearop) { # clear MS o:p paragraph thingy $locout = 0; } } elsif ($tag eq 'LINK') { if ($clearhlink) { $locout = 0; } } elsif ($tag eq 'DIV') { if ($cleardiv) { $locout = 0; } } elsif ($tag eq 'TD') { if ($cleartdsty) { $text = '<td>'; } } elsif ($tag eq 'IMG') { if ($fiximg) { # modify the IMG tag my %att = %$attr; # copy the HASH, to do modifications $i = 0; foreach $key (keys %att) { if ($key eq 'v:shapes') { print $FH "Deleting attrib $key ...\n"; delete $att{$key}; # remove this MS reference $i++; } } if ($i) { $text = '<IMG'; # start IMG tag again while (($key,$value) = each %att) { $text .= " $key=$value"; } $text .= '>'; # close IMG tag } } } elsif ($tag eq 'META') { if ($clearmeta) { # remove META (head) tag $locout = 0; } } ######################################################################## } elsif ($typ eq 'E') { ## ["E", $tag, $text] if ($tag eq 'HTML') { $inhtml = 0; } elsif ($tag eq 'P') { $inpara = 0; } elsif ($tag eq 'HEAD') { $inhead = 0; } elsif ($tag eq 'BODY') { $inbody = 0; } elsif ($tag eq 'STYLE') { $instyle = 0; if ($clearhstyl) { # in head - close out S style to E sytle if ($inhead) { $doout = 1; # OPEN output $locout = 0; # but NOT for this style one } } } elsif ($tag eq 'SPAN') { if ($clearspan) { # remove all SPAN tags $locout = 0; } } elsif ($tag eq 'O:P') { if ($clearop) { # clear MS o:p paragraph thingy $locout = 0; } } elsif ($tag eq 'LINK') { if ($clearhlink) { $locout = 0; } } elsif ($tag eq 'DIV') { if ($cleardiv) { $locout = 0; } } } elsif ($typ eq 'T') { ## ["T", $text, $is_data] } elsif ($typ eq 'C') { ## ["C", $text] } elsif ($typ eq 'D') { ## ["D", $text] } elsif ($typ eq 'P') { ## ["PI", $token0, $text] } ### end event table ########################################################### if ($text =~ /$WHITE_PATTERN2/o) { print $CH "ws[$otxt]\n"; print $CH "ws[$text]\n"; } else { if ($typ eq 'C') { print $CH "cd[$otxt]\n"; print $CH "cd[$text]\n"; } else { if ($doout && $locout) { print $FH "$otxt\n"; print $HH "$text\n"; } else { print $CH "$otxt\n"; print $CH "$text\n"; } } } } sub h2_NOT_USED { my($event, $line, $column, $text, $tagname, $attr) = @_; my $typ = uc(substr($event,0,1)); ## get TYPE my @d = "$typ L$line C$column"; #substr($text, 40) = "..." if length($text) > 40; push(@d, $text); push(@d, $tagname) if defined $tagname; push(@d, $attr) if $attr; if ($typ ne "XXX") { print $FH Data::Dump::dump(@d), "\n"; } print Data::Dump::dump(@d), "\n"; my $msg = "$typ = "; my $doadd = 0; my $tag; if (defined $tagname) { $tag = $tagname; $msg .= "<$tagname> "; $doadd++; } else { $tag = "*NO_TAG*"; $msg .= "*NO TAG* "; } if ($text =~ /$WHITE_PATTERN2/o) { $msg .= "WHITE TEXT ONLY "; } else { $msg .= "with Text "; $doadd++; } if ($attr) { $msg .= "and attributes."; $doadd++; } else { $msg .= "NO ATTRIBUTES."; } print $FH $msg, "\n"; if ($doadd) { addh($typ, $text, $attr); } else { print $FH "Skipped ... " . @d . "\n"; } } my $p = HTML::Parser->new(api_version => 3); $p->handler(default => \&h, "event, line, column, text, tagname, attr"); # $p->parse_file(@ARGV ? shift : die "No input given ....\n"); $inpfil = $definp; $p->parse_file($inpfil); close $FH; # log file output close $HH; # ouput HTML file close $CH; # log of discarded items system $defout; # run the HTML file # eof - html02.pl