Generated: Tue Feb 2 17:54:57 2010 from striphtml02.pl 2005/11/30 9.3 KB.
#!/Perl # AIM: To strip HTML from a file using HTML::Parser # some regex that is similar - # FROM : http://www.trilug.org/pipermail/trilug/Week-of-Mon-20040216/024049.html # Doing some experimentation, I see that perl is normally greedy, but # if you postpend a quantifier with ? it turns that off. So, this # should remove all html tags from a file: # perl -pi -e 's/<.*?>//g' [filename] # Unfortunately, the non-greedy operator -- the question mark, is not # standard to the C library regexp() call, which I'm using. However, the # following accomplishes something similar (my thanks to 'scalar' on IRC) : # s/<[^>]+>//g use HTML::Parser (); use Data::Dump (); use Carp; # use HiRes timer use Time::HiRes qw( usleep ualarm gettimeofday tv_interval nanosleep ); use IO::File; use Fcntl; # for mode constants my $definp = 'C:/HOMEPAGE/P26/browser1.htm'; ###my $definp = 'c:/HOMEPAGE/P26/perl.htm'; ###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Wednesday.htm"; ###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm"; ###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Russel/Russ-04.htm"; my $WHITE_PATTERN2 = "^[ \t\r\n]*\$"; # spacey if ($var =~ /$WHITE_PATTERN2/o ) { ...} my $defout = 'tempstrip.txt'; my $defstrip = 'tempout.txt'; my $defstrip2 = 'tempout2.txt'; my $verb2 = 0; my ($HO1, $HO2, $H03); my $dncr = 0; my @tagarr; ## tag array my $intable = 0; my $tnewline = 0; my $tnewhcol = 0; my $tnewcol = 0; my $colcount = 0; my $inbody = 0; my $inhead = 0; my $noheadout = 1; ### avoid HEAD tag text my $mxt = 60; ### 40 my $intd = 0; my $nointdcr = 1; my $t0 = [gettimeofday]; print "$0: Started on " . localtime(time()) . "\n"; ### . " in $cwdir ...\n" if $shwtm; open $HO1, ">$defout" or die "No output file ... [$defout]!\n"; open $HO2, ">$defstrip" or die "No output file ... [$defstrip]!\n"; ###open $HO3, ">$defstrip2" or die "No output file ... [$defstrip2]!\n"; ###close $H03; ###my $infile = shift || die "\nERROR: Must give an INPUT FILE ...\n"; my $infile = shift || $definp; ###$infile = $definp; print "Hello, World ... Strip HTML ... stripping $infile ..."; die "\nERROR: Can not locate the file [$infile]!\n" if (! -f $infile); prt ("From $infile ...\n"); my $p = HTML::Parser->new(api_version => 3); $p->handler(default => \&hand, "event, line, column, text, tagname, attr"); $p->parse_file($infile); close $HO1; close $HO2; print "$0: Ended on " . localtime(time()) . "\n"; ### . " in $cwdir ...\n" if $shwtm; $t1 = [gettimeofday]; $elapsed = tv_interval ( $t0, $t1 ); print "$0 ran for $elapsed seconds ...\n"; print "Check results in $defstrip ...\n"; #system ($defstrip); ### check out the RESULTS print "Check results in $defout ...\n"; #system ($defout); ### anaysis of data from parser ## now test the regex method my $text = read_file_f( $infile ) ; ###print $text; $text =~ s/<.*?>//g; ###$text =~ s/<[^>]+>//g; my $temp3 = 'temp005.txt'; ##open $H03, ">$temp3"; ##close $H03; write_file_f ( $temp3, $text ); ###print "<STRIPPED>\n$text\n</STRIPPED>\n"; ###my $jstrip = join('', $text); ##print "<JOINED>\n$jstrip\n</JOINED>\n"; ###open $HO3, ">$defstrip2" or carp "No output file ... [$defstrip2]! ... $!\n"; ###print $H03, $jstrip; ###print $H03, $text; ###close $H03; ###print "Check results in $defstrip2 ...\n"; $t2 = [gettimeofday]; $elap2 = tv_interval ( $t1, $t2 ); print "$0 ran for $elap2 seconds ...\n"; sub hand { my($event, $line, $column, $to, $tagname, $attr) = @_; my $typ = uc(substr($event,0,1)); ## get TYPE my $ll = length($to); my $msg = "$typ L$line C$column:"; my @d = $msg; push(@d, $to); push(@d, $tagname) if defined $tagname; push(@d, $attr) if $attr; my $asz = @d; ### get length of array my $np = 1; my $tag = '*NO_TAG*'; if (defined $tagname) { $tag = uc($tagname); } my $dtxt = Data::Dump::dump(@d); my $text = $to; my $txout = ''; ### start the final OUTPUT if ($typ eq 'S') { ## start of tag $msg .= " S-$tag"; if ($tag eq 'HEAD') { $inhead = 1; } elsif ($tag eq 'BODY') { $inbody = 1; } elsif ($tag eq 'TABLE') { $intable++; $tnewline = 0; $tnewhcol = 0; $tnewcol = 0; } elsif ($tag eq 'TR') { if ($colcount) { $txout .= "\n"; ## print $HO2 "\n"; $msg .= " Added <TR>NEW LINE!"; $colcount = 0; } else { $msg .= " Skipped <TR>NEW LINE!"; } $tnewline++; $tnewhcol = 0; $tnewcol = 0; } elsif ($tag eq 'TH') { $msg .= " Added <TH>SPACE!"; $txout .= " "; ### print $HO2 ' '; $tnewhcol++; } elsif ($tag eq 'TD') { $msg .= " Added <TD>SPACE! in td"; $txout .= ' '; ### print $HO2 ' '; $tnewcol++; $intd = 1; } else { $msg .= ' B tag with no case'; } } elsif ($typ eq 'E') { ## end tag $msg .= " E-$tag"; if ($tag eq 'HEAD') { $inhead = 0; } elsif ($tag eq 'BODY') { $inbody = 0; } elsif ($tag eq 'TABLE') { if ($intable) { $intable--; } } elsif ($tag eq 'TR') { $tnewline-- if $tnewline; } elsif ($tag eq 'TH') { $tnewhcol--; } elsif ($tag eq 'TD') { $tnewcol--; $intd = 0; } else { $msg .= ' E tag with no case'; } } elsif ($typ eq 'T') { ### text item if ($ll) { my $addtx = 0; $text =~ s/ / /g; ## get back spaces $text =~ s/</</g; ## get back less than $text =~ s/>/>/g; ## get back greater than $text =~ s/"/"/g; ## get back QUOTES ### note this LAST $text =~ s/&/&/g; ## get back ampersound if ($to ne $text) { my $l2 = length($text); $msg .= " *CHG* [$to]$ll to [$text]$l2 ! "; $ll = $l2; } my @sptxt = split (' ', $text); if ($text =~ $WHITE_PATTERN2) { my $iscr = 0; if ($text =~ /^[\r\n]*$/ ) { $msg .= "[all cr/lf stuff for $ll]"; $iscr = 1; } elsif ($text =~ /^ *$/ ) { $msg .= "[space for $ll]"; $txout .= $text; ### print $HO2 $text; $msg .= " Added <real>SPACE!"; $addtx = 1; $np = 0; } else { my $tt = $text; $tt =~ s/[\r\n]//g; ### kill the CR/LF $txout .= $tt; ### add this space??? $msg .= " [*CHK* mixed sp[$tt] + cr/lf for $ll]"; if ($intd && $nointdcr) { $msg .= 'intd cr avoided'; $iscr = 0; } else { $iscr = 1; } } if ($iscr) { if ($dncr >= 2) { $msg .= " *dup cr*"; ###return; } else { $msg .= " Added <newline>!"; $txout .= "\n"; ### print $HO2 "\n"; $np = 0; $dncr++; $colcount = 0; } } } else { if (( $ll > 4 ) && (( substr ($text, 0, 4) eq '<!--' )|| ( substr ($sptxt[0], 0, 4) eq '<!--' ))) { substr($text, $mxt) = "..." if length($text) > $mxt; ### limit, to say 40 $msg .= "[" . $text . "]"; $msg .= "html comment $msg *end c*"; } else { $dncr = 0; $txout .= $text; ### print $HO2 $text; $colcount += $ll; substr($text, $mxt) = "..." if length($text) > $mxt; $msg .= "[" . $text . "]ADDED"; $np = 0; } } } else { $msg .= " [$text] SKIPPED"; } } else { if (length($text)) { $msg .= '[' . $text . ']'; } else { $msg .= '[empty]'; } } ###if ($np) { my $omsg = $msg; prt ($dtxt . "\n"); if (length ($txout)) { ###if ($tag eq 'HEAD') { $inhead = 1; if ($inhead && $noheadout) { ### no output $omsg = "-=> $msg [SKIP-IN-HEAD]" } else { print $HO2 $txout; $omsg = "$msg (written)"; } } else { $omsg = "--> $msg (no out text length)"; } prt ($omsg . "\n"); } sub prt { my ($t) = @_; print $t if ($verb2); print $HO1 $t; } sub read_file_f { my( $file_name, %args ) = @_ ; my $buf ; my $buf_ref = $args{'buf_ref'} || \$buf ; my $mode = O_RDONLY ; $mode |= O_BINARY if $args{'binmode'} ; local( *FH ) ; sysopen( FH, $file_name, $mode ) or carp "Can't open $file_name: $!" ; my $size_left = -s FH ; while( $size_left > 0 ) { my $read_cnt = sysread( FH, ${$buf_ref}, $size_left, length ${$buf_ref} ) ; unless( $read_cnt ) { carp "read error in file $file_name: $!" ; last ; } $size_left -= $read_cnt ; } # handle void context (return scalar by buffer reference) return unless defined wantarray ; # handle list context return split m|?<$/|g, ${$buf_ref} if wantarray ; # handle scalar context return ${$buf_ref} ; } sub write_file_f { my $file_name = shift ; my $args = ( ref $_[0] eq 'HASH' ) ? shift : {} ; my $buf = join '', @_ ; ###my $mode = O_WRONLY ; my $mode = O_WRONLY | O_CREAT; $mode |= O_BINARY if $args->{'binmode'} ; $mode |= O_APPEND if $args->{'append'} ; local( *FH ) ; sysopen( FH, $file_name, $mode ) or carp "Can't open $file_name: $!" ; my $size_left = length( $buf ) ; my $offset = 0 ; while( $size_left > 0 ) { my $write_cnt = syswrite( FH, $buf, $size_left, $offset ) ; unless( $write_cnt ) { carp "write error in file $file_name: $!" ; last ; } $size_left -= $write_cnt ; $offset += $write_cnt ; } return ; } # EOF