Generated: Tue Jun 8 17:27:00 2010 from htmltools.pl 2010/03/31 43.5 KB.
#!/Perl # NAME: htmltools.pl # AIM: HTML tools - utility functions - 2006-08-26 # to include this, must declare @hrefs and @imgs if # 20100312 - collecthrefs - remove trailing '"', if present # collecthrefs( $txt, 1 ) or collectimgs( $txt, 1 ) resp. called # set added 2009-06-24 # inline_clean_paras($) inline_clean_td($) remove_empty_paras($) remove_doctype($) # and clean_line_with_wrap($) - and added $htmldbg8 to kill debug outputs my $htmtdbg1 = 0; my $htmtdbg2 = 0; my $htmtdbg3 = 0; my $htmtdbg4 = 0; my $htmtdbg5 = 0; my $htmtdbg6 = 0; my $htmtdbg7 = 0; # show acquired <body background="something"...> my $htmldbg8 = 0; # remove some debug from 2009-06-24 set added sub set_htools_dbg($) { my ($val) = @_; $htmtdbg1 = $val; $htmtdbg2 = $val; $htmtdbg3 = $val; $htmtdbg4 = $val; $htmtdbg5 = $val; $htmtdbg6 = $val; $htmtdbg7 = $val; $htmldbg8 = $val; } sub set_htools_dbg_on() { set_htools_dbg(1); } sub set_htools_dbg_off() { set_htools_dbg(0); } my @tools_htm = (); sub trimbothends { my ($txt) = shift; while ($txt =~ /^\s/) { $txt = substr($txt,1); } while ($txt =~ /\s$/) { $txt = substr($txt,0,length($txt)-1); } return $txt; } sub tag2newline { # ($txt2,'td'); my ($txt, $tag) = @_; my $len = length($txt); my $ntxt = ''; my $i; my $ch = ''; my $ft = ''; my $lcnt = 0; for ($i = 0; $i < $len; $i++ ) { $ch = substr($txt,$i,1); if ($lcnt && ($ch eq '<')) { $ft = $ch; $i++; for ( ; $i < $len; $i++ ) { $ch = substr($txt,$i,1); $ft .= $ch; if ($ch eq '>') { if ($ft =~ /^<$tag/i) { $ft = "\n".$ft; } last; } } $ntxt .= $ft; } else { $ntxt .= $ch; if ($ch eq "\n") { $lcnt = 0; } else { $lcnt++; } } } return $ntxt; } sub comments2newline($) { # ($txt2); my ($txt) = shift; my $len = length($txt); my $ntxt = ''; my $i; my $ch = ''; my $ft = ''; my $lcnt = 0; for ($i = 0; $i < $len; $i++ ) { $ch = substr($txt,$i,1); if ($lcnt && ($ch eq '<')) { $ft = $ch; # set start $i++; if ($i < $len) { $ch = substr($txt,$i,1); $ft .= $ch; #if (($ct eq '!')||($ch eq '?')) { if ($ch eq '!') { $ft = "\n".$ft; } } $ntxt .= $ft; } else { $ntxt .= $ch; if ($ch eq "\n") { $lcnt = 0; } else { $lcnt++; } } } return $ntxt; } # strip a tag completely ... # from <tag. ... to ... </tag> sub striptag { my ($txt, $tag) = @_; my $len = length($txt); my $ntxt = ''; my $ch = ''; my $ftag = ''; my $nline = ''; my $i = 0; my $intag = 0; ###prt("Processing $len chars for $tag ...\n"); for ( ; $i < $len; $i++) { $ch = substr($txt, $i, 1); if ($intag) { if ($ch eq "<") { ###prt("Got begin < ...\n"); $i++; $ftag = ''; for ( ; $i < $len; $i++ ) { $ch = substr($txt, $i, 1); if ($ch eq '>') { last; } else { $ftag .= $ch; } } ###prt("Got tag [$ftag] ...\n"); ###if (lc($ftag) eq lc($tag)) { if (lc(substr($ftag,1)) eq lc($tag)) { $intag = 0; } } } else { if ($ch eq "<") { ###prt("Got begin < ...\n"); $i++; $ftag = ''; for ( ; $i < $len; $i++ ) { $ch = substr($txt, $i, 1); if (($ch eq '>')||($ch eq ' ')) { last; } else { $ftag .= $ch; } } ###prt("Got tag [$ftag] ...\n"); if (lc($ftag) eq lc($tag)) { if ($ch eq ' ') { $i++; for ( ; $i < $len; $i++ ) { $ch = substr($txt, $i, 1); if ($ch eq '>') { last; } } } $intag = 1; } else { $ntxt .= '<'.$ftag.$ch; } } else { $ntxt .= $ch; } } } return $ntxt; } sub return_tag { my ($txt, $tag) = @_; my $len = length($txt); my $ntxt = ''; my $ch = ''; my $ftag = ''; my $nline = ''; my $i = 0; my $intag = 0; ###prt("Processing $len chars for $tag ...\n"); for ( ; $i < $len; $i++) { $ch = substr($txt, $i, 1); if ($intag) { if ($ch eq "<") { ###prt("Got begin < ...\n"); $i++; $ftag = ''; for ( ; $i < $len; $i++ ) { $ch = substr($txt, $i, 1); if ($ch eq '>') { last; } $ftag .= $ch; } ###prt("Got tag [$ftag] ...\n"); ###if (lc($ftag) eq lc($tag)) { if (lc(substr($ftag,1)) eq lc($tag)) { $intag = 0; return $ntxt; } $ntxt = ''; $ch = ''; } $ntxt .= $ch; } else { if ($ch eq "<") { ###prt("Got begin < ...\n"); $i++; $ftag = ''; for ( ; $i < $len; $i++ ) { $ch = substr($txt, $i, 1); if (($ch eq '>')||($ch eq ' ')||($ch =~ /\s/)) { last; } $ftag .= $ch; } ###prt("Got tag [$ftag] ...\n"); if (lc($ftag) eq lc($tag)) { if (($ch eq ' ')||($ch =~ /\s/)) { $i++; for ( ; $i < $len; $i++ ) { $ch = substr($txt, $i, 1); if ($ch eq '>') { last; } } } $intag = 1; } } } } return $ntxt; } sub dropcomments { # strip_comments - strip comments - comment strip my ($txt) = shift; my $ntxt = ''; my $len = length($txt); my $ch = ''; my $pch1 = ''; my $pch2 = ''; my $i = 0; for ($i = 0; $i < $len; $i++) { $ch = substr($txt, $i, 1); if ($ch eq '<') { if ((($i + 3) < $len)&& (substr($txt, $i+1, 3) eq '!--')) { $i += 2; $pch1 = ''; $pch2 = ''; for ( ; $i < $len; $i++) { $ch = substr($txt, $i, 1); if (($ch eq '>')&&($pch1 eq '-')&&($pch2 eq '-')) { last; } $pch2 = $pch1; $pch1 = $ch; } } else { $ntxt .= $ch; } } else { $ntxt .= $ch; } } return $ntxt; } sub dropcomments_from_array { my (@arr) = @_; my $txt = ''; foreach my $ln (@arr) { chomp $ln; $txt .= ' {=*==*=} ' if (length($txt)); $txt .= $ln; } $txt = dropcomments( $txt ); @arr = split( / \{=\*==\*=\} /, $txt ); return @arr; } # Collect HREF anchors from a TEXT stream # 25/07/2007 - Skip over comments <!-- to --> sub collecthrefs { my ($txt,$del) = @_; my $ntxt = ''; my $len = length($txt); my $ch = ''; my $hrf = ''; my $i; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $hrf = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; # 25/07/2007 watch OUT for COMMENTS - skip these if ($ch eq '-') { if ($hrf eq '<!--') { # we have START of a COMMENT - YUK!!! $i++; # move to NEXT for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; if ($ch eq '>') { if ($hrf =~ /-->$/) { last; } } } } } if ($ch eq '>') { last; } } if ($hrf =~ /^<a\s/i) { if ($del == 0) { $ntxt .= $hrf; # no delete - add the text } prt("Got anchor [$hrf] ...\n") if ($htmtdbg3); #if ($hrf =~ /href=["']*(\S+)["']?./im) { # 20100312 - allow spaces before inverted commas if ($hrf =~ /href=\s*["']*(\S+)["']?./im) { $hrf = $1; $hrf =~ s/"$//; # 20100312 - remove trailing '"' push(@hrefs,$hrf); push(@tools_htm,$hrf); prt("Got [$hrf] ...\n") if ($htmtdbg2); } } elsif ($hrf =~ /^<\/a>$/i) { if ($del == 0) { $ntxt .= $hrf; } } else { $ntxt .= $hrf; } } else { $ntxt .= $ch; } } prt( "Collected ". scalar @hrefs . " HREF ...\n" ) if ($htmtdbg2); return $ntxt; } # Collect HREF anchors from a TEXT stream # 25/07/2007 - Skip over comments <!-- to --> sub collect_hrefs { my ($txt) = shift; my $len = length($txt); my $ch = ''; my $hrf = ''; my @hrarr = (); my $i; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $hrf = $ch; # start a tag $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; # 25/07/2007 watch OUT for COMMENTS - skip these if ($ch eq '-') { if ($hrf eq '<!--') { # we have START of a COMMENT - YUK!!! $i++; # move to NEXT for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; if ($ch eq '>') { if ($hrf =~ /-->$/) { last; } } } } } if ($ch eq '>') { last; } } if ($hrf =~ /^<a\s/i) { prt("Got anchor [$hrf] ...\n") if ($htmtdbg3); if ($hrf =~ /href=(["']?\S+["']?)./im) { $hrf = $1; $hrf =~ s/"//g; $hrf =~ s/'//g; push(@hrarr,$hrf); prt("Got [$hrf] ...\n") if ($htmtdbg2); } } } } prt( "Collected ". scalar @hrarr . " HREF ...\n" ) if ($htmtdbg2); return @hrarr; } sub collecthrefs_nearly_ok { my ($txt,$del) = @_; my $ntxt = ''; my $len = length($txt); my $ch = ''; my $hrf = ''; my $i; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $hrf = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; if ($ch eq '>') { last; } } if ($hrf =~ /^<a\s/i) { if ($del == 0) { $ntxt .= $hrf; # no delete - add the text } prt("Got anchor [$hrf] ...\n") if ($htmtdbg3); if ($hrf =~ /href=["'](\S+)["']./i) { $hrf = $1; push(@hrefs,$hrf); push(@tools_htm,$hrf); prt("Got [$hrf] ...\n") if ($htmtdbg2); } } elsif ($hrf =~ /^<\/a>$/i) { if ($del == 0) { $ntxt .= $hrf; } } else { $ntxt .= $hrf; } } else { $ntxt .= $ch; } } prt( "Collected ". scalar @hrefs . " HREF ...\n" ) if ($htmtdbg2); return $ntxt; } sub collect_anchors { my ($txt,$del) = @_; my $ntxt = ''; my $len = length($txt); my $ch = ''; my $hrf = ''; my $i; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $hrf = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; # 25/07/2007 watch OUT for COMMENTS - skip these if ($ch eq '-') { if ($hrf eq '<!--') { # we have START of a COMMENT - YUK!!! $i++; # move to NEXT for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; if ($ch eq '>') { if ($hrf =~ /-->$/) { last; } } } } } if ($ch eq '>') { last; } } if ($hrf =~ /^<a\s/i) { prt("Got anchor [$hrf] ...\n") if ($htmtdbg3); push(@hrefs,$hrf); push(@tools_htm,$hrf); if ($del == 0) { $ntxt .= $hrf; # no delete - add the text } } elsif ($hrf =~ /^<\/a>$/i) { if ($del == 0) { $ntxt .= $hrf; } } else { $ntxt .= $hrf; } } else { $ntxt .= $ch; } } prt( "Collected ". scalar @tools_htm . " anchors ...\n" ) if ($htmtdbg2); return $ntxt; } sub ret_anchor_array { my ($txt) = shift; @tools_htm = (); collect_anchors( $txt, 0 ); return @tools_htm; } sub ret_hrefs_array { my ($txt) = shift; @tools_htm = (); collecthrefs( $txt, 0 ); return @tools_htm; } sub collectimgs { my ($txt,$del) = @_; my $ntxt = ''; my $len = length($txt); my $ch = ''; my $hrf = ''; my $i; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $hrf = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; if ($ch eq '>') { last; } } if ($hrf =~ /^<img\s+/i) { if ($del == 0) { $ntxt .= $hrf; } prt("Got [$hrf] ...\n") if ($htmtdbg3); if ($hrf =~ /src=["']*(\S+)['"]*.*/i) { $hrf = $1; $hrf =~ s/['"]$//; push(@imgs,$hrf); prt("Got [$hrf] ...\n") if ($htmtdbg2); } } else { $ntxt .= $hrf; } } else { $ntxt .= $ch; } } return $ntxt; } # return image array # BUT NOT ONLY # <img border="0" src="[images/construc.gif]" width="87" height="87" alt="under construction"> # BUT ALSO # <link rel="stylesheet" type="text/css" href="[home.css]"> # AND # <link rel="shortcut icon" href="[favicon.ico]"> # AND # <script language="JavaScript" type="text/javascript" src="[fgtoc.js]"> # AND # <body background="[clds4.jpg]" ...> # AND # <applet code="[TimerClass.class]" # width="90" # height="20"> sub ret_imgs_array { my ($txt) = shift; my @ims = (); my $len = length($txt); my $ch = ''; my $hrf = ''; my $i; my $lnum = 1; my $cnum = 0; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); $cnum++; if ($ch eq '<') { ### prt( "htmltools:$lnum:$cnum: Start TAG ...\n" ) if ($htmtdbg6); $hrf = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $cnum++; if ($ch eq "\n") { $hrf .= ' '; $lnum++; $cnum = 0; } else { $hrf .= $ch; } if ($ch eq '>') { last; } elsif ($ch eq '-') { if ($hrf eq '<!--') { prt( "htmltools:$lnum:$cnum: Entered a COMMENT - get to comment end ...\n" ) if ($htmtdbg6); $i++; $hrf = ''; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $cnum++; if ($ch eq "\n") { $hrf = ''; $lnum++; $cnum = 0; } else { $hrf .= $ch; } if ($ch eq '>') { if ($hrf =~ /-->$/) { prt( "htmltools:$lnum:$cnum: End COMMENT ...[$hrf]\n" ) if ($htmtdbg6); $hrf = ''; last; # out of inner inner } } } $hrf = ''; last; # out of inner } } } ### prt( "htmltools:$lnum:$cnum: [$hrf]\n" ); if ($hrf =~ /^<img\s+/i) { prt("htmltools:$lnum:$cnum: Got [$hrf] ...\n") if ($htmtdbg3); if ($hrf =~ /src=\s*["']*(\S+)['"]*.*/i) { $hrf = $1; $hrf =~ s/['"]$//; push(@ims,$hrf); prt("htmltools:$lnum:$cnum: Got IMG SRC [$hrf] ...\n") if ($htmtdbg4); } else { prt( "WARNING: htmltools:$lnum:$cnum: IMG sans source [$hrf]\n" ); } } elsif ($hrf =~ /<link\s+/i) { ###if ($hrf =~ /href=["']*([\w\.]+)['"]*.*/i) { if ($hrf =~ /href=["']*(\S+)['"]*.*/i) { $hrf = $1; $hrf =~ s/>$//; $hrf =~ s/['"]$//; push(@ims,$hrf); prt("htmltools:$lnum:$cnum: Got LINK HREF [$hrf] ...\n") if ($htmtdbg4); } } elsif ($hrf =~ /<script\s+/i) { if ($hrf =~ /src=["']*(\S+)['"]*.*/i) { $hrf = $1; $hrf =~ s/>$//; $hrf =~ s/['"]$//; push(@ims,$hrf); prt("htmltools:$lnum:$cnum: Got SCRIPT SRC [$hrf] ...\n") if ($htmtdbg4); } #else { # prt( "WARNING: htmltools: SCRIPT sans SRC [$hrf]\n" ); #} } elsif ($hrf =~ /^<body\s+(.*)>$/i) { $hrf = $1; if ($hrf =~ /background=["']*(\S+)['"]*.*/i) { $hrf = $1; $hrf =~ s/>$//; $hrf =~ s/['"]$//; push(@ims,$hrf); prt("htmltools:$lnum:$cnum: Got body background [$hrf] ...\n") if ($htmtdbg7); } #else { # prt( "WARNING: htmltools: body sans background [$hrf]\n" ); #} } elsif ($hrf =~ /^<applet\s+(.*)>$/i) { $hrf = $1; if ($hrf =~ /code=["']*(\S+)['"]*.*/i) { $hrf = $1; $hrf =~ s/>$//; $hrf =~ s/['"]$//; push(@ims,$hrf); prt("htmltools:$lnum:$cnum: Got applet code [$hrf] ...\n") if ($htmtdbg7); } #else { # prt( "WARNING: htmltools: applet sans code [$hrf]\n" ); #} } } if ($ch eq "\n") { $lnum++; $cnum = 0; } } if ($htmtdbg5) { $i = scalar @ims; prt( "Returning $i IMG/OTHER items ...\n" ); foreach $hrf (@ims) { prt( "$hrf " ); } prt("\n"); } return @ims; } # just remove a <tag>, and </tag> ... # but leave the stuff between sub removetag { my ($txt, $tg) = @_; my $ntxt = ''; my $len = length($txt); my $ch = ''; my $hrf = ''; my $i; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $hrf = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; if ($ch eq '>') { last; } } if ($hrf =~ /^<$tg\s/i) { } elsif ($hrf =~ /^<$tg>$/i) { } elsif ($hrf =~ /^<\/$tg>$/i) { } else { $ntxt .= $hrf; } } else { $ntxt .= $ch; } } return $ntxt; } sub remove_script { my ($txt) = shift; my $dbgsc = 0; # only if a LOT of noise wanted my $tg = 'script'; my $ntxt = ''; my $len = length($txt); my $ch = ''; my $hrf = ''; my $i; my $insc = 0; my $quot = ''; my $pch = ''; my $qtxt = ''; my $lstl = ''; my $lnum = 1; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($insc) { if ( (($ch eq '"')||($ch eq "'")) && ( $pch ne "\\" ) ) { $quot = $ch; $qtxt = $ch; $pch = $ch; prt( "$lnum:$i: Begin QUOTES [$ch] ...[$lstl]\n" ) if ($dbgsc); $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); if (($ch eq $quot) && ( $pch ne "\\" )) { $qtxt .= $ch; prt( "$lnum:$i: End QUOTES [$quot] [$qtxt]\n" ) if ($dbgsc); last; } elsif ($ch eq "\n") { prt( "$lnum:$i: End QUOTES ON NEW LINE [$qtxt]\n" ) if ($dbgsc); last; } $pch = $ch; $qtxt .= $ch; } } elsif (($ch eq '*')&&($pch eq '/')) { prt( "$lnum:$i: Begin /* comment ... [$lstl]\n" ) if ($dbgsc); $qtxt = "$pch$ch"; $pch = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); if (($ch eq '/')&&($pch eq '*')) { prt( "$lnum:$i: End /* comment ... [$qtxt]\n" ) if ($dbgsc); last; } $pch = $ch; $qtxt .= $ch; } } elsif (($ch eq '/')&&($pch eq '/')) { prt( "$lnum:$i: Begin // comment ...[$lstl]\n" ) if ($dbgsc); $qtxt = "$pch$ch"; $pch = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq "\n") { prt( "$lnum:$i: End comment ... [$qtxt]\n" ) if ($dbgsc); last; } $pch = $ch; $qtxt .= $ch; } } elsif ($ch eq '<') { $hrf = $ch; $lstl .= $ch; $i++; prt( "$lnum:$i: Being tag ... [$lstl]\n" ) if ($dbgsc); if ($i < $len) { $ch = substr($txt,$i,1); if ($ch =~ /[\w\/!]/) { # if alphanumeric, or '/' or '!' for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch if ($ch ne "\n"); if ($ch eq '>') { prt( "$lnum:$i: End tag ... [$hrf]\n" ) if ($dbgsc); last; } elsif ($hrf eq '<!--') { prt( "$lnum:$i: Skip comment tag ... [$hrf]\n" ) if ($dbgsc); last; } if ($ch eq "\n") { $lstl = ''; } else { $lstl .= $ch; } } if ($hrf =~ /<\/$tg>/i) { prt( "$lnum:$i: End $tg [$hrf]\n" ) if ($dbgsc); $insc = 0; } } else { prt( "$lnum:$i: Non-alphanumeric follows - assume NOT tag ...\n" ) if ($dbgsc); } } } $pch = $ch; if ($ch eq "\n") { $lstl = ''; $lnum++; } else { $lstl .= $ch; } } else { if ($ch eq '<') { $hrf = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; if ($ch eq '>') { last; } $lnum++ if ($ch eq "\n"); } if ($hrf =~ /^<$tg\s+/i) { prt( "$lnum:$i: Begin $tg sp [$hrf]\n" ) if ($dbgsc); $insc = 1; $pch = ''; } elsif ($hrf =~ /^<$tg>$/i) { prt( "$lnum:$i: Begin $tg [$hrf]($i)\n" ) if ($dbgsc); $insc = 1; $pch = ''; } else { $ntxt .= $hrf; } } else { $ntxt .= $ch; } $lnum++ if ($ch eq "\n"); } } return $ntxt; } sub removefont { my ($txt) = shift; my $ntxt = removetag($txt,'font'); return $ntxt; } sub removetagattrib { my ($txt, $tag) = @_; my $ntxt = ''; my $len = length($txt); my $ch = ''; my $hrf = ''; my $i; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $hrf = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; if ($ch eq '>') { last; } } if ($hrf =~ /^<$tag>$/i) { $ntxt .= $hrf; } elsif ($hrf =~ /^<$tag\s+/i) { ###prt("Removing $tag attrib [$hrf]\n"); $ntxt .= substr($hrf,0,length($tag)+1).'>'; } else { $ntxt .= $hrf; } } else { $ntxt .= $ch; } } return $ntxt; } # add 2009-06-24 sub remove_table_attribs { my ($txt) = shift; my $ntxt = removetagattrib($txt,'table'); return $ntxt; } sub removetdattrib { my ($txt) = shift; my $ntxt = removetagattrib($txt,'td'); return $ntxt; } sub removetrattrib { my ($txt) = shift; my $ntxt = removetagattrib($txt,'tr'); return $ntxt; } sub substitutions { # ($txt2); my ($txt) = shift; $txt =~ s/ / /gm; $txt =~ s/&/&/gm; return $txt; } sub trimblanklines { my ($txt) = shift; my $len = length($txt); my $ntxt = ''; my $ln = ''; my $ch = ''; my $i = 0; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if (($ch eq "\n")||($ch eq "\r")) { if (length($ln)) { if ($ln =~ /\S+/) { # if got NOT space $ln = trimbothends($ln); if (length($ln)) { $ntxt .= $ln . $ch; } } } $ln = ''; } else { $ln .= $ch; } } if (length($ln)) { if ($ln =~ /\S+/) { $ln = trimbothends($ln); if (length($ln)) { $ntxt .= $ln; } } } return $ntxt; } sub trimblanks { my ($txt) = shift; my $len = length($txt); my $ntxt = ''; my $ln = ''; my $ch = ''; my $i = 0; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if (($ch eq "\n")||($ch eq "\r")) { if (length($ln)) { if ($ln =~ /\S+/) { # if got NOT space ###$ln = trimbothends($ln); ###if (length($ln)) { $ntxt .= $ln . $ch; ###} } } $ln = ''; } else { $ln .= $ch; } } if (length($ln)) { if ($ln =~ /\S+/) { ###$ln = trimbothends($ln); ###if (length($ln)) { $ntxt .= $ln; ###} } } return $ntxt; } sub trimblanklines_OK_maybe { my ($txt) = shift; my $len = length($txt); my $ntxt = ''; my $ln = ''; my $ch = ''; my $i = 0; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq "\n") { if (length($ln)) { if ($ln =~ /\S+/) { while ($ln =~ /^\s/) { $ln = substr($ln,1); } if (length($ln)) { $ntxt .= $ln . $ch; } } } $ln = ''; } else { $ln .= $ch; } } if (length($ln)) { if ($ln =~ /\S?/) { $ntxt .= $ln; } } return $ntxt; } sub triminlinetd { my ($txt) = shift; my $len = length($txt); my $ntxt = ''; my $ln = ''; my $ch = ''; my $lt = ''; my $nlt = ''; my $nln = ''; my $i = 0; for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq "\n") { if ($ln =~ /.*<td.*>(.*)<\/td>/i) { $lt = $1; # get text between <td>...</td> # $nlt =~ s/\s//g; this removes ALL spaces - NOT GOOD! $nlt = trimbothends($lt); if (length($nlt)) { ###prt("Got inline <td>...</td> - [$ln] [$lt] [$nlt]...\n"); $nln = $ln; $nln =~ s/$lt/$nlt/; ###prt("New line [$nln]...\n"); $ln = $nln; } } $ntxt .= $ln.$ch; $ln = ''; } else { $ln .= $ch; } } if (length($ln)) { if ($ln =~ /\S?/) { $ntxt .= $ln; } } return $ntxt; } # strip from '<?' to '?>', excluding within quotes sub strip_php_script { my ($txt) = shift; my $ntxt = ''; my $max = length($txt); my $pch = ''; my $inphp = 0; my $inquote = ''; for (my $i = 0; $i < $max; $i++) { my $ch = substr($txt,$i,1); if ($inphp) { ##print "Should be END PHP ...\n" if (($ch eq '>')&&($pch eq '?')); if (length($inquote)) { # wating for end of QUOTE if ( ($ch eq $inquote) && ($pch ne "\\") ) { #print "End of QUOTE\n"; $inquote = ''; } } else { if ( (($ch eq '"')||($ch eq "'")) && ($pch ne "\\") ) { #print "Start of QUOTE\n"; $inquote = $ch; } if (($ch eq '>')&&($pch eq '?')) { $inphp = 0; #print "End of PHP ...\n"; } } $pch = $ch; next; } else { if (($ch eq '?') && ($pch eq '<')) { $ntxt = substr($ntxt, 0, length($ntxt) - 1); #print "Start of PHP ...\n"; $inphp = 1; next; } } $pch = $ch; $ntxt .= $ch; } return $ntxt; } sub drop_php_from_array { my (@arr) = @_; my $txt = ''; foreach my $ln (@arr) { chomp $ln; $txt .= ' {=*==*=} ' if (length($txt)); $txt .= $ln; } $txt = strip_php_script( $txt ); @arr = split( / \{=\*==\*=\} /, $txt ); my $lnc = scalar @arr; for (my $i = 0; $i < $lnc; $i++) { $arr[$i] .= "\n"; } return @arr; } sub htmlexpand { my ($rtxt) = shift; my $tlen = length($rtxt); prt( "len=$tlen - Add STYLE TAG to new line ...\n") if ($htmtdbg1); $rtxt = tag2newline($rtxt,'style'); $tlen = length($rtxt); prt( "len=$tlen - Add TABLE TAG to new line ...\n") if ($htmtdbg1); $rtxt = tag2newline($rtxt,'table'); $tlen = length($rtxt); prt( "len=$tlen - Add TR TAG to new line ...\n") if ($htmtdbg1); $rtxt = tag2newline($rtxt,'tr'); $tlen = length($rtxt); prt( "len=$tlen - Add TH TAG to new line ...\n") if ($htmtdbg1); $rtxt = tag2newline($rtxt,'th'); $tlen = length($rtxt); prt( "len=$tlen - Add TD TAG to new line ...\n") if ($htmtdbg1); $rtxt = tag2newline($rtxt,'td'); $tlen = length($rtxt); prt( "len=$tlen - Add SCRIPT TAG to new line ...\n") if ($htmtdbg1); $rtxt = tag2newline($rtxt,'script'); $tlen = length($rtxt); prt( "len=$tlen - Add PRE TAG to new line ...\n") if ($htmtdbg1); $rtxt = tag2newline($rtxt,'pre'); $tlen = length($rtxt); prt( "len=$tlen - Returned from htmlexpand ...\n" ) if ($htmtdbg1); return $rtxt; } sub dropdoctype { my ($txt) = shift; my $tlen = length($txt); my $pch = ''; my $ch = ''; my $rtxt = ''; for (my $i = 0; $i < $tlen; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $pch = substr($txt,$i); if ($pch =~ /^<!DOCTYPE\s+/i) { ###prt( "Got DOCTYPE ...\n" ); $i++; # move to next for (; $i < $tlen; $i++) { $ch = substr($txt,$i,1); if ($ch eq '>') { $ch = ''; last; } } } } $rtxt .= $ch; } return $rtxt; } sub html_clean_up1 { my ($rtxt) = shift; my $tlen = length($rtxt); prt( "len=$tlen - Drop DOCTYPE <!DOCTYPE... > ...\n"); $rtxt = dropdoctype($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Drop comments <!--...--> ...\n"); $rtxt = dropcomments($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Strip <HEAD>...</HEAD> tag ...\n"); $rtxt = striptag($rtxt, 'HEAD'); $tlen = length($rtxt); prt( "len=$tlen - Strip <script>...</script> tag ...\n"); $rtxt = striptag($rtxt,'script'); $tlen = length($rtxt); prt( "len=$tlen - Strip <noscript>...</noscript> tag ...\n"); $rtxt = striptag($rtxt,'noscript'); $tlen = length($rtxt); prt( "len=$tlen - Strip <SELECT>...</SELECT> tag ...\n"); $rtxt = striptag($rtxt,'select'); $tlen = length($rtxt); prt( "len=$tlen - Remove <font ...> tags ...\n"); $rtxt = removefont($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Remove <b> tags ...\n"); $rtxt = removetag($rtxt,'b'); $tlen = length($rtxt); prt( "len=$tlen - Remove <tt> tags ...\n"); $rtxt = removetag($rtxt,'tt'); $tlen = length($rtxt); prt( "len=$tlen - Remove <nobr> tags ...\n"); $rtxt = removetag($rtxt,'nobr'); $tlen = length($rtxt); prt( "len=$tlen - Remove <span> tags ...\n"); $rtxt = removetag($rtxt,'span'); $tlen = length($rtxt); prt( "len=$tlen - Remove <div> tags ...\n"); $rtxt = removetag($rtxt,'div'); $tlen = length($rtxt); if ($rtxt =~ /<strong>/) { prt( "len=$tlen - Remove <strong> tags ...\n"); $rtxt = removetag($rtxt,'strong'); $tlen = length($rtxt); } prt( "len=$tlen - Remove <ul> tags ...\n"); $rtxt = removetag($rtxt,'ul'); $tlen = length($rtxt); prt( "len=$tlen - Remove <u> tags ...\n"); $rtxt = removetag($rtxt,'u'); $tlen = length($rtxt); prt( "len=$tlen - Remove <h1> tags ...\n"); $rtxt = removetag($rtxt,'h1'); $tlen = length($rtxt); prt( "len=$tlen - Remove <h2> tags ...\n"); $rtxt = removetag($rtxt,'h2'); $tlen = length($rtxt); prt( "len=$tlen - Remove <li> tags ...\n"); $rtxt = removetag($rtxt,'li'); $tlen = length($rtxt); prt( "len=$tlen - Remove <br> tags ...\n"); $rtxt = removetag($rtxt,'br'); $tlen = length($rtxt); prt( "len=$tlen - Remove <html> tags ...\n"); $rtxt = removetag($rtxt,'html'); $tlen = length($rtxt); prt( "len=$tlen - Remove <body> tags ...\n"); $rtxt = removetag($rtxt,'body'); $tlen = length($rtxt); prt( "len=$tlen - Remove p attributes ...\n"); $rtxt = removetagattrib($rtxt,'p'); $tlen = length($rtxt); prt( "len=$tlen - Remove th attributes ...\n"); $rtxt = removetagattrib($rtxt,'th'); $tlen = length($rtxt); prt( "len=$tlen - Remove tr attributes ...\n"); $rtxt = removetrattrib($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Remove td attributes ...\n"); $rtxt = removetdattrib($rtxt); $tlen = length($rtxt); $rtxt = trimblanklines($rtxt); $tlen = length($rtxt); return $rtxt; } sub htmlcleanall { my ($rtxt) = shift; my $tlen = length($rtxt); prt( "len=$tlen - Drop comments <!--...--> ...\n"); $rtxt = dropcomments($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Strip <HEAD>...</HEAD> tag ...\n"); $rtxt = striptag($rtxt, 'HEAD'); $tlen = length($rtxt); prt( "len=$tlen - Strip <script>...</script> tag ...\n"); $rtxt = striptag($rtxt,'script'); $tlen = length($rtxt); prt( "len=$tlen - Strip <noscript>...</noscript> tag ...\n"); $rtxt = striptag($rtxt,'noscript'); $tlen = length($rtxt); prt( "len=$tlen - Strip <SELECT>...</SELECT> tag ...\n"); $rtxt = striptag($rtxt,'select'); $tlen = length($rtxt); prt( "len=$tlen - Remove <font ...> tags ...\n"); $rtxt = removefont($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Remove <b> tags ...\n"); $rtxt = removetag($rtxt,'b'); $tlen = length($rtxt); prt( "len=$tlen - Remove <tt> tags ...\n"); $rtxt = removetag($rtxt,'tt'); $tlen = length($rtxt); prt( "len=$tlen - Remove <nobr> tags ...\n"); $rtxt = removetag($rtxt,'nobr'); $tlen = length($rtxt); prt( "len=$tlen - Remove <span> tags ...\n"); $rtxt = removetag($rtxt,'span'); $tlen = length($rtxt); prt( "len=$tlen - Remove th attributes ...\n"); $rtxt = removetagattrib($rtxt,'th'); $tlen = length($rtxt); prt( "len=$tlen - Remove tr attributes ...\n"); $rtxt = removetrattrib($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Remove td attributes ...\n"); $rtxt = removetdattrib($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Delete <a...> & </a>\n"); $rtxt = collecthrefs($rtxt,1); $tlen = length($rtxt); prt( "len=$tlen - Delete <img...>\n"); $rtxt = collectimgs($rtxt,1); $tlen = length($rtxt); prt( "len=$tlen - Do substitutions ...\n"); $rtxt = substitutions($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Trim blank lines ...\n"); $tlen = length($rtxt); $rtxt = trimblanklines($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Trim inline td ...\n"); $rtxt = triminlinetd($rtxt); $tlen = length($rtxt); prt( "len=$tlen - Returned from htmlcleanall ...\n"); return $rtxt; } # added 18/07/2008 sub get_tag_attr_array { my ($tag) = shift; my ($i, $i2, $ch, $ln, @arr, $tg, $spc, $com, $incom, $pch, $nch); $tg = ''; $ln = length($tag); @arr = (); $spc = 0; $incom = 0; $pch = ''; my $indent = ' '; for ($i = 0; $i < $ln; $i++) { $i2 = $i + 1; $ch = substr($tag,$i,1); $nch = ''; $nch = substr($tag,$i2,1) if ($i2 < $ln); $tg .= $ch; # add it to the tag if ($incom) { if ($ch eq $com) { $incom = 0; } } else { if (($ch eq '"')||($ch eq "'")) { $com = $ch; $incom = 1; } elsif (($ch =~ /^\s$/) && ($nch ne '/') && ($nch ne '>')) { $spc++; if ($spc > 1) { if (length($tg)) { $tg = $indent.$tg if (@arr); push(@arr,$tg); } $tg = ''; } } } $pch = $ch; } if (length($tg)) { $tg = $indent.$tg if (@arr); push(@arr,$tg); } return @arr; } sub split_tag_attrs { my ($tag) = shift; my @arr = (); if ($tag =~ /\s+/) { # there is a chance it has more than ONE attribute @arr = get_tag_attr_array($tag); } else { push(@arr,$tag); } return @arr; } sub array_tags2newline { my (@arr) = @_; my @narr = (); my ($ch, $len); my ($ln, $i, $pre, $lc, $l, $tag, $intag); my $maxtag = 60; $pre = ''; $lc = scalar @arr; $intag = 0; $tag = ''; for ($l = 0; $l < $lc; $l++) { $ln = $arr[$l]; # get LINE $len = length($ln); # and its LENGTH for ($i = 0; $i < $len; $i++) { # process char by char $ch = substr($ln,$i,1); if ($intag) { # seek END of tag $tag .= $ch; if ($ch eq '>') { if ($tag =~ /^<.+>$/) { # got WHOLE tag if (length($tag) > $maxtag) { push(@narr, split_tag_attrs($tag)); } else { push(@narr,$tag); } } else { push(@narr,$tag); } $tag = ''; $intag = 0; } } else { # seek start of TAG if ($ch eq '<') { if (length($pre)) { if ($pre =~ /^\s+$/) { # is all space - dump it } else { push(@narr,$pre); } } $pre = ''; # clear anything before $tag = $ch; # START tag $intag = 1; # and now IN A TAG } else { $pre .= $ch; } } } # done LINE, so add this tag push(@narr,$tag) if length($tag); $tag = ''; } return @narr; } # set added 2009-06-24 # inline_clean_paras($) inline_clean_td($) remove_empty_paras($) remove_doctype($) sub clean_line_with_wrap { my ($t, $m) = @_; my $ln = length($t); my $nl = ''; my $nll = 0; for (my $j = 0; $j < $ln; $j++) { my $c = substr($t,$j,1); if ($c =~ /\n/) { if (length($nl)) { my $j2 = $j + 1; if ($j2 < $ln) { my $c2 = substr($t,$j2,1); # get first after if (($c2 eq '.')||($c2 eq ':')||($c2 eq ',')||($c2 =~ /\s/)) { $c = ''; # kill CR } else { $c = ' '; # conv to space } } else { $c = ''; # last, kill CR } } else { $c = ''; # first, kill CR } } if (length($c)) { $nll++; if ($nll > $m) { if ($c =~ /\s/) { $c = "\n"; $nll = 0; } } $nl .= $c; } } return $nl; } # take things like #<p> #params #returns one value, #the symbolic constant identifying the RGB destination blend #function. The initial value is #GL_ZERO #. #See #glBlendFunc #and #glBlendFuncSeparate #. #</p> # and return #<p> #params returns one value, the symbolic constant identifying the RGB destination blend #function. The initial value is GL_ZERO. See glBlendFunc and glBlendFuncSeparate. #</p> sub inline_clean_paras { my ($tx,$mx) = @_; my $len = length($tx); my $ntxt = ''; my @wlist = (); my ($i, $ch, $intg, $inp, $ino, $tag, $bp, $ep, $wt, $tlen); my ($btxt, $etxt); $intg = 0; $inp = 0; $ino = 0; $tag = ''; $bp = 0; $ep = 0; prt( "Doing [$len] chars for P... max line $mx\n" ) if ($htmldbg8); for ($i = 0; $i < $len; $i++) { $ch = substr($tx,$i,1); if ($intg) { if ($ch eq '>') { #prt( "$i: End tag\n" ); $intg = 0; if ($tag =~ /^p$/i) { $inp = 1; $ino = 0; $bp = $i + 1; #prt( "$bp: Begin p [$tag]\n" ); } elsif ($tag =~ /^\/p$/i) { if ($inp) { $inp = 0; $ep = $i - 4; #prt( "$ep: End p [$tag][$bp - $ep]\n" ); } else { prt( "$ep: End p [$tag] NOT inp!\n" ) if ($htmldbg8); } } else { $ino = 1; #prt( "$i: Other [$tag]\n" ); } $tag = ''; } else { $tag .= $ch; } } else { if ($ch eq '<') { $tag = ''; $intg = 1; #prt( "$i: Start tag\n" ); } } $ntxt .= $ch; $tlen = $ep - $bp; if (($bp > 0) && ($tlen > 0)) { $wt = substr($tx,$bp,$tlen); #prt( "Work:$bp:$ep:$tlen: [$wt]\n" ); # exit 1; push(@wlist, [$bp,$ep]); $ep = 0; $bp = 0; } } $len = scalar @wlist; prt( "Done [$i] chars. Got $len work items...\n" ) if ($htmldbg8); for ($i = $len - 1; $i >= 0; $i--) { $bp = $wlist[$i][0]; $ep = $wlist[$i][1]; $tlen = $ep - $bp; $wt = substr($tx,$bp,$tlen); $btxt = substr($tx,0,$bp); $etxt = substr($tx,$ep+1); #prt( "Work:$bp:$ep:$tlen: [$wt]\n" ); $wt = clean_line_with_wrap($wt,$mx); #prt( "AWork:$bp:$ep:$tlen: [$wt]\n" ); $tx = $btxt.$wt.$etxt; $btxt = substr($btxt,length($btxt)-3) if (length($btxt) > 3); $etxt = substr($etxt,0,4) if (length($etxt) > 4); #prt( "Bgn:[$btxt] End:[$etxt]\n" ); } $len = length($tx); prt( "P Returning [$len] chars...\n" ) if ($htmldbg8); return $tx; } # added 2009-06-24 sub inline_clean_td { my ($tx,$mx) = @_; my $len = length($tx); my $ntxt = ''; my @wlist = (); my ($i, $ch, $intg, $inp, $ino, $tag, $bp, $ep, $wt, $tlen); my ($btxt, $etxt); $intg = 0; $inp = 0; $ino = 0; $tag = ''; $bp = 0; $ep = 0; prt( "Doing [$len] chars for TD... max line $mx\n" ) if ($htmldbg8); for ($i = 0; $i < $len; $i++) { $ch = substr($tx,$i,1); if ($intg) { if ($ch eq '>') { #prt( "$i: End tag\n" ); $intg = 0; if ($tag =~ /^td$/i) { $inp = 1; $ino = 0; $bp = $i + 1; #prt( "$bp: Begin TD [$tag]\n" ); } elsif ($tag =~ /^\/td$/i) { if ($inp) { $inp = 0; $ep = $i - 5; #prt( "$ep: End TD [$tag][$bp - $ep]\n" ); } else { prt( "$ep: End td [$tag] NOT inp!\n" ) if ($htmldbg8); } } else { $ino = 1; #prt( "$i: Other [$tag]\n" ); } $tag = ''; } else { $tag .= $ch; } } else { if ($ch eq '<') { $tag = ''; $intg = 1; #prt( "$i: Start tag\n" ); } } $ntxt .= $ch; $tlen = $ep - $bp; if (($bp > 0) && ($tlen > 0)) { $wt = substr($tx,$bp,$tlen); #prt( "Work:$bp:$ep:$tlen: [$wt]\n" ); # exit 1; push(@wlist, [$bp,$ep]); $ep = 0; $bp = 0; } } $len = scalar @wlist; prt( "Done [$i] chars. Got $len work items...\n" ) if ($htmldbg8); for ($i = $len - 1; $i >= 0; $i--) { $bp = $wlist[$i][0]; $ep = $wlist[$i][1]; $tlen = $ep - $bp; $wt = substr($tx,$bp,$tlen); $btxt = substr($tx,0,$bp); $etxt = substr($tx,$ep+1); #prt( "Work:$bp:$ep:$tlen: [$wt]\n" ); $wt = clean_line_with_wrap($wt,$mx); #prt( "AWork:$bp:$ep:$tlen: [$wt]\n" ); $tx = $btxt.$wt.$etxt; $btxt = substr($btxt,length($btxt)-3) if (length($btxt) > 3); $etxt = substr($etxt,0,4) if (length($etxt) > 4); #prt( "Bgn:[$btxt] End:[$etxt]\n" ); } $len = length($tx); prt( "TD Returning [$len] chars...\n" ) if ($htmldbg8); return $tx; } # added 2009-06-24 - will remove <p>( |\n)+</p> sub remove_empty_paras { my ($t) = shift; my $l = length($t); my ($c, $p, $p2, $c2, $tx); my $nt = ''; my $it = 0; my $tg = ''; $tx = ''; # start no text in para for ($p = 0; $p < $l; $p++) { $c = substr($t,$p,1); if ($it) { if ($c eq '>') { $it = 0; if ($tg =~ /^p$/) { # got para start $p2 = $p + 1; # bump to next $tx = ''; # start text accumulation for (; $p2 < $l; $p2++) { $c2 = substr($t,$p2,1); if ($c2 eq '<') { # reached NEXT tag if ($tx =~ /\S/) { # has NON space, so leave it } else { # all spacey stuff up to thsi next tag $tg = ''; $p2++; # move on, and get this tag for (; $p2 < $l; $p2++) { $c2 = substr($t,$p2,1); if ($c2 eq '>') { if ($tg =~ /^\/p$/) { $p = $p2; # SUCCESS - skip it $c = ''; # kill anything to add # and have already add '<p', so $nt = substr($nt,0,length($nt)-2); } last; # out of 2nd inner loop } else { $tg .= $c2; } } # second inner loop on finding next end of tag } last; # out of 1st inner loop } else { $tx .= $c2; # accumulate text BETWEEN tags } } # for - inner loop on finding <P> tag } # if is <p> $tg = ''; } else { $tg .= $c; } } elsif ($c eq '<') { $it = 1; # tag commencing $tg = ''; # so start with nothing } $nt .= $c; } return $nt; } # added 2009-06-24 - only checks initial lines sub remove_doctype { my ($tx) = shift; my $l = length($tx); for (my $p = 0; $p < $l; $p++) { my $c = substr($tx,$p,1); if ($c eq '<') { my $dt = ''; # start possible doctype accumulation $p++; # bump to next for (; $p < $l; $p++) { $c = substr($tx,$p,1); if ($c eq '>') { # reached end of tag if ($dt =~ /^!DOCTYPE\s+/i) { $p++; return substr($tx,$p); } last; # not doctype - end of search } else { $dt .= $c; } } last; } elsif ($c =~ /\S/) { last; # not '<' or space - end of search } } return $tx; } 1; # eof - htmltools.pl