Generated: Mon Aug 16 14:14:26 2010 from linkhtml.pl 2010/04/13 26.7 KB.
#!/perl -w # NAME: linkhtml.pl # AIM: Given one input HTML file, parse HTML elements, and show each 'link' given in the file... # 2010/04/13 - looking good... # 2010/04/12 - geoff mclane - http://geoffair.net/mperl/ use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use Cwd; unshift(@INC, 'C:\GTools\perl'); require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $perl_dir = 'C:\GTools\perl'; my $outfile = $perl_dir."\\temp.$pgmname.txt"; open_log($outfile); # user variables my $load_log = 0; my $def_infile = 'C:\HOMEPAGE\FG\testpage2.htm'; #my $def_infile = 'C:\HOMEPAGE\FG\Docs\getstart\getstartch9.html'; #my $def_infile = 'C:\HOMEPAGE\FG\Projects\747-JW\index.html'; my $in_file = ''; my $ignore_close_element = 0; # dangerous - ignoring a close element my $out_html = $perl_dir."\\temphtml.htm"; my @closed_tags = ( "meta", "link", "applet", "img", "input", "object", "embed", "servlet", "br", "hr", "area", "base", "basefont", "frame", "isindex", "param", "bgsound", "embed", "keygen" ); # tags which do NOT need a closing, like </p>, tag my @opt_tags = ( "body", "colgroup", "dd", "dt", "head", "html", "li", "optgroup", "option", "p", "tbody", "td", "tfoot", "th", "thead", "tr", "marquee" ); ### program variables my $verbosity = 0; my @warnings = (); my $cwd = cwd(); # debug my $dbg01 = 0; # show each item pushed to the stack my $dbg38 = 0; # prt( "[dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($dbg38); my $dbg39 = 0; # prt( "[dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($dbg39); sub VERB1() { return (($verbosity > 0) ? 1 : 0); } sub VERB5() { return (($verbosity >= 5) ? 1 : 0); } sub VERB9() { return (($verbosity >= 9) ? 1 : 0); } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg) } close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub show_warnings() { if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ); } } sub is_closed_tag($) { my ($tt) = shift; my $lctt = lc($tt); foreach my $tag (@closed_tags) { return 1 if ($tag eq $lctt); } return 0; } sub is_opt_tag($) { my ($tt) = shift; my $lctt = lc($tt); foreach my $tag (@opt_tags) { return 1 if ($tag eq $lctt); } return 0; } # $drop = can_find_this_tag($tag,\@elements); sub can_find_this_tag($$) { my ($tag,$re) = @_; my $len = scalar @{$re}; my $drop = 0; my $bu = -1; my $last = ''; my $lctag = lc($tag); while ($len) { $drop++; # can pop this one $last = ${$re}[$bu][0]; # get tag if (($last eq $tag)||(lc($last) eq $lctag)) { # if the desired tag return $drop; # return drop value } elsif ( ! is_opt_tag($last) ) { return 0; # oop, have a non-optional tag } $bu--; # back up one more $len--; # and reduce available to check } return 0; } sub is_all_optional($) { my ($re) = @_; my $len = scalar @{$re}; my $bu = -1; my ($last); while ($len) { $last = ${$re}[$bu][0]; # get tag if ( ! is_opt_tag($last) ) { return 0; # oop, have a non-optional tag } $bu--; # back up one more $len--; # and reduce available to check } return 1; # ALL were optiona } sub pop_optional_elements($) { my ($re) = @_; my $len = scalar @{$re}; my $bu = -1; my $pop = 0; my ($last); while ($len--) { $last = ${$re}[$bu][0]; # get tag last if (!is_opt_tag($last)); $pop++; $bu--; } return $pop; } sub count_optional_elements($) { my ($re) = @_; my $len = scalar @{$re}; my $opts = 0; my $cnt = 0; my ($last); while ($len--) { $last = ${$re}[$cnt][0]; # get tag $opts++ if (is_opt_tag($last)); $cnt++; } return $opts; } sub show_stack_elements($$$) { my ($tag,$rele,$rlns) = @_; my $cnt = scalar @{$rele}; my $lcnt = scalar @{$rlns}; if ($cnt) { prt("The stack has $cnt elements... The current closing element is [$tag]\n"); for (my $i = 0; $i < $cnt; $i++) { my $e = ${$rele}[$i][0]; my $n = ${$rele}[$i][1]; prt("$n: element:[$e]"); prt(" SAME as tag [$tag]!") if ($e eq $tag); if ($n <= $lcnt) { my $ln = trim_all(${$rlns}[$n-1]); prt(" line=[$ln]"); } prt("\n"); } } } sub get_element_chain($) { my ($rele) = @_; my $cnt = scalar @{$rele}; my $chn = ''; if ($cnt) { for (my $i = 0; $i < $cnt; $i++) { my $e = ${$rele}[$i][0]; $chn .= '|' if length($chn); $chn .= $e; } } return $chn; } sub get_attribute_hash_ref($$$$) { my ($fank,$fil,$xml,$dbg) = @_; my %hash = (); my ($ank,$len,$i,$ch,$pc,$hr2,$txt); $ank = trim_all($fank); $len = length($ank); $ch = ''; $hr2 = ''; for ($i = 0; $i < $len; $i++) { $pc = $ch; $ch = substr($ank,$i,1); # if ($ch =~ /\w/) - this missed xml:link="abc" # and 'http-equiv="..." if ($ch =~ /(\w|:|-)/) { $hr2 .= $ch; # accumulate \w chars - alphanumeric, including _ } elsif (length($hr2)) { if (($ch ne '=') && ($ch =~ /\s/)) { $i++; for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch eq '='); last if !($ch =~ /\s/); } } if ($ch eq '=') { # found our equal sign $i++; # move on... for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch =~ /('|")/); last if !($ch =~ /\s/); } if (($ch eq '"')||($ch eq "'")) { $pc = $ch; $i++; # move on... $txt = ''; for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch eq $pc); $txt .= $ch; } if ($ch eq $pc) { $hr2 = lc($hr2) if ($xml); $hash{$hr2} = $txt; prt( "[dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($dbg38); } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO END INVERTED COMMA! ($pc) [$fil]\n"); pgm_exit(1,"NEED CODE FIX!") if ($dbg); } } else { #if (($ch =~ /\w/) && (($hr2 =~ /name/i)||($hr2 =~ /id/i))) { if ($ch =~ /(\w|-)/) { # accept ALL WITHOUT inverted comma $txt = $ch; # start the text $i++; # MOVING ON for (; $i < $len; $i++) { $ch = substr($ank,$i,1); #last if !($ch =~ /\w/); # can ONLY stop on NOT alphanumeric last if !($ch =~ /(\w|-|:)/); # can ONLY stop on NOT alphanumeric or some specials $txt .= $ch; } $hr2 = lc($hr2) if ($xml); $hash{$hr2} = $txt; prt( "[dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($dbg39); } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO START INVERTED COMMA! [$fil]\n"); pgm_exit(1,"NEED CODE FIX!") if ($dbg); } } } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO EQUAL SIGN! [$fil]\n"); pgm_exit(1,"NEED CODE FIX!") if ($dbg); } $hr2 = ''; } } return \%hash; } sub get_html_file_hash($) { my ($inf) = shift; if (!open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]!\n"); } my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Got $lncnt lines, from file [$inf]...\n"); my ($i,$line,$ch,$tag,$len,$intag,$txt,$j,$pc,$ppc,$incdata,$hadsp,$attrs); my ($lnn,$last,$lln,$bgnlnn,$endlnn,$clnn,$stkdep,$maxdep); my ($maxelement,$echn,$hr,$msg,$ctag); my ($incomm,$pppc,$drop); $tag = ''; $attrs = ''; $intag = 0; $incdata = 0; $hadsp = 0; $txt = ''; $ch = ''; $pc = ''; $ppc = ''; my @elements = (); $lnn = 0; $maxdep = 0; $maxelement = ''; $incomm = 0; my @html = (); for ($i = 0; $i < $lncnt; $i++) { $line = $lines[$i]; chomp $line; $len = length($line); $lnn++; #$clnn = sprintf("%3d",$lnn); $clnn = "$lnn"; for ($j = 0; $j < $len; $j++) { $pppc = $ppc; $ppc = $pc; $pc = $ch; $ch = substr($line,$j,1); if ($incdata) { if (($ch eq '>')&&($pc eq ']')&&($ppc eq ']')) { $incdata = 0; prt("$clnn: End CDATA\n") if (VERB5()); # ========================================== push(@html,[$txt,$tag,$attrs,$hr]); $msg = "$lnn: Store1 {".$txt."}{".$tag."}{".$attrs."}"; $msg =~ s/\n/\*nl\*/g; prt( "$msg\n" ) if ( $dbg01 || VERB9() ); $hr = get_attribute_hash_ref("",$inf,1,1); # ========================================== prtw("WARNING: CDATA: Attribute collect has length! [$attrs]\n") if (length($attrs)); # reset $txt = ''; $tag = ''; $attrs = ''; $hadsp = 0; $intag = 0; next; } $tag .= $ch; } elsif ($incomm) { # very specific --> exit for this tag if (($ch eq '>')&&($pc eq '-')&&($ppc eq '-')) { $incomm = 0; prt("$clnn: End comment\n") if (VERB5()); # ========================================== push(@html,[$txt,$tag,$attrs,$hr]); $msg = "$lnn: Store2 {".$txt."}{".$tag."}{".$attrs."}"; $msg =~ s/\n/\*nl\*/g; prt( "$msg\n" ) if ( $dbg01 || VERB9() ); $hr = get_attribute_hash_ref("",$inf,1,1); # ========================================== prtw("WARNING: end comment: Attribute collect has length! [$attrs]\n") if (length($attrs)); # reset $txt = ''; $tag = ''; $attrs = ''; $hadsp = 0; $intag = 0; next; } $tag .= $ch; } elsif ($intag) { if ($hadsp) { $attrs .= $ch if !($ch eq '>'); } elsif ($ch =~ /\s/) { $hadsp = 1; $attrs .= $ch; } else { $tag .= $ch if !($ch eq '>'); } if ($ch eq '>') { $intag = 0; $endlnn = $lnn; } elsif (($ch eq '[')&&($pc eq 'A')&&($tag =~ /\!\[CDATA\[$/)) { $incdata = 1; prt("$clnn: Begin CDATA\n") if (VERB5()); next; } elsif (($ch eq '-')&&($pc eq '-')&&($ppc eq '!')&&($pppc eq '<')) { $incomm = 1; prt("$clnn: Begin comment\n") if (VERB5()); next; } if (!$intag) { $tag = trim_all($tag); # $clnn = sprintf("%3d",$lnn); $clnn = "$lnn"; $msg = "$clnn: "; $msg .= "Text [".trim_all($txt)."]\n$clnn: " if (length($txt) && !($txt =~ /^\s+$/)); $msg .= "End tag [$tag] "; $msg .= "Attrs [".trim_all($attrs)."] " if (length($attrs)); if ($tag =~ /^(\!|\?)/) { $hr = get_attribute_hash_ref("",$inf,1,1); $msg .= "Special"; } else { # if ($attrs =~ /\/$/) but it may NOT end with '/' $hr = get_attribute_hash_ref(trim_all($attrs),$inf,1,1); if (($attrs =~ /\/$/) || is_closed_tag($tag)) { $msg .= "self-closed"; } elsif ($tag =~ /^\//) { $ctag = substr($tag,1); $msg .= "Close"; if (@elements) { $last = $elements[-1][0]; $lln = $elements[-1][1]; if ($last eq $ctag) { pop @elements; } else { # but may have 'opt' tags - tags that need no close on the stack, which # can be dropped to get to this tag $drop = can_find_this_tag($ctag,\@elements); if ($drop) { while($drop--) { pop @elements; } } else { if ($ignore_close_element) { $echn = get_element_chain(\@elements); prtw("WARNING: Last close [<$tag>] NOT LAST in stack [$echn]\n"); prtw("WARNING:$lnn: IGNORING close element [</$tag>]\n"); } else { prt("Was processing [$msg] in FILE:[$inf]\n"); $msg = ''; prt("\nERROR: Last [$last]$lln NE [$ctag]$lnn line=[".trim_all($line)."]\n"); show_stack_elements($ctag,\@elements,\@lines); pgm_exit(1,"ERROR:[1]: It is useless to continue when the element stack is out of order!\n"); } } } $echn = get_element_chain(\@elements); prt("$lnn: Popped element [$tag] remains [$echn]\n") if (VERB9()); } else { prt("$msg\n"); $msg = ''; prt("\nERROR: The stack has NO elements... The current closing element is [$ctag]\n"); pgm_exit(1,"ERROR:[2]: It is useless to continue when the element stack is out of order! [2]\n"); } } else { $msg .= "Open"; push(@elements,[$tag,$bgnlnn,$endlnn]); $echn = get_element_chain(\@elements); $stkdep = scalar @elements; if ($stkdep > $maxdep) { $maxdep = $stkdep; $maxelement = "$clnn: $tag $bgnlnn $endlnn [$echn]"; } prt("$lnn: Pushed element [$tag] chain=[$echn]\n") if (VERB9()); } } prt("$msg\n") if (VERB1()); # ========================================== push(@html,[$txt,$tag,$attrs,$hr]); $msg = "$lnn: Store3 {".$txt."}{".$tag."}{".$attrs."}"; $msg =~ s/\n/\*nl\*/g; prt("$msg\n") if ( $dbg01 || VERB9() ); # ========================================== # reset $txt = ''; $tag = ''; $attrs = ''; $hadsp = 0; } } else { if ($ch eq '<') { $tag = ''; $intag = 1; $hadsp = 0; $bgnlnn = $lnn; prt("$lnn: Begin tag line=[$line]\n") if (VERB9()); } else { $txt .= $ch; } } } # reached end of line - get next #================================= $ch = "\n"; if ($incdata) { $tag .= $ch; } else { if ($intag) { if ($hadsp) { $attrs .= $ch; # if (length($attrs)); # && !($attrs =~ /\s$/)); } else { $tag .= $ch; # if (length($tag)); # && !($tag =~ /\s$/)); } } else { $txt .= $ch; # if (length($txt)); # && !($txt =~ /\s$/)); } } $pppc = $ppc; $ppc = $pc; $pc = $ch; } prt("Max. element stack $maxdep...$maxelement\n"); if (@elements && !is_all_optional(\@elements)) { $drop = pop_optional_elements(\@elements); if ($drop) { prt("Dropping $drop optional elements from stack... "); while($drop--) { pop @elements; } $drop = scalar @elements; if ($drop) { prt("leaving $drop..."); $drop = count_optional_elements(\@elements); if ($drop) { prt(" $drop are optional..."); } } else { prt("leaving none..."); } prt("\n"); } if (@elements) { show_stack_elements("At-End-of-File",\@elements,\@lines); pgm_exit(1,"WARNING: This file [$inf] is NOT clean!\n"); } } prt("Done $lncnt lines... [$inf] appears ok...\n"); my %hash = (); $hash{$inf} = [@html]; return \%hash; } sub get_href_type($) { my ($src) = shift; if ($src =~ /^http:/i) { #push(@httprefs, [$src, $fil, $lnnos] ); return 1; # remote HREF } elsif ($src =~ /^https:/i) { return 1; # remote HREF #push(@httpsrefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^ftp:/i) { #push(@ftprefs, [$src, $fil, $lnnos] ); return 3; # remote HREF } elsif ($src =~ /^mailto:/i) { #push(@mtrefs, [$src, $fil, $lnnos] ); return 4; # remote HREF } elsif ( $src =~ /^javascript:/i ) { return 5; # a JAVASCRIPT HREF } elsif ($src =~ /^file:/i) { return 5; # remote HREF } elsif ( substr($src,0,1) eq '#') { # local in page HREF return 6; } else { my $ind = index($src,'#'); $src = substr($src,0,$ind) if ( $ind != -1 ); $ind = index($src,'?'); $src = substr($src,0,$ind) if ( $ind != -1 ); $src =~ s/\/$//; return 7 if (length($src)); } return 0; } sub dos_2_unix($) { my ($du) = shift; $du =~ s/\\/\//g; return $du; } sub fix_rel_unix_path($) { my ($path) = shift; $path = dos_2_unix($path); # pgm_exit(1,"ERROR: Passed PATH that starts relative! [$path]\n") if (($path =~ /^\.\./)||($path =~ /^\.(\\|\/)\.\./)); my @a = split(/\//, $path); my $npath = ''; my $max = scalar @a; my @na = (); for (my $i = 0; $i < $max; $i++) { my $p = $a[$i]; if ($p eq '.') { # ignore this } elsif ($p eq '..') { if (@na) { pop @na; # discard previous } else { prt( "WARNING: Got relative .. without previous!!! path=[$path]\n" ); } } else { push(@na,$p); } } foreach my $pt (@na) { $npath .= "/" if length($npath); $npath .= $pt; } return $npath; } sub get_local_href($) { my ($src) = shift; my $ind = index($src,'#'); $src = substr($src,0,$ind) if ( $ind != -1 ); $ind = index($src,'?'); $src = substr($src,0,$ind) if ( $ind != -1 ); $src =~ s/\/$//; # remove any TRAILING '/' char # 25/07/2007 - also 'convert' '%20' to space $src =~ s/%20/ /g; return $src; } sub find_anchor_name($$) { my ($nm,$rhtml) = @_; my $len = scalar @{$rhtml}; for (my $i = 0; $i < $len; $i++) { my $tag = ${$rhtml}[$i][1]; if ($tag =~ /^a$/i) { my $rah = ${$rhtml}[$i][3]; if (defined ${$rah}{'name'}) { return 1 if (${$rah}{'name'} eq $nm); } } } return 0; # NOT found } sub show_hash_ref($) { my ($hr) = @_; my ($fil,$rhtml,$len,$htxt,$i,$txt,$tag,$attrs,$rah,$ra); my ($ftit,$fdir); my %h = (); foreach $fil (keys %{$hr}) { ($ftit,$fdir) = fileparse($fil); $fdir = $cwd.'/' if ($fdir =~ /^\.(\\|\/)$/); $rhtml = ${$hr}{$fil}; $len = scalar @{$rhtml}; $htxt = ''; for ($i = 0; $i < $len; $i++) { # 0 1 2 3 # push(@html,[$txt,$tag,$attrs,$hr]); $txt = ${$rhtml}[$i][0]; $tag = ${$rhtml}[$i][1]; $attrs = ${$rhtml}[$i][2]; $rah = ${$rhtml}[$i][3]; $htxt .= $txt; $htxt .= '<'.$tag; $htxt .= $attrs; $htxt .= '>'; if (defined ${$rah}{'src'}) { $h{$tag} = [] if (!defined $h{$tag}); $ra = $h{$tag}; push(@{$ra},${$rah}{'src'}); } if (defined ${$rah}{'href'}) { $h{$tag} = [] if (!defined $h{$tag}); $ra = $h{$tag}; push(@{$ra},${$rah}{'href'}); } } $htxt .= "\n" if !($htxt =~ /\n$/); write2file($htxt,$out_html); prt("Written to $out_html file...\n"); my ($key,$val,$itm,$typ,$loc,$ok,$ff,$msg,$cnt); my $min = 65; prt("Link contents of $fil...\n"); foreach $key (keys %h) { $val = $h{$key}; $cnt = scalar @{$val}; prt("$key: Has $cnt items...\n"); foreach $itm (@{$val}) { $typ = get_href_type($itm); $msg = "[$itm]$typ"; $msg .= ' ' while (length($msg) < $min); $ok = 'extern'; if ($typ == 6) { $ok = 'ok1'; if (length($itm) > 1) { if (find_anchor_name(substr($itm,1),$rhtml)) { $ok = 'ok'; } else { $ok = 'NF'; } } } elsif ($typ == 7) { $loc = get_local_href($itm); $ff = $fdir.$loc; if (-f $ff) { $ok = 'ok'; } elsif (-d $ff) { $ok = 'okd'; } else { $ok = 'NF'; } } prt(" $msg $ok\n"); } prt("\n"); } } } ######################################### ### MAIN ### parse_args(@ARGV); prt( "$pgmname: in [$cwd]: Process $in_file...\n" ); my $hash_ref = get_html_file_hash($in_file); show_hash_ref($hash_ref); pgm_exit(0,"Normal exit(0)"); ######################################## sub give_help { prt("$pgmname: version 0.0.1 2010-04-06\n"); prt("Usage: $pgmname [options] in_file_name\n"); prt("Options:\n"); prt(" -h (or -?) = THis help, and exit 0\n"); prt(" -l = Load log file at end.\n"); prt(" -v[num] = Bump, or set verbosity to [num]\n"); prt("Parse input file, and report any problems...\n"); } sub parse_args { my (@av) = @_; my ($arg,$sarg,$ch); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); $ch = substr($sarg,0,1); if ($ch =~ /h/i) { give_help(); pgm_exit(0,"Help exit"); } elsif ($ch =~ /l/i) { $load_log = 1; prt("Set to load log at end\n"); } elsif ($ch =~ /v/i) { $sarg = substr($sarg,1); if (length($sarg)) { if ($sarg =~ /^\d+$/) { $verbosity = $sarg; prt("Set verbosity to [$verbosity]\n"); } else { pgm_exit(1,"Unknown argument [$arg] - verbosity is -v[num]. Try -h for help\n"); } } else { $verbosity++; prt("Bumped verbosity to [$verbosity]\n"); } } else { pgm_exit(1,"Unknown argument [$arg] Try -h for help\n"); } } else { $in_file = $arg; prt("Set input file to [$in_file]\n"); } shift @av; } if (!length($in_file)) { $in_file = $def_infile; $load_log = 1; $verbosity = 9; prt("Set DEFAULT input file to [$in_file], and set load_log=1, and verbosity=$verbosity\n"); } } # eof - template.pl