Generated: Mon Aug 16 14:14:37 2010 from showanchors.pl 2010/03/27 13 KB.
#!/perl -w # NAME: showanchors.pl # AIM: Given a HTML file, extract an show each ANCHOR in the file # 2010/03/25 - geoff mclane - http://geoffair.net/mperl/ use strict; use warnings; use File::Basename; use Cwd; unshift(@INC, 'C:\GTools\perl'); require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $perl_dir = 'C:\GTools\perl'; my $outfile = $perl_dir."\\temp.$pgmname.txt"; open_log($outfile); # user variables my $load_log = 1; my $anchor_text_key = '_anchor_text_'; my $add_full_ank = 0; # prt(" - <a $ank>$text</a>") if ($add_full_ank); my $out_as_html = 1; my $out_file = $perl_dir."\\tempanch.html"; ### program variables my @warnings = (); my $cwd = cwd(); # debug my $sa_dbg01 = 0; # prt("[dbg01] Got anchor [$hrf] ...\n") if ($sa_dbg01); my $sa_dbg02 = 0; # prt( "[dbg02] Got [$hr2] = [$txt] [$fil]\n" ) if ($sa_dbg02); my $sa_dbg03 = 0; # prt("[dbg03] Got anchor text [$atxt]\n") if ($sa_dbg03); my $in_file = 'C:\HOMEPAGE\FG\docs.html'; my @input_files = (); sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg) } close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub show_warnings() { if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ); } } sub get_anchor_hash_ref_lc($$$) { my ($fank,$fil,$dbg) = @_; my %hash = (); my ($ank,$len,$i,$ch,$pc,$hr2,$txt); my ($lchr2); if ($fank =~ /<a\s+(.+)>$/) { $ank = trim_all($1); $len = length($ank); $ch = ''; $hr2 = ''; for ($i = 0; $i < $len; $i++) { $pc = $ch; $ch = substr($ank,$i,1); if ($ch =~ /\w/) { $hr2 .= $ch; # accumulate \w chars - alphanumeric, including _ } elsif (length($hr2)) { if (($ch ne '=') && ($ch =~ /\s/)) { $i++; for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch eq '='); last if !($ch =~ /\s/); } } if ($ch eq '=') { # found our equal sign $i++; # move on... for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch =~ /('|")/); last if !($ch =~ /\s/); } if (($ch eq '"')||($ch eq "'")) { $pc = $ch; $i++; # move on... $txt = ''; for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch eq $pc); $txt .= $ch; } if ($ch eq $pc) { $lchr2 = lc($hr2); $hash{$lchr2} = $txt; prt( "[dbg02] Got [$hr2] = [$txt] [$fil]\n" ) if ($sa_dbg02); } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO END INVERTED COMMA! ($pc) [$fil]\n"); pgm_exit(1,"") if ($dbg); } } else { if (($ch =~ /\w/) && (($hr2 =~ /name/i)||($hr2 =~ /id/i))) { # accept these WITHOUT inverted comma $txt = $ch; $i++; # MOVING ON for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if !($ch =~ /\w/); $txt .= $ch; } $lchr2 = lc($hr2); $hash{$lchr2} = $txt; prt( "[dbg02] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($sa_dbg02); } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO START INVERTED COMMA! [$fil]\n"); pgm_exit(1,"") if ($dbg); } } } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO EQUAL SIGN! [$fil]\n"); pgm_exit(1,"") if ($dbg); } $hr2 = ''; } } } return \%hash; } sub get_anchor_list { my ($txt,$fil) = @_; my $atxt = ''; my $len = length($txt); my $ppc = ''; my $pc = ''; my $ch = ''; my $hrf = ''; my $i; my @list; my ($hr,$inanchor,$lnn,$blnn); $inanchor = 0; $lnn = 1; for ($i = 0; $i < $len; $i++) { $ppc = $pc; $pc = $ch; $ch = substr($txt,$i,1); $lnn++ if ($ch eq "\n"); if ($ch eq '<') { $hrf = $ch; # start with first '<' $blnn = $lnn; $i++; # bump to next for ( ; $i < $len; $i++) { $ppc = $pc; $pc = $ch; $ch = substr($txt,$i,1); $lnn++ if ($ch eq "\n"); $hrf .= $ch; # 25/07/2007 watch OUT for COMMENTS - skip these if ($ch eq '-') { if ($hrf eq '<!--') { # we have START of a COMMENT - YUK!!! $i++; # move to NEXT for ( ; $i < $len; $i++) { $ppc = $pc; $pc = $ch; $ch = substr($txt,$i,1); $hrf .= $ch; if ($ch eq '>') { if ($hrf =~ /-->$/) { $hrf = ''; last; } } } } } if ($ch eq '>') { last; } } if ($hrf =~ /^<a\s/i) { $hrf = trim_all($hrf); $hrf =~ s/=\s+"/="/g; prt("[dbg01] Got anchor [$hrf] ...\n") if ($sa_dbg01); $hr = get_anchor_hash_ref_lc($hrf,$fil,0); #push(@list,$hr); push(@list,[$hr,$blnn]); $inanchor = 1; $atxt = ''; $ch = ''; } elsif ($hrf =~ /^<\/a>$/i) { if ($inanchor) { $inanchor = 0; $atxt = trim_all($atxt); ${$hr}{$anchor_text_key} = $atxt; prt("[dbg03] Got anchor text [$atxt]\n") if ($sa_dbg03); $ch = ''; $atxt = ''; } else { prtw("WARNING: anchor close, without open!\n"); } } elsif ($inanchor) { $atxt .= $hrf; $ch = ''; } } $atxt .= $ch if ($inanchor); } $len = scalar @list; prt( "Done $len anchor collection ...\n" ); return \@list; } sub process_file($) { my ($fil) = shift; my %hash = (); if (open INF, "<$fil") { my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines from $fil...\n"); my $ft = join( '', @lines ); my $title = return_tag( $ft, 'title' ); $title =~ s/\n/ /gm; $title = trim_all($title); my $ntxt = remove_script( $ft ); $ntxt = dropcomments($ntxt); # write2file($ntxt,"tempnew.txt"); $ntxt = trimblanklines($ntxt); my $ra = get_anchor_list($ntxt,$fil); $hash{$fil} = $ra; # store the array ref in the hash } else { prt("ERROR: Failed to open file [$fil]!\n"); } return \%hash; } sub get_href_type($) { my ($src) = shift; if ($src =~ /^http:/i) { #push(@httprefs, [$src, $fil, $lnnos] ); return 1; # remote HREF } elsif ($src =~ /^https:/i) { return 1; # remote HREF #push(@httpsrefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^ftp:/i) { #push(@ftprefs, [$src, $fil, $lnnos] ); return 3; # remote HREF } elsif ($src =~ /^mailto:/i) { #push(@mtrefs, [$src, $fil, $lnnos] ); return 4; # remote HREF } elsif ( $src =~ /^javascript:/i ) { return 5; # a JAVASCRIPT HREF } elsif ($src =~ /^file:/i) { return 5; # remote HREF } elsif ( substr($src,0,1) eq '#') { # local in page HREF return 6; } else { my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $src =~ s/\/$//; if (length($src)) { return 7; } } return 0; } sub get_href_type_name($) { my ($src) = shift; my $typ = get_href_type($src); if ($typ == 1) { # ($src =~ /^http:/i) return "1: remote HREF (http)"; } elsif ($typ == 2) { # ($src =~ /^https:/i) return "2: remote HREF (https)"; } elsif ($typ == 3) { # ($src =~ /^ftp:/i) return "3: remote HREF (ftp)"; } elsif ($typ == 4) { # ($src =~ /^mailto:/i) { return "4: remote HREF (mailto)"; } elsif ($typ == 5) { # if ($src =~ /^javascript:/i ) { return "5: a JAVASCRIPT HREF"; } elsif ($src =~ /^file:/i) { return "5: a FILE HREF"; } return "5: a ???? HREF CHECKME"; } elsif ($typ == 6) { # ( substr($src,0,1) eq '#') return "6: infile link"; # (".substr($src,1).")"; } elsif ($typ == 7) { return "7: local link"; } return "0: UNCASED [$src] CHECKME!"; } sub show_ref_hash($) { my ($rh) = @_; my ($key,$ra,$lst,$key2,$val,$text,$ank,$href,$hnum,$ff,$ok); my ($cnt,$i,$lra,$lnn); my @list = (); foreach $key (keys %{$rh}) { my ($nm,$dir) = fileparse($key); $ra = ${$rh}{$key}; # extract anchor list (ARRAY REF) $cnt = scalar @{$ra}; prt("FILE: $key - $cnt anchors\n"); for ($i = 0; $i < $cnt; $i++) { $lra = ${$ra}[$i]; $lst = ${$lra}[0]; # ref hash for anchor $lnn = ${$lra}[1]; $ank = ''; $text = ''; $href = ''; $hnum = -1; $ok = ' '; foreach $key2 (keys %{$lst}) { $val = ${$lst}{$key2}; if ($key2 eq $anchor_text_key) { $text = $val; } else { $ank .= ' ' if (length($ank)); $ank .= $key2.'="'.$val.'"'; if ($key2 eq 'href') { $href = $val; $hnum = get_href_type($val); if ($hnum == 7) { $ff = $dir.$val; if (-f $ff) { $ok = "ok"; } elsif (-d $ff) { $ok = "okd"; } else { $ok = "NF"; } } } } #prt( "$hr = $val\n"); } prt("$lnn: HREF=[$href] ($hnum) $ok"); prt(" - <a $ank>$text</a>") if ($add_full_ank); prt("\n"); push(@list, [$hnum,$href,$ok,$ank,$text,$lnn,$key,$lst]); } } return \@list; } ######################################### ### MAIN ### parse_args(@ARGV); foreach $in_file (@input_files) { my $ref_hash = process_file($in_file); my $ref_arr = show_ref_hash($ref_hash); } pgm_exit(0,"Normal exit(0)"); ######################################## sub show_help { prt("$pgmname: Version 0.0.1 Mar 25, 2010\n"); prt("Usage: $pgmname [options] file [file ...]\n"); prt("Options:\n"); prt(" --help (-h or -?) = Show this help and exit 0\n"); } # Ensure argument exists, or die. sub require_arg { my ($arg, @arglist) = @_; pgm_exit(1,"ERROR: no argument given for option '$arg' ...\n" ) if ! @arglist; } ########################################################## # Parse USER input # Largerly still to be done ########################################################## sub parse_args { my (@av) = @_; my ($arg,$sarg,$ch); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); $ch = substr($sarg,0,1); if (($ch eq '-h')||($ch eq '-?')) { prt("Showing help...\n"); show_help(); pgm_exit(0,"Help exit 0"); } else { pgm_exit(1,"ERROR: Unknown options [$arg]"); } } else { push(@input_files,$arg); prt("Added [$arg] file to input list...\n"); } } if (!@input_files) { if (-f $in_file) { push(@input_files,$in_file); prt("Added [$in_file] default file to input list...\n"); } else { show_help(); pgm_exit(1,"ERROR: No input file given!\n"); } } } # eof - template.pl