showanchors.pl to HTML.

index -|- end

Generated: Mon Aug 16 14:14:37 2010 from showanchors.pl 2010/03/27 13 KB.

#!/perl -w
# NAME: showanchors.pl
# AIM: Given a HTML file, extract an show each ANCHOR in the file
# 2010/03/25  - geoff mclane - http://geoffair.net/mperl/
use strict;
use warnings;
use File::Basename;
use Cwd;
unshift(@INC, 'C:\GTools\perl');
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
    my @tmpsp = split(/\\/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $perl_dir = 'C:\GTools\perl';
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);

# user variables
my $load_log = 1;
my $anchor_text_key = '_anchor_text_';
my $add_full_ank = 0; # prt(" - <a $ank>$text</a>") if ($add_full_ank);
my $out_as_html = 1;
my $out_file = $perl_dir."\\tempanch.html";

### program variables
my @warnings = ();
my $cwd = cwd();

# debug
my $sa_dbg01 = 0; # prt("[dbg01] Got anchor [$hrf] ...\n") if ($sa_dbg01);
my $sa_dbg02 = 0; # prt( "[dbg02] Got [$hr2] = [$txt] [$fil]\n" ) if ($sa_dbg02);
my $sa_dbg03 = 0; # prt("[dbg03] Got anchor text [$atxt]\n") if ($sa_dbg03);

my $in_file = 'C:\HOMEPAGE\FG\docs.html';
my @input_files = ();

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg)
    }
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}

sub show_warnings() {
   if (@warnings) {
      prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
      foreach my $itm (@warnings) {
         prt("$itm\n");
      }
      prt("\n");
   } else {
      prt( "\nNo warnings issued.\n\n" );
   }
}

sub get_anchor_hash_ref_lc($$$) {
   my ($fank,$fil,$dbg) = @_;
   my %hash = ();
   my ($ank,$len,$i,$ch,$pc,$hr2,$txt);
   my ($lchr2);
   if ($fank =~ /<a\s+(.+)>$/) {
      $ank = trim_all($1);
      $len = length($ank);
      $ch = '';
      $hr2 = '';
      for ($i = 0; $i < $len; $i++) {
         $pc = $ch;
         $ch = substr($ank,$i,1);
         if ($ch =~ /\w/) {
            $hr2 .= $ch;   # accumulate \w chars - alphanumeric, including _
         } elsif (length($hr2)) {
            if (($ch ne '=') && ($ch =~ /\s/)) {
               $i++;
               for (; $i < $len; $i++) {
                  $ch = substr($ank,$i,1);
                  last if ($ch eq '=');
                  last if !($ch =~ /\s/);
               }
            }
            if ($ch eq '=') {
               # found our equal sign
               $i++; # move on...
               for (; $i < $len; $i++) {
                  $ch = substr($ank,$i,1);
                  last if ($ch =~ /('|")/);
                  last if !($ch =~ /\s/);
               }
               if (($ch eq '"')||($ch eq "'")) {
                  $pc = $ch;
                  $i++; # move on...
                  $txt = '';
                  for (; $i < $len; $i++) {
                     $ch = substr($ank,$i,1);
                     last if ($ch eq $pc);
                     $txt .= $ch;
                  }
                  if ($ch eq $pc) {
                     $lchr2 = lc($hr2);
                     $hash{$lchr2} = $txt;
                     prt( "[dbg02] Got [$hr2] = [$txt] [$fil]\n" ) if ($sa_dbg02);
                  } else {
                     prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO END INVERTED COMMA! ($pc) [$fil]\n");
                     pgm_exit(1,"") if ($dbg);
                  }
               } else {
                  if (($ch =~ /\w/) && (($hr2 =~ /name/i)||($hr2 =~ /id/i))) {
                     # accept these WITHOUT inverted comma
                     $txt = $ch;
                     $i++; # MOVING ON
                     for (; $i < $len; $i++) {
                        $ch = substr($ank,$i,1);
                        last if !($ch =~ /\w/);
                        $txt .= $ch;
                     }
                     $lchr2 = lc($hr2);
                     $hash{$lchr2} = $txt;
                     prt( "[dbg02] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($sa_dbg02);
                  } else {
                     prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO START INVERTED COMMA! [$fil]\n");
                     pgm_exit(1,"") if ($dbg);
                  }
               }
            } else {
               prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO EQUAL SIGN! [$fil]\n");
               pgm_exit(1,"") if ($dbg);
            }
            $hr2 = '';
         }
      }
   }
   return \%hash;
}

sub get_anchor_list {
   my ($txt,$fil) = @_;
   my $atxt = '';
   my $len = length($txt);
    my $ppc = '';
    my $pc = '';
   my $ch = '';
   my $hrf = '';
   my $i;
    my @list;
    my ($hr,$inanchor,$lnn,$blnn);
    $inanchor = 0;
    $lnn = 1;
   for ($i = 0; $i < $len; $i++) {
        $ppc = $pc;
        $pc = $ch;
      $ch = substr($txt,$i,1);
        $lnn++ if ($ch eq "\n");
      if ($ch eq '<') {
         $hrf = $ch; # start with first '<'
            $blnn = $lnn;
         $i++;   # bump to next
         for ( ; $i < $len; $i++) {
                $ppc = $pc;
                $pc = $ch;
            $ch = substr($txt,$i,1);
                $lnn++ if ($ch eq "\n");
            $hrf .= $ch;
            # 25/07/2007 watch OUT for COMMENTS - skip these
            if ($ch eq '-') {
               if ($hrf eq '<!--') {
                  # we have START of a COMMENT - YUK!!!
                  $i++;   # move to NEXT
                  for ( ; $i < $len; $i++) {
                            $ppc = $pc;
                            $pc = $ch;
                     $ch = substr($txt,$i,1);
                     $hrf .= $ch;
                     if ($ch eq '>') {
                        if ($hrf =~ /-->$/) {
                                    $hrf = '';
                           last;
                        }
                     }
                  }
               }
            }
            if ($ch eq '>') {
               last;
            }
         }

         if ($hrf =~ /^<a\s/i) {
                $hrf = trim_all($hrf);
                $hrf =~ s/=\s+"/="/g;
            prt("[dbg01] Got anchor [$hrf] ...\n") if ($sa_dbg01);
                $hr = get_anchor_hash_ref_lc($hrf,$fil,0);
                #push(@list,$hr);
                push(@list,[$hr,$blnn]);
                $inanchor = 1;
                $atxt = '';
                $ch = '';
            } elsif ($hrf =~ /^<\/a>$/i) {
                if ($inanchor) {
                    $inanchor = 0;
                    $atxt = trim_all($atxt);
                    ${$hr}{$anchor_text_key} = $atxt;
                prt("[dbg03] Got anchor text [$atxt]\n") if ($sa_dbg03);
                    $ch = '';
                    $atxt = '';
                } else {
                    prtw("WARNING: anchor close, without open!\n");
                }
         } elsif ($inanchor) {
                $atxt .= $hrf;
                $ch = '';
            }
      }
      $atxt .= $ch if ($inanchor);
   }
    $len = scalar @list;
   prt( "Done $len anchor collection ...\n" );
    return \@list;
}

sub process_file($) {
    my ($fil) = shift;
    my %hash = ();
    if (open INF, "<$fil") {
        my @lines = <INF>;
        close INF;
        my $lncnt = scalar @lines;
        prt("Processing $lncnt lines from $fil...\n");
      my $ft = join( '', @lines );
        my $title = return_tag( $ft, 'title' );
      $title =~ s/\n/ /gm;
      $title = trim_all($title);
      my $ntxt = remove_script( $ft );
        $ntxt = dropcomments($ntxt);
        # write2file($ntxt,"tempnew.txt");
        $ntxt = trimblanklines($ntxt);
        my $ra = get_anchor_list($ntxt,$fil);
        $hash{$fil} = $ra# store the array ref in the hash
    } else {
        prt("ERROR: Failed to open file [$fil]!\n");
    }
    return \%hash;
}

sub get_href_type($) {
   my ($src) = shift;
   if ($src =~ /^http:/i) {
      #push(@httprefs, [$src, $fil, $lnnos] );
      return 1; # remote HREF
   } elsif ($src =~ /^https:/i) {
      return 1; # remote HREF
      #push(@httpsrefs, [$src, $fil, $lnnos] );
   } elsif ($src =~ /^ftp:/i) {
      #push(@ftprefs, [$src, $fil, $lnnos] );
      return 3; # remote HREF
   } elsif ($src =~ /^mailto:/i) {
      #push(@mtrefs, [$src, $fil, $lnnos] );
      return 4; # remote HREF
   } elsif ( $src =~ /^javascript:/i ) {
      return 5; # a JAVASCRIPT HREF
   } elsif ($src =~ /^file:/i) {
      return 5; # remote HREF
   } elsif ( substr($src,0,1) eq '#') {
      # local in page HREF
      return 6;
   } else {
      my $ind = index($src,'#');
      if ( $ind != -1 ) {
         $src = substr($src,0,$ind);
      }
      $ind = index($src,'?');
      if ( $ind != -1 ) {
         $src = substr($src,0,$ind);
      }
      $src =~ s/\/$//;
      if (length($src)) {
         return 7;
      }
   }
   return 0;
}

sub get_href_type_name($) {
   my ($src) = shift;
    my $typ = get_href_type($src);
   if ($typ == 1) { # ($src =~ /^http:/i)
      return "1: remote HREF (http)";
   } elsif ($typ == 2) { # ($src =~ /^https:/i)
      return "2: remote HREF (https)";
   } elsif ($typ == 3) { # ($src =~ /^ftp:/i)
      return "3: remote HREF (ftp)";
   } elsif ($typ == 4) { # ($src =~ /^mailto:/i) {
      return "4: remote HREF (mailto)";
   } elsif ($typ == 5) { #
        if ($src =~ /^javascript:/i ) {
            return "5: a JAVASCRIPT HREF";
       } elsif ($src =~ /^file:/i) {
          return "5: a FILE HREF";
        }
        return "5: a ???? HREF CHECKME";
   } elsif ($typ == 6) { # ( substr($src,0,1) eq '#')
      return "6: infile link";   #  (".substr($src,1).")";
   } elsif ($typ == 7) {
      return "7: local link";
   }
   return "0: UNCASED [$src] CHECKME!";
}

sub show_ref_hash($) {
    my ($rh) = @_;
    my ($key,$ra,$lst,$key2,$val,$text,$ank,$href,$hnum,$ff,$ok);
    my ($cnt,$i,$lra,$lnn);
    my @list = ();
    foreach $key (keys %{$rh}) {
        my ($nm,$dir) = fileparse($key);
        $ra = ${$rh}{$key}; # extract anchor list (ARRAY REF)
        $cnt = scalar @{$ra};
        prt("FILE: $key - $cnt anchors\n");
        for ($i = 0; $i < $cnt; $i++) {
            $lra = ${$ra}[$i];
            $lst = ${$lra}[0];  # ref hash for anchor
            $lnn = ${$lra}[1];
            $ank = '';
            $text = '';
            $href = '';
            $hnum = -1;
            $ok = ' ';
            foreach $key2 (keys %{$lst}) {
                $val = ${$lst}{$key2};
                if ($key2 eq $anchor_text_key) {
                    $text = $val;
                } else {
                    $ank .= ' ' if (length($ank));
                    $ank .= $key2.'="'.$val.'"';
                    if ($key2 eq 'href') {
                        $href = $val;
                        $hnum = get_href_type($val);
                        if ($hnum == 7) {
                            $ff = $dir.$val;
                            if (-f $ff) {
                                $ok = "ok";
                            } elsif (-d $ff) {
                                $ok = "okd";
                            } else {
                                $ok = "NF";
                            }
                        }
                    }
                }
                #prt( "$hr = $val\n");
            }
            prt("$lnn: HREF=[$href] ($hnum) $ok");
            prt(" - <a $ank>$text</a>") if ($add_full_ank);
            prt("\n");
            push(@list, [$hnum,$href,$ok,$ank,$text,$lnn,$key,$lst]);
        }
    }
    return \@list;
}


#########################################
### MAIN ###
parse_args(@ARGV);
foreach $in_file (@input_files) {
    my $ref_hash = process_file($in_file);
    my $ref_arr  = show_ref_hash($ref_hash);
}
pgm_exit(0,"Normal exit(0)");
########################################

sub show_help {
    prt("$pgmname: Version 0.0.1 Mar 25, 2010\n");
    prt("Usage: $pgmname [options] file [file ...]\n");
    prt("Options:\n");
    prt("  --help (-h or -?) = Show this help and exit 0\n");
}

# Ensure argument exists, or die.
sub require_arg {
    my ($arg, @arglist) = @_;
    pgm_exit(1,"ERROR: no argument given for option '$arg' ...\n" )
        if ! @arglist;
}

##########################################################
# Parse USER input
# Largerly still to be done
##########################################################
sub parse_args {
   my (@av) = @_;
    my ($arg,$sarg,$ch);
   while (@av) {
      $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            $ch = substr($sarg,0,1);
         if (($ch eq '-h')||($ch eq '-?')) {
                prt("Showing help...\n");
            show_help();
                pgm_exit(0,"Help exit 0");
            } else {
                pgm_exit(1,"ERROR: Unknown options [$arg]");
            }
        } else {
            push(@input_files,$arg);
            prt("Added [$arg] file to input list...\n");
        }
    }
    if (!@input_files) {
        if (-f $in_file) {
            push(@input_files,$in_file);
            prt("Added [$in_file] default file to input list...\n");
        } else {
            show_help();
            pgm_exit(1,"ERROR: No input file given!\n");
        }
    }
}

# eof - template.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional