Generated: Tue Feb 2 17:54:24 2010 from chklinks01.pl 2007/06/18 38.5 KB.
#!/perl -w # NAME: chklinks01.pl # AIM: Given a input FOLDER, check all the HTML found for a <a href="...." # reference and make sure that reference EXISTS, # either as a LOCAL file, # or that an IP address can be obtained for the HOST if http://<something> ... # AND check ALL image links <img src="..."...>, if it is a LOCAL file, # and other 'link' items, like .zip, .txt, etc. # 02/06/2007 - geoff mclane - geoffair.com/mperl/index.htm use strict; use warnings; use File::Basename; use Socket; unshift(@INC, 'C:/GTools/perl'); require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n"; # for htmltools, if functions used my @imgs = (); my @hrefs = (); # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = "temp.$pgmname.txt"; open_log($outfile); prt( "$pgmname ... Hello, World ...\n" ); # SET A DEFAULT INPUT FOLDER / FILE ###my $in_folder = "C:\\HOMEPAGE\\simple\\index.htm"; my $in_folder = "C:\\HOMEPAGE\\GA\\index.html"; ###my $in_folder = "C:\\HOMEPAGE\\GA\\flags\\index.htm"; ###my $in_folder = "C:\\HOMEPAGE\\GeoffAir\\welcome.html"; # some FEATURES and USER variables my $recurse = 1; # recursive my $ignfpd = 1; # ignore FRONTPAGE folders my $chkip = 1; # check the IP address my $showhreflinks = 0; # show a WARNING when an IMG, ICO, etc is a REMOTE link my $writeips = 1; # write IP found to a file my $ipfile = "iplinks.txt"; my @ipsfound = (); my @html_ext = qw( .htm .html .shtml .php ); my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg ); my @css_ext = qw( .css ); my @script_ext = qw( .js .class .cgi ); my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived ); my @excused = ( '?dir=test', '?dir=.' ); # program variables # my @excludes = qw( cvineng2.htm ); my @excludes = qw( desktop.ini ); # NOTE: Each of these is a multidimensional array - see offset below my @htm_files = (); # store files found in folder my @img_files = (); my @css_files = (); my @zip_files = (); my @txt_files = (); my @script_files = (); my @other_files = (); # offsets in above arrays my $of_ff = 0; # full file name my $of_hr = 1; # array ref of href links my $of_im = 2; # array ref of image links my $of_lk = 3; # linked count my $of_sp = 4; # spare my @donesrcs = (); my @doneimgs = (); my %ext_hash = (); my $cnt = 0; my $file = ''; my @warnings = (); # list of errors, warnings during running my @httprefs = (); # set of HREF src values my @httpsrefs = (); my @ftprefs = (); my @mtrefs = (); my $hcnt = 0; my $href = ''; my %hrefs = (); my $val = ''; my $msg = ''; my @scripts = (); my $scnt = 0; my $imgcnt = 0; my $procnt = 0; my $homefile = ''; my $total_hrefs = 0; my $total_imgs = 0; my @missed = (); my $excusecnt = 0; my $hrflnkcnt = 0; # $showhreflinks # debug only bits my $dbg1 = 0; # show entering folder ... my $dbg2 = 0; # show ALL HREF entries ... my $dbg3 = 0; # show IP found ... my $dbg4 = 0; # show entered/exit script my $dbg5 = 0; # show 'ok' when found my $dbg6 = 0; # show processing lines my $dbg7 = 0; # show anchor count my $dbg8 = 0; # show unique anchor href my $dbg9 = 0; # show files with SCRIPTS my $dbg10 = 0; # show diag for get_img_srcs() ... my $dbg11 = 0; # in image processing show entered/exits script my $dbg12 = 0; # in image processing show processing count my $dbg13 = 0; # in image processing show ok - found file my $dbg14 = 0; # in image processing show image count found my $dbg15 = 0; # in image processing show image count when NONE found my $dbg16 = 0; # show WARNINGS during run ... my $dbg17 = 0; # show MISSING or BLANK HREF in PHP file my $dbg18 = 0; # check_linkages: show 'ok', in 2nd link check my $dbg19 = 0; # check_local_links: show progress ... my $dbg20 = 0; # check_local_links: show ALL links - NONE IS ALWAYS SHOWN ... my $dbg21 = 0; # check_local_links: show LINK when found ... my $dbg22 = 0; # mark_image_link: show comparing, and comparision ... my $dbg23 = 0; # mark_image_link: show count of new images marked ... my $dbg24 = 0; # show each image file being marked my $dbg25 = 0; # show NO LINK FOUND my $dbg26 = 0; # show EACH HTML FILE BEING PROCESSED my $dbg27 = 0; # show EACH extesnions, and counts parse_args(@ARGV); if (length($in_folder) == 0) { mydie( "No input folder (or file) given/found in command ...\n" ); } if (-f $in_folder) { ($homefile, $in_folder) = fileparse($in_folder); $in_folder =~ s/[\\\/]$//; } prt( "Checking $in_folder ...\n" ); process_folder( $in_folder ); show_found_counts(); process_file_array(); process_host_array(); ############################ prt( "\n###### SHOW RESULTS ########\n" ); prt( "WARNING: $hrflnkcnt images by HREF not shown! (change \$showhreflinks)\n" ) if ($hrflnkcnt); $scnt = scalar @scripts; if ($scnt && $dbg9) { prt( "Got $scnt files containing SCRIPTS ...\n" ); # push(@scripts, [$fil, $lns]); for (my $i = 0; $i < $scnt; $i++) { $file = $scripts[$i][0]; $val = $scripts[$i][1]; prt( "$file $val\n" ); } } if (length($homefile)) { check_linkages( $homefile ); check_local_links( $homefile ); show_link_counts("HTML File", \@htm_files); show_link_counts("IMG Files", \@img_files); show_link_counts("CSS Files", \@css_files); show_link_counts("ZIP Files", \@zip_files); show_link_counts("TXT Files", \@txt_files); show_link_counts("Script Files", \@script_files); show_link_counts("Other Files", \@other_files); } $cnt = scalar @warnings; if ($cnt) { prt( "\nWARNINGS FOLLOW ($cnt):\n" ); foreach my $w (@warnings) { prt( "$w\n" ); } } else { prt( "No warnings ...\n" ); } if (@missed) { prt( "\nMISSING FOLLOW: ".scalar @missed."\n" ); foreach $file (@missed) { prt( "$file\n"); } } prt( "###### END RESULTS ########\n" ); close_log($outfile,1); exit(0); ################################## sub process_file_array { my $max =scalar @htm_files; for (my $i = 0; $i < $max; $i++) { $file = $htm_files[$i][$of_ff]; my ($nm,$dir,$ext) = fileparse( $file, qr/\.[^.]*/ ); my $htot = 0; my $itot = 0; $procnt++; if (open INF, "<$file") { my @lines = <INF>; close INF; @lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php'); # THIS IS USING htmltool.pl - get a single line of TEXT ... my $txt = join( '', @lines ); # get whole text my @is = ret_imgs_array( $txt ); my $ntxt = remove_script( $txt ); $ntxt = trimblanklines($ntxt); @hrefs = (); # clear my @hr = ret_hrefs_array( $ntxt ); ### collecthrefs( $txt, 0 ); ### collectimgs( $txt, 0 ); # bump the counts of HREF and IMGS collected $itot += scalar @is; $htot += scalar @hr; # store the references ... that is a reference to an array $htm_files[$i][$of_hr] = \@hr; $htm_files[$i][$of_im] = \@is; # new code to do some similar things, but while in the array ###@lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php'); @lines = dropcomments_from_array(@lines); my @srcs = get_img_srcs($file, @lines); $imgcnt += check_images( $file, @srcs ); @srcs = get_href_srcs($file, @lines); check_hrefs( $file, @srcs ); } $total_hrefs += $htot; $total_imgs += $itot; if ((($procnt % 100) == 0)||($max < 10)) { ###local $| = 1; ###prt( "\rDone $procnt HTML files ..." ); if ($max < 10) { prt( "Done $file HTML file ... href,other($htot,$itot)\n" ); } else { prt( "Done $procnt HTML files ... href,other ($total_hrefs,$total_imgs)\n" ); } } } prt( "Completed $procnt HTML files ... Found $total_hrefs HREF, and $total_imgs IMG/OTHER tokens.\n" ); } sub get_href_type { my ($src) = shift; if ($src =~ /^http:/i) { #push(@httprefs, [$src, $fil, $lnnos] ); return 1; # remote HREF } elsif ($src =~ /^https:/i) { return 2; # remote HREF #push(@httpsrefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^ftp:/i) { #push(@ftprefs, [$src, $fil, $lnnos] ); return 3; # remote HREF } elsif ($src =~ /^mailto:/i) { #push(@mtrefs, [$src, $fil, $lnnos] ); return 4; # remote HREF } elsif ( $src =~ /^javascript:/i ) { return 5; # a JAVASCRIPT HREF } elsif ( substr($src,0,1) eq '#') { # local in page HREF return 6; } else { my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $src =~ s/\/$//; if (length($src)) { return 7; } } return 0; } sub get_local_href { my ($src) = shift; my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $src =~ s/\/$//; return $src; } sub dos_2_unix($) { my ($du) = shift; $du =~ s/\\/\//g; return $du; } ### my @donesrcs = (); sub in_done_srcs { my ($f) = shift; foreach my $fd (@donesrcs) { if ($fd eq $f) { return 1; } } return 0; } sub in_done_imgs { my ($f) = shift; foreach my $fd (@doneimgs) { if ($fd eq $f) { return 1; } } return 0; } sub fix_rel_unix_path { my ($path) = shift; $path = dos_2_unix($path); my @a = split(/\//, $path); my $npath = ''; my $max = scalar @a; my @na = (); for (my $i = 0; $i < $max; $i++) { my $p = $a[$i]; if ($p eq '.') { # ignore this } elsif ($p eq '..') { if (@na) { pop @na; # discard previous } else { prt( "WARNING: Got relative .. without previous!!!\n" ); } } else { push(@na,$p); } } foreach my $pt (@na) { $npath .= "/" if length($npath); $npath .= $pt; } return $npath; } sub mark_image_link { my ($fnd, $src, $lev) = @_; my $fcnt = scalar @img_files; my $msrc = lc(dos_2_unix($src)); prt( "Seeking [$msrc] in $fcnt images files ...\n" ) if ($dbg22); for (my $i = 0; $i < $fcnt; $i++) { my $fil = $img_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); prt( "Comparing to $mfil ...\n" ) if ($dbg22); if ($msrc eq $mfil) { $val = $img_files[$i][$of_lk]; $val++; $img_files[$i][$of_lk] = $val; prt( "IMG link to $fil ($i) $val\n" ) if ($dbg22); return 0; } } prt( "$src - NOT FOUND!\n" ) if ($dbg21); return 1; } sub mark_other_links { my ($fnd, $src, $lev) = @_; my $totcnt = 0; my $msrc = lc(dos_2_unix($src)); my $fcnt = scalar @img_files; my $i; $totcnt += $fcnt; if (mark_image_link( $fnd, $src, $lev ) == 0) { return 0; } # maybe ZIP files $fcnt = scalar @zip_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $zip_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $zip_files[$i][$of_lk]; $val++; $zip_files[$i][$of_lk] = $val; prt( "ZIP link to $fil ($i) $val\n" ) if ($dbg21); return 0; } } # maybe TXT files $fcnt = scalar @txt_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $txt_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $txt_files[$i][$of_lk]; $val++; $txt_files[$i][$of_lk] = $val; prt( "TXT link to $fil ($i) $val\n" ) if ($dbg21); return 0; } } # maybe CSS files $fcnt = scalar @css_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $css_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $css_files[$i][$of_lk]; $val++; $css_files[$i][$of_lk] = $val; prt( "CSS link to $fil ($i) $val\n" ) if ($dbg21); return 0; } } # maybe SCRIPT files $fcnt = scalar @script_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $script_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $script_files[$i][$of_lk]; $val++; $script_files[$i][$of_lk] = $val; prt( "SCRIPT link to $fil ($i) $val\n" ) if ($dbg21); return 0; } } # OK, OTHER $fcnt = scalar @other_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $other_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $other_files[$i][$of_lk]; $val++; $other_files[$i][$of_lk] = $val; prt( "OTHER link to $fil ($i) $val\n" ) if ($dbg21); return 0; } } $totcnt += 1 if ($totcnt == 0); return $totcnt; } sub mark_link { my ($fnd, $src, $lev) = @_; my $fcnt = scalar @htm_files; my $msrc = lc(dos_2_unix($src)); my $ff = $htm_files[$fnd][$of_ff]; my $i = 0; my $totcnt = $fcnt; my $fil = ''; my $mfil = ''; my $val = 0; for ($i = 0; $i < $fcnt; $i++) { if ($i != $fnd) { $fil = $htm_files[$i][$of_ff]; $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $htm_files[$i][$of_lk]; $val++; $htm_files[$i][$of_lk] = $val; prt( "$ff ($fnd) linked to $fil ($i) $val\n" ) if ($dbg21); my $hr = $htm_files[$i][$of_hr]; # extract HREF ref.array my $im = $htm_files[$i][$of_im]; # extract IMAGE ref.array my $hrc = scalar @{$hr}; my $imc = scalar @{$im}; # get count of images my $j = 0; my ($itmnam, $itmdir) = fileparse($fil); # get name and path prt( "$lev [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26); for ($j = 0; $j < $hrc; $j++) { my $hrf = ${$hr}[$j]; my $hrt = get_href_type($hrf); if ($hrt == 7) { my $nsrc = fix_rel_unix_path($itmdir.get_local_href($hrf)); if ( !in_done_srcs($nsrc) ) { push(@donesrcs, $nsrc); # put it in DONE list mark_link( $i, $nsrc, $lev + 1 ); # and MARK its links now } } } $val = 0; prt( "$fil - Checking $imc images files ...\n") if ($dbg24); for ($j = 0; $j < $imc; $j++) { # do each, in this linked file my $img = ${$im}[$j]; # get the image string my $isrc = $itmdir.$img; # join it with the path my $nisrc = fix_rel_unix_path($isrc); # fix rel, and force unix path prt( "Marking [$nisrc] - ".($j+1)." of $imc img/other links ..\n" ) if ($dbg19 || $dbg26); if ( !in_done_imgs($nisrc) ) { push(@doneimgs, $nisrc); # put it in DONE list mark_other_links( $j, $nisrc, 0 ); # and MARK the link in @img_files $val++; } else { prt( "Already IN doneimgs ...\n" ) if ($dbg19 || $dbg26); } } prt( "$fil - Marked $val of $imc images files ...\n") if ($val && $dbg24); return 0; } } } # hmmmm, LINK not found in HREF files, maybe IMAGES, zip, etc ... $val = mark_other_links( $fnd, $src, $lev ); if ($val) { $totcnt += $val; prt( "NO LINK FOUND HREF [$src]($msrc) in $totcnt file - $ff ($fnd) - ($lev)!\n" ) if ($dbg25); return 1; } return 0; } sub show_link_counts { my ($m, $hf) = @_; my $fcnt = scalar @{$hf}; my $mcnt = 0; my $mss = "Showing LINKS for $fcnt $m files ...\n"; if ($fcnt) { for (my $i = 0; $i < $fcnt; $i++) { my $fil = ${$hf}[$i][$of_ff]; my $hrt = ${$hf}[$i][$of_lk]; if ($hrt) { if ($dbg20) { prt( $mss ) if (length($mss)); $mss = ''; prt( "$i: $fil has $hrt links\n" ); } } else { prt( $mss ) if (length($mss)); $mss = ''; prt( "$i: $fil has NO LINKS!\n" ); $mcnt++; } } if ($mcnt) { prt( "Done LINKS for $fcnt $m files ... MISSED $mcnt!!!\n" ); } } else { prt( "There are NO $m files ...\n" ) if ($dbg20); } } sub in_excused { my ($tx) = shift; foreach my $t (@excused) { if ($t eq $tx) { return 1; } } return 0; } # using the given HOME PAGE, try to TRACE ALL LINKS sub check_local_links { my ($hf) = shift; my $fcnt = scalar @htm_files; my ($hfnm,$hfdir) = fileparse($hf); my $lchf = lc($hfnm); my $fnd = -1; my ($fil,$nm,$dir,$ext); if ($hfdir eq ".\\") { $hfdir = $in_folder."\\"; } my $itmdir = ''; my $itmnam = ''; my $i = 0; my $i2 = 0; prt( "Checking local links, for $fcnt files. Finding $hf ...\n"); for ($i = 0; $i < $fcnt; $i++) { $fil = $htm_files[$i][$of_ff]; ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); if (lc($nm.$ext) eq $lchf) { # have at least the NAME, but maybe not the FOLDER if (lc($hfdir) eq lc($dir)) { $fnd = $i; prt( "Found $nm$ext at index $fnd ... folder $dir ...\n" ); last; } } ### prt( "[$nm$ext] is NOT [$lchf] ... [$fil] $hf ...\n" ); } if ($fnd == -1) { prt( "WARNING: check_local_links: Unable to find [$hf] ...\n" ); return 1; } # process item 1 ... $procnt = 1; my $hr = $htm_files[$fnd][$of_hr]; my $im = $htm_files[$fnd][$of_im]; my $hrc = scalar @{$hr}; my $imc = scalar @{$im}; my $hrf = ''; my $img = ''; my $hrt = 0; my $src = ''; my $nsrc = ''; my $ff = ''; my $shwerr = 0; my $emsg = ''; $fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file ... $htm_files[$fnd][$of_lk] = 1; ($itmnam, $itmdir) = fileparse($fil); # get name and path prt( "HOME [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26); for ($i = 0; $i < $hrc; $i++) { $hrf = ${$hr}[$i]; $hrt = get_href_type($hrf); $i2 = $i + 1; $shwerr = 0; $emsg = "IN[$fil] $i2 HREF [$hrf]$hrt"; if ($hrt == 0) { if (in_excused($hrf)) { $excusecnt++; } else { $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[1]"; push(@warnings, "WARNING: $emsg!" ); $shwerr = 1; } } elsif ($hrt < 5) { $emsg .= " REMOTE"; } elsif ($hrt == 5) { $emsg .= " JAVASCRIPT"; } elsif ($hrt == 6) { $emsg .= " IN PAGE"; } else { my $lref = get_local_href($hrf); $src = $itmdir.$lref; my $nusrc = fix_rel_unix_path($src); ### prt( "REL PATH [$src] to UNIX PATH [$nusrc]\n" ); push(@donesrcs, $nusrc); # put it in DONE list if ( mark_link( $fnd, $nusrc, 0 ) ) { $emsg .= " SITE REF [$nusrc] ***NO IN-SITE LINK***???"; $msg = "$i2 [$fil] HREF [$hrf]$hrt SITE REF [$nusrc] ***NO IN-SITE LINK***???"; if (-f $src) { $msg .= "\n*** BUT FILE EXISTS [$src] ***"; $emsg .= "\n*** BUT FILE EXISTS [$src] ***"; push(@warnings, "WARNING: Local HREF [$lref] in [$fil] OUTSIDE WEB! but EXISTS!" ); } else { push(@missed, $msg ); $shwerr = 1; } } else { $emsg .= " SITE REF [$src] ok" if ($dbg19); } } prt( "$emsg\n" ) if ($dbg19 || $shwerr); } prt( "HOME - Marking $imc images files ...\n") if ($dbg24); for ($i = 0; $i < $imc; $i++) { $img = ${$im}[$i]; $src = $itmdir.$img; $nsrc = fix_rel_unix_path($src); prt( "HOME $fil - Mark $src ($nsrc) image ...\n" ) if ($dbg24); push(@doneimgs, $nsrc); # put it in DONE list mark_other_links( $i, $nsrc, 0 ); } return 0; } sub check_linkages { my ($hf) = shift; my $fcnt = scalar @htm_files; my ($hfnm,$hfdir) = fileparse($hf); my $lchf = lc($hfnm); my $fnd = -1; my ($fil,$nm,$dir,$ext); if ($hfdir eq ".\\") { $hfdir = $in_folder."\\"; } my $itmdir = ''; my $itmnam = ''; my $i = 0; my $i2 = 0; prt( "Re-checking HREF and IMG/OTHER links, for $fcnt files ...\n"); for ($i = 0; $i < $fcnt; $i++) { $fil = $htm_files[$i][$of_ff]; ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); if (lc($nm.$ext) eq $lchf) { # have at least the NAME, but maybe not the FOLDER if (lc($hfdir) eq lc($dir)) { $fnd = $i; prt( "Found $hf at index $fnd\n" ); last; } } ### prt( "[$nm$ext] is NOT [$lchf] ... [$fil] $hf ...\n" ); } if ($fnd == -1) { prt( "WARNING: Unable to find [$hf] ...\n" ); push(@warnings, "WARNING: Unable to find [$hf] ..."); return 1; } # process item 1 ... $procnt = 1; my $hr = $htm_files[$fnd][$of_hr]; my $im = $htm_files[$fnd][$of_im]; my $hrc = scalar @{$hr}; my $imc = scalar @{$im}; my $hrf = ''; my $img = ''; my $hrt = 0; my $src = ''; my $ff = ''; my $shwerr = 0; my $emsg = ''; $fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file ... ($itmnam, $itmdir) = fileparse($fil); # get name and path prt( "\n" ) if ($dbg18); prt( "$procnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18); for ($i = 0; $i < $hrc; $i++) { $hrf = ${$hr}[$i]; $hrt = get_href_type($hrf); $i2 = $i + 1; $shwerr = 0; $emsg = "HH[$fil] "; $emsg .= "$i2 HREF [$hrf]$hrt"; if ($hrt == 0) { if (in_excused($hrf)) { $excusecnt++; } else { $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[2]"; push(@warnings, "WARNING: $emsg" ); $shwerr = 1; } } elsif ($hrt < 5) { $emsg .= " REMOTE"; } elsif ($hrt == 5) { $emsg .= " JAVASCRIPT"; } elsif ($hrt == 6) { $emsg .= " LOCAL"; } else { $src = $itmdir.get_local_href($hrf); if (-f $src) { $emsg .= " SITE REF [$src] ok"; } else { $emsg .= " SITE REF [$src] ***MISSING***?[1]"; push(@missed, $emsg ); $shwerr = 1; } } prt( "$emsg\n" ) if ($dbg18 || $shwerr); } # From this beginning for (my $j = 0; $j < $fcnt; $j++) { $fil = $htm_files[$j][$of_ff]; ($itmnam, $itmdir) = fileparse($fil); # get name and path if ($j != $fnd) { $procnt++; $hr = $htm_files[$j][$of_hr]; $im = $htm_files[$j][$of_im]; $hrc = scalar @{$hr}; $imc = scalar @{$im}; prt( "\n" ) if ($dbg18); prt( "$procnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18); for ($i = 0; $i < $hrc; $i++) { $hrf = ${$hr}[$i]; $hrt = get_href_type($hrf); $i2 = $i + 1; $shwerr = 0; $emsg = "HF[$fil] "; $emsg .= "$i2 HREF [$hrf]$hrt"; if ($hrt == 0) { if (in_excused($hrf)) { $excusecnt++; } else { $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[3]"; push(@warnings, "WARNING: $emsg"); $shwerr = 1; } } elsif ($hrt < 5) { $emsg .= " REMOTE"; } elsif ($hrt == 5) { $emsg .= " JAVASCRIPT"; } elsif ($hrt == 6) { $emsg .= " LOCAL"; } else { $src = $itmdir.get_local_href($hrf); if (-f $src) { $emsg .= " SITE REF [$src] ok"; } else { $emsg .= " SITE REF [$src] ***MISSING***?[3]"; push(@missed, $emsg ); $shwerr = 1; } } prt( "$emsg\n" ) if ($dbg18 || $shwerr); } for (my $i = 0; $i < $imc; $i++) { $img = ${$im}[$i]; $emsg = "IF[$fil] [$img] "; if ($img =~ /^http:\/\/.*/i) { if ($showhreflinks) { push(@warnings, "WARNING: IMG link is HREF $emsg [1]"); } else { $hrflnkcnt++; } } else { $src = $itmdir.$img; $shwerr = 0; if (-f $src) { $emsg .= " IMG ok"; } else { $emsg .= " IMG ***MISSING***?[5]"; push(@missed, $emsg ); $shwerr = 1; } } prt( "$emsg\n" ) if ($dbg18 || $shwerr); } } } return 0; } sub check_images { my ($ifile, @srcs) = @_; my ($nm, $dir) = fileparse($ifile); my $scnt = scalar @srcs; if ($scnt) { prt( "Found $scnt imgs in $nm ...\n" ) if ($dbg14); for (my $i = 0; $i < $scnt; $i++) { my $src = $srcs[$i][0]; my $lnn = $srcs[$i][1]; if ($src =~ /^http:\/\//i) { # remote HREF } else { my $ff = $dir.$src; if ( -f $ff ) { prt( "$src - ok\n" ) if ($dbg13); } else { my $msg = "WARNING: [$src] $ifile:$lnn NOT FOUND!"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } } } else { prt( "Found NO imgs in [$ifile] ...\n" ) if ($dbg15); } return $scnt; } sub get_img_srcs { my ($fil, @lns) = @_; my $lc = scalar @lns; my $scnt = 0; my ($nm,$dir) = fileparse( $fil ); #my $sdbg12 = $dbg12; #my $sdbg11 = $dbg11; #my $sdbg16 = $dbg16; #my $sdbg10 = $dbg10; #if (lc($nm) eq 'moon.htm') { # $dbg12 = 1; # $dbg11 = 1; # $dbg16 = 1; # $dbg10 = 1; #} prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg12); my @isrc = (); my $ln = ''; my $bal = ''; my $inscript = 0; my $msg = ''; my $bgnln = 0; my $lnnos = ''; for (my $i = 0; $i < $lc; $i++) { $ln = $bal; $bal = ''; $ln .= $lns[$i]; chomp $ln; prt( "$i [$ln] ...\n" ) if ($dbg10); if ($inscript) { if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg11); } next; } if ( $ln =~ /<img\s+(.*)/i ) { my $iln = $1; if ( $ln =~ /<script.*>/i ) { $msg = "WARNING: Also found SCRIPT in IMG line ...[$ln]"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } prt( "Found [$iln] ...\n" ) if ($dbg10); $bgnln = $i; while ( !($iln =~ />/) && ($i < $lc)) { $i++; my $nxln = $lns[$i]; chomp $nxln; prt( "Adding [$nxln] ...\n" ) if ($dbg10); $iln .= ' ' if !($iln =~ /=$/); $iln .= $nxln; } $lnnos = "$bgnln:$i"; my $ind = index($iln, '>'); if ($ind != -1) { $bal = substr($iln, $ind+1); $iln = substr($iln, 0, $ind+1); } $iln = trim_all($iln); #if ($iln =~ /src=\"(.+)\"/i) { if ($iln =~ /src=\s*\"(\S+)\"/i) { prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10); push(@isrc, [$1, $lnnos, $fil]); $scnt++; } elsif ($iln =~ /src=\s*(\S+)/i) { # without QUOTES prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10); push(@isrc, [$1, $lnnos, $fil]); $scnt++; } elsif ($iln =~ /src=\s*\'(\S+)\'/i) { # single QUOTES prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10); push(@isrc, [$1, $lnnos, $fil]); $scnt++; } else { $msg = "WARNING: SRC NOT FOUND in [$iln]$fil:$lnnos..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } elsif ( $ln =~ /<script.*>/i ) { $inscript = 1; prt( "Entered a SCRIPT ...\n" ) if ($dbg11); if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg11); } } } prt( "Returning $scnt img sources ...\n") if ($dbg10); #$dbg12 = $sdbg12; #$dbg11 = $sdbg11; #$dbg16 = $sdbg16; #$dbg10 = $sdbg10; return @isrc; } sub check_hrefs { my ($fil, @srcs) = @_; my ($fnm,$fdir,$fext) = fileparse( $fil, qr/\.[^.]*/ ); my $scnt = scalar @srcs; my $isphp = (lc($fext) eq '.php'); if ($scnt) { prt( "Found $scnt anchor href= in $fnm$fext ...\n" ) if ($dbg7); for (my $i = 0; $i < $scnt; $i++) { my $orgsrc = $srcs[$i][0]; my $lnnos = $srcs[$i][1]; my $src = $orgsrc; if ($src =~ /^http:/i) { # remote HREF push(@httprefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^https:/i) { # remote HREF push(@httpsrefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^ftp:/i) { # remote HREF push(@ftprefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^mailto:/i) { # remote HREF push(@mtrefs, [$src, $fil, $lnnos] ); } elsif ( $src =~ /^#/ ) { # local in page HREF } elsif ( $src =~ /^javascript:/i ) { # a JAVASCRIPT HREF } else { my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $src =~ s/\/$//; if (length($src)) { my $ff = $fdir.$src; if ( -f $ff ) { prt( "$src - ok\n" ) if ($dbg5); } else { my $msg = "WARNING: href [$src] file NOT FOUND! in [$fil]$lnnos"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } else { if ($isphp) { prt( "Found BLANK HREF in PHP $fil ...\n" ) if ($dbg17); } else { $msg = "WARNING: Found BLANK HREF in $fil ..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } } } } else { if ($isphp) { prt( "Found NO HREFs in PHP $fil ...\n" ) if ($dbg17); } else { prt( "Found NO HREFs in $fil ...\n" ); } } } ############################################################ # Only used is $chkip = 1; # Show IP Address # uses sockets, gethostbyname # Return 0, if can NOT be resolved. # else the number of IP addresses resolved. ############################################################ sub showIPAddress { my ($nm) = shift; my @addr = gethostbyname($nm); my $cnt = 0; if( !@addr ) { prt( "Can't resolve $nm: $!\n" ); return 0; } @addr = map { inet_ntoa($_) } @addr[4 .. $#addr]; foreach my $k (@addr) { $cnt++; prt( "$cnt: $nm resolves to IP [$k]\n" ) if ($dbg3); } return $cnt; } ################################################ # Add to @scripts multidimensional array, # if NOT already in there, when on the line # numbers are added. ############################################### sub add_2_scripts { my ($fil, $lns) = @_; my $sc = scalar @scripts; for (my $i = 0; $i < $sc; $i++) { my $cf = $scripts[$i][0]; if ($cf eq $fil) { my $lc = $scripts[$i][1]; $lc .= ":$lns"; $scripts[$i][1] = $lc; return 0; } } push(@scripts, [$fil, $lns]); return 1; } #################################################### # Get HREF sources # Given an ARRAY of file lines, check for # anchor href="something" ... # Return the "something" in an array #################################################### sub get_href_srcs { my ($fil, @lns) = @_; my $lc = scalar @lns; my $scnt = 0; my $slns = 0; # count the SCRIPT lines my ($nm,$dir) = fileparse( $fil ); prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg6); my @isrc = (); my $ln = ''; my $bal = ''; my $inscript = 0; $slns = 0; my $bgnln = 0; my $endln = 0; for (my $i = 0; $i < $lc; $i++) { $ln = $bal; $bal = ''; $ln .= $lns[$i]; chomp $ln; prt( "$i [$ln] ...\n" ) if ($dbg10); if ($inscript) { if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg4); add_2_scripts( $fil, $slns ); $slns = 0; next; } $slns++; next; } if ( $ln =~ /<a\s+(.*)/i ) { my $iln = $1; prt( "Found [$iln] ...\n" ) if ($dbg10); $bgnln = $i; while ( !($iln =~ />/) && ($i < $lc)) { $i++; my $nxln = $lns[$i]; chomp $nxln; prt( "Adding [$nxln] ...\n" ) if ($dbg10); $iln .= ' ' if !($iln =~ /=$/); $iln .= $nxln; } $endln = $i; my $ind = index($iln, '>'); if ($ind != -1) { $bal = substr($iln, $ind+1); $iln = substr($iln, 0, $ind+1); } #if ($iln =~ /src=\"(.+)\"/i) { if ($iln =~ /href\s*=\s*\"(\S+)\"/i) { prt( "HREF = $1\nIn line [$iln]...\n" ) if ($dbg10); push(@isrc, [$1, "$bgnln:$endln"] ); $scnt++; } else { if (( $iln =~ /name=\s*\"(\S+)\"/i )||( $iln =~ /name=(\S+)/i )) { # ignore BOOKMARKS } else { $msg = "WARNING: HREF NOT FOUND in [$iln]..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } } elsif ( $ln =~ /<script.*>/i ) { $inscript = 1; prt( "Entered a SCRIPT ...\n" ) if ($dbg4); $slns = 0; $ln = substr($ln, 7); if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg4); add_2_scripts( $fil, 1 ); $slns = 0; } } } if ($inscript) { $msg = "WARNING: EXIT WHILE IN SCRIPT in [$fil]..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } prt( "Returning $scnt HREF sources ...\n") if ($dbg10); return @isrc; } ########################################################## # Parse USER input # Largerly still to be done ########################################################## sub parse_args { my (@av) = @_; while (@av) { $in_folder = $av[0]; prt( "Input folder set to [$in_folder]...\n" ); shift @av; } } ######################################################### # Passed an array of extensions, # check if this is one of them? ######################################################### sub is_my_ext { my ($fil, @exts) = @_; my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); foreach my $ex (@exts) { if (lc($ex) eq lc($ext)) { return 1; } } return 0; } ############################################ # only looking for HTM, HTML, PHP, # could be extended to others maybe ... ############################################ sub is_htm_ext { my ($fil) = shift; return( is_my_ext($fil, @html_ext) ); } sub is_graphic_ext { my ($fil) = shift; return( is_my_ext($fil, @graf_ext) ); } sub is_zip_ext { my ($fil) = shift; my @arr = qw( .zip ); return( is_my_ext($fil, @arr) ); } sub is_css_ext { my ($fil) = shift; return( is_my_ext($fil, @css_ext) ); } sub is_txt_ext { my ($fil) = shift; my @arr = qw( .txt ); return( is_my_ext($fil, @arr) ); } sub is_script_ext { my ($fil) = shift; return( is_my_ext($fil, @script_ext) ); } ################################################ # my $ignfpd = 1; # ignore FRONTPAGE folders ################################################ sub is_fp_folder { my ($inf) = shift; foreach my $fil (@fpfolders) { if (lc($inf) eq lc($fil)) { return 1; } } return 0; } #################################### # Check if FILE is in EXCLUDE list #################################### sub in_excludes { my ($fil) = shift; my $lcf = lc($fil); foreach my $f (@excludes) { if (lc($f) eq $lcf) { return 1; } } return 0; } #################################################################### # process_folder(folder) # Main DIRECTORY processing function # # Open the FOLDER given, and collect ALL files found, # iterate into sub-directories, if $recurse is non-zero, # and it is NOT a special FRONTPAGE (hidden) FOLDER. # # Files are collected into multidemensional arrays # if NOT in @exclude list. # if (is_htm_ext($fil)) { # push(@htm_files, [$ff, '', '', 0, 0] ); # } elsif (is_graphic_ext($fil)) { # push(@img_files, [$ff, '', '', 0, 0] ); # } elsif (is_zip_ext($fil)) { # push(@zip_files, [$ff, '', '', 0, 0] ); # } elsif (is_css_ext($fil)) { # push(@css_files, [$ff, '', '', 0, 0] ); # } elsif (is_txt_ext($fil)) { # push(@txt_files, [$ff, '', '', 0, 0] ); # } elsif (is_script_ext($fil)) { # push(@script_files, [$ff, '', '', 0, 0] ); # } else { # push(@other_files, [$ff, '', '', 0, 0] ); # } # #################################################################### sub process_folder { my ($inf) = shift; my $fcnt = 0; prt( "Processing $inf folder ...\n" ) if ($dbg1); if ( opendir( DIR, $inf ) ) { my @files = readdir(DIR); closedir DIR; foreach my $fil (@files) { if (($fil eq ".")||($fil eq "..")) { next; } my $ff = $inf."\\".$fil; if ( -d $ff ) { if ($recurse) { if ($ignfpd && is_fp_folder($fil)) { # ignore FRONTPAGE folders next; } process_folder( $ff ); } } else { # NOTE: multidimensional arrays pushed - offsets into arrays # my $of_ff = 0; # 1 - full file name # my $of_hr = 1; # 2 - array ref of href links # my $of_im = 2; # 3 - array ref of image links # my $of_lk = 3; # 4 - linked count # my $of_sp = 4; # 5 - spare if ( !in_excludes($fil) ) { my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); my $val = 0; $val = $ext_hash{$ext} if ( defined $ext_hash{$ext} ); $val++; $ext_hash{$ext} = $val; if (is_htm_ext($fil)) { push(@htm_files, [$ff, '', '', 0, 0] ); $fcnt++; } elsif (is_graphic_ext($fil)) { push(@img_files, [$ff, '', '', 0, 0] ); } elsif (is_zip_ext($fil)) { push(@zip_files, [$ff, '', '', 0, 0] ); } elsif (is_css_ext($fil)) { push(@css_files, [$ff, '', '', 0, 0] ); } elsif (is_txt_ext($fil)) { push(@txt_files, [$ff, '', '', 0, 0] ); } elsif (is_script_ext($fil)) { push(@script_files, [$ff, '', '', 0, 0] ); } else { push(@other_files, [$ff, '', '', 0, 0] ); } } } } prt( "Processed $inf folder finding $fcnt HTML files ...\n" ) if ($dbg1); } else { prt( "ERROR: Failed to open folder $inf ...\n" ); } } ############################################## # Just to show the COUNTS in the ARRAYS ############################################## sub show_found_counts { my $cnt = scalar @htm_files; prt( "Found $cnt HTML, "); $cnt = scalar @img_files; prt( "$cnt images, " ); $cnt = scalar @css_files; prt( "$cnt css, " ); $cnt = scalar @zip_files; prt( "$cnt zip, " ); $cnt = scalar @txt_files; prt( "$cnt txt, " ); $cnt = scalar @script_files; prt( "$cnt script, " ); $cnt = scalar @other_files; prt( "and $cnt others ...\n" ); $cnt = scalar keys %ext_hash; if ($dbg27) { prt( "$cnt extensions, and each count ...\n" ); foreach my $key (keys %ext_hash) { my $val = $ext_hash{$key}; prt( "$val $key "); } prt("\n"); } } # @ipsfound = <INF>; sub in_ips_found { my ($ip) = shift; my $lcip = lc($ip); foreach my $i (@ipsfound) { chomp $i; if (lc($i) eq $lcip) { return 1; } } return 0; } ####################################################### # Process the HTTP HREF sources # if $chkip = 1; then attempt to resolve the IP # addresses from the host name. ####################################################### sub process_host_array { $hcnt = scalar @httprefs; if ($hcnt) { prt( "Found $hcnt HREF entries ...\n" ); for (my $i = 0; $i < $hcnt; $i++) { $href = $httprefs[$i][0]; $file = $httprefs[$i][1]; my $lnn = $httprefs[$i][2]; my ($nm,$dir) = fileparse($file); if (defined( $hrefs{$href} )) { $val = $hrefs{$href}; $val .= ' '.$file; } else { $val = $file; } $val .= ":$lnn"; $hrefs{$href} = $val; prt( "$href in [$file]$lnn\n" ) if ($dbg2); } $hcnt = scalar keys(%hrefs); prt( "Found $hcnt different entries ...\n" ); if ($chkip) { my $inips = 0; prt( "Checking $hcnt IP addresses ... " ); if ($writeips && ( -f $ipfile)) { if (open INF, "<$ipfile") { @ipsfound = <INF>; close INF; prt( "Have ".scalar @ipsfound." in $ipfile" ); } else { prt( "Warning: Failed to open $ipfile" ); } } prt("\n"); $procnt = 0; foreach my $key (keys %hrefs) { $val = $hrefs{$key}; $procnt++; prt( "$key in $val\n" ) if ($dbg8); if ($key =~ /^http:\/\//i) { my $hkey = substr($key, 7); my @arr = split( /\//, $hkey ); $hkey = $arr[0]; if ( !in_ips_found($hkey) ) { if (showIPAddress( $hkey ) == 0) { $msg = "FAILED: NO IP FOR HOST [$hkey][$val]"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } elsif ($writeips) { push(@ipsfound,"$hkey\n"); } } else { $inips++; } } if (($procnt % 100) == 0) { prt( "Done $procnt IP Addresses ...\n" ); } } prt( "Completed $procnt IP Addresses ... " ); if ($writeips) { $val = join("\n", sort @ipsfound); $val = trimblanklines($val); write2file($val, $ipfile); prt( "$inips in previous. Written ".scalar @ipsfound." to $ipfile" ); } prt("\n"); } } } # eof - chklinks01.pl