Generated: Mon Aug 16 14:14:21 2010 from gensiteindex.pl 2010/04/15 17.5 KB.
#!/perl -w # NAME: gensiteindex.pl # AIM: Given a root file, or folder, scan ALL direcotries, build and output # a site index. # 2010/04/13 - geoff mclane - http://geoffair.net/mperl/ use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use Cwd; unshift(@INC, 'C:\GTools\perl'); require 'logfile.pl' or die "Unable to load logfile.pl...\n"; require 'htmllib.pl' or die "Unable to load htmllib.pl...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $perl_dir = 'C:\GTools\perl'; my $outfile = $perl_dir."\\temp.$pgmname.txt"; open_log($outfile); # user variables my $load_log = 1; my $in_file = 'C:\HOMEPAGE\FG\index.html'; my $show_title_list = 0; my $show_title = 0; # display title, during accumulation my $out_html = $perl_dir."\\tempout.htm"; my @fpfolders = qw( aspnet_client _vti_cnf _vti_pvt _private _derived ); my @html_ext = qw( .htm .html .shtml .php ); my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg ); my @css_ext = qw( .css ); my @script_ext = qw( .js .class .cgi ); my @def_indexes = qw( default.html default.htm index.aspx index.htm index.html index.jsp index.php index.shtml home.htm home.html welcome.html welcome.htm ); ### program variables my @warnings = (); my $cwd = cwd(); my $os = $^O; my @g_xclude_dirs = (); my @g_xclude_files = (); my $g_indir = ''; my $g_infile = ''; # none yet my $g_dir_count = 0; my $g_item_count = 0; # global file array # constants my $OF_FFN = 0; # full file name my $OF_EXT = 1; # extension type number like $FE_HTM, $FE_IMG, etc my $OF_IND = 2; # if it contforms to an 'index' type file my $OF_LEV = 3; # directory depth 0=root, 1, 2, 3... my $OF_CTA = 4; # content array (ref), if HTML type my $OF_TIT = 5; # title text (if any) my $OF_LNK = 6; # array reference of links #sub add_2_g_file_array($$$$) { # my ($ff,$idi,$ext,$lev) = @_; # 0 1 2 3 4 # $OF_FFN 0 = full file name # $OF_EXT 1 = extent type number $FE_HTM, etc # $OF_IND 2 = is an 'index' type file # $OF_LEV 3 = depth of directory # $OF_CTA 4 = contents array, if $FE_HTM # $OF_TIT 5 = title (or sfn, if none, of HTM files # $OF_LNK 6 = links for HTM files #push(@g_file_array,[$ff,$ext,$idi,$lev,[],'',[],0]); my @g_file_array = (); ### constants my $FE_OTH = 0; my $FE_HTM = 1; my $FE_IMG = 2; my $FE_ZIP = 3; my $FE_CSS = 4; my $FE_TXT = 5; my $FE_SCR = 6; ### debug my $dbg01 = 0; # show each directory parse ### forward refs sub process_sub_dir($$); sub show_warnings($) { my ($val) = shift; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } elsif ($val) { prt( "\nNo warnings issued.\n\n" ); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub os_is_win() { return (($^O eq 'MSWin32') ? 1 : 0); } sub dos_2_unix($) { my ($du) = shift; $du =~ s/\\/\//g; return $du; } sub unix_2_dos($) { my ($du) = shift; $du =~ s/\//\\/g; return $du; } sub is_fp_folder($) { my ($inf) = shift; $inf = lc($inf) if (os_is_win()); foreach my $fil (@fpfolders) { $fil = lc($fil) if (os_is_win()); return 1 if ($inf eq $fil); } return 0; } sub is_xclude_folder($) { my ($inf) = shift; $inf = lc($inf) if (os_is_win()); foreach my $fil (@g_xclude_dirs) { $fil = lc($fil) if (os_is_win()); return 1 if ($inf eq $fil); } return 0; } sub is_excluded_dir($) { my ($dir) = shift; return 1 if (is_fp_folder($dir)); return 1 if (is_xclude_folder($dir)); return 0; } sub is_user_excluded($) { my ($file) = shift; $file = lc($file) if (os_is_win()); foreach my $fil (@g_xclude_files) { $fil = lc($fil) if (os_is_win()); return 1 if ($file eq $fil); } return 0; } ######################################################### # Passed an array REF of extensions, # check if this is one of them? ######################################################### sub is_this_extent($$) { my ($ext, $rex) = @_; my $lcx = lc($ext); foreach my $x (@{$rex}) { return 1 if ($lcx eq lc($x)); } return 0; } ############################################ # only looking for HTM, HTML, PHP, # could be extended to others maybe ... ############################################ # test an EXTENSION, or form '.htm'... sub is_htm_ext($) { my ($ext) = shift; return( is_this_extent($ext,\@html_ext) ); } sub is_graf_ext($) { my ($ext) = shift; return( is_this_extent($ext,\@graf_ext) ); } sub is_zip_ext($) { my ($ext) = shift; my @arr = qw( .zip .gz ); return( is_this_extent($ext,\@arr) ); } sub is_css_ext($) { my ($ext) = shift; return( is_this_extent($ext, \@css_ext) ); } sub is_txt_ext($) { my ($ext) = shift; my @arr = qw( .txt ); return( is_this_extent($ext, \@arr) ); } sub is_script_ext($) { my ($fil) = shift; return( is_this_extent($fil, \@script_ext) ); } # test a FILE/PATH extension sub is_htm_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_htm_ext($e) ); } sub is_graphic_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_graf_ext($e) ); } sub is_zip_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_zip_ext($e) ); } sub is_css_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_css_ext($e) ); } sub is_txt_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_txt_ext($e) ); } sub is_script_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_script_ext($e) ); } sub get_file_ext_type($) { my ($fil) = shift; return $FE_HTM if (is_htm_file_ext($fil)); return $FE_IMG if (is_graphic_file_ext($fil)); return $FE_ZIP if (is_zip_file_ext($fil)); return $FE_CSS if (is_css_file_ext($fil)); return $FE_TXT if (is_txt_file_ext($fil)); return $FE_SCR if (is_script_file_ext($fil)); return $FE_OTH; } sub type_2_stg($) { my ($t) = shift; return "HTML" if ($t == $FE_HTM); return "IMG" if ($t == $FE_IMG); return "ZIP" if ($t == $FE_ZIP); return "CSS" if ($t == $FE_CSS); return "TXT" if ($t == $FE_TXT); return "SCRIPT" if ($t == $FE_SCR); return "Other"; } sub add_2_g_file_array($$$$) { my ($ff,$idi,$ext,$lev) = @_; # 0 1 2 3 4 # 0 = full file name # 1 = extent type number $FE_HTM, etc # 2 = is an 'index' type file # 3 = depth of directory # 4 = contents array, if $FE_HTM # 5 = title (or sfn, if not found), of HTM files # 6 = links - for HTML files push(@g_file_array, [$ff,$ext,$idi,$lev,[],,'',[],0]); } sub get_g_file_array_counts() { my $len = scalar @g_file_array; my %hash = (); my ($typ,$i); for ($i = 0; $i < $len; $i++) { $typ = $g_file_array[$i][1]; $hash{$typ}++; } my $res = "Total=$len"; $len = 0; foreach $typ (keys %hash) { $i = $hash{$typ}; $res .= " ".type_2_stg($typ)."=$i"; $len += $i; } $res .= " ($len)"; return $res; } # 2010.03.16 - seek an 'index.htm' type file sub is_def_index_file($) { my ($fil) = @_; $fil = lc($fil) if (os_is_win()); foreach my $f (@def_indexes) { $f = lc($f) if (os_is_win()); return 1 if ($f eq $fil); } return 0; } sub process_sub_dir($$) { my ($indir,$lev) = @_; $indir = dos_2_unix($indir); # use ALL unix form of path pgm_exit(1,"ERROR: Unable to open directory [$indir]!\n") if ( !opendir( DIR, $indir ) ); my @files = readdir(DIR); closedir DIR; my ($fcnt,$file,$ff,@dirs,$idi,$ext); $fcnt = scalar @files; prt("Processing $fcnt files, from [$indir]...\n") if ($dbg01); $indir .= '/' if !($indir =~ /(\\|\/)$/); @dirs = (); foreach $file (@files) { next if (($file eq '.')||($file eq '..')); $g_item_count++; $ff = $indir.$file; if (-f $ff) { $idi = is_def_index_file($file); $ext = get_file_ext_type($file); add_2_g_file_array($ff,$idi,$ext,$lev) if (!is_user_excluded($file)); } elsif (-d $ff) { push(@dirs,$ff) if (!is_excluded_dir($file)); $g_dir_count++; } else { pgm_exit(1,"ERROR: WHAT IS THIS? [$ff]!!\n"); } } foreach $ff (@dirs) { process_sub_dir($ff,($lev + 1)); } } sub get_all_files($) { my ($inf) = @_; $inf = dos_2_unix($inf); # use ALL unix form of path my ($infile,$indir); if (-f $inf) { ($infile,$indir) = fileparse($inf); #prt("Got file [$infile], in directory [$indir]...\n"); $indir = $cwd if ($indir =~ /^\.(\\|\/)$/); } elsif (-d $inf) { $indir = $inf; $infile = ''; # none yet } $indir .= '/' if !($indir =~ /(\\|\/)$/); pgm_exit(1,"ERROR: Unable to open directory [$indir]!\n") if ( !opendir( DIR, $indir ) ); my @files = readdir(DIR); closedir DIR; my ($fcnt,$file,$ff,@dirs,$idi,$ext); $fcnt = scalar @files; prt("Processing $fcnt files, from [$indir]...\n") if ($dbg01); @dirs = (); my $lev = 0; foreach $file (@files) { next if (($file eq '.')||($file eq '..')); $g_item_count++; $ff = $indir.$file; if (-f $ff) { $idi = is_def_index_file($file); $ext = get_file_ext_type($file); add_2_g_file_array($ff,$idi,$ext,$lev) if (!is_user_excluded($file)); } elsif (-d $ff) { push(@dirs,$ff) if (!is_excluded_dir($file)); $g_dir_count++; } else { pgm_exit(1,"ERROR: WHAT IS THIS? [$ff]!!\n"); } } # now process the subdirectories, and sub-sub... foreach $ff (@dirs) { process_sub_dir($ff,($lev + 1)); } } sub get_html_title($$) { my ($rca,$sfn) = @_; my $cnt = scalar @{$rca}; my $txt = ''; my ($i,$tag); for ($i = 0; $i < $cnt; $i++) { $tag = ${$rca}[$i][1]; if ($tag eq 'title') { $txt = trim_all(${$rca}[$i+1][0]) if (($i + 1) < $cnt); last; } } $txt = $sfn if (length($txt) == 0); return $txt; } sub get_html_links_array($) { my ($rca) = @_; my $cnt = scalar @{$rca}; # get count of items my ($i,$txt,$tag,$rah); my ($src,$typ); my @arr = (); for ($i = 0; $i < $cnt; $i++) { $txt = ${$rca}[$i][0]; $tag = ${$rca}[$i][1]; # get the attribute HASH REFERENCE $rah = ${$rca}[$i][3]; if (defined ${$rah}{'src'}) { $src = ${$rah}{'src'}; $typ = get_href_type($src); push(@arr, [$src,$typ,$tag,'src']); } if (defined ${$rah}{'href'}) { $src = ${$rah}{'href'}; $typ = get_href_type($src); push(@arr, [$src,$typ,$tag,'href']); } } return \@arr; } #my $OF_FFN = 0; # full file name #my $OF_EXT = 1; # extension type number like $FE_HTM, $FE_IMG, etc #my $OF_IND = 2; # if it contforms to an 'index' type file #my $OF_LEV = 3; # level/depth of sub-directory 0= root #my $OF_CTA = 4; # content array (ref), if HTML type #my $OF_TIT = 5; # title text (if any) #my $OF_LNK = 6; # links (for HTM files) sub process_html_files($) { my ($ra) = shift; # reference to @g_file_array my $max = scalar @{$ra}; my ($i,$typ,$ff,$hr,$cnt,$hcnt,$sff,$rca,$tit,$lev); my ($id,$rlnk,$lcnt); $cnt = 0; $hcnt = 0; for ($i = 0; $i < $max; $i++) { $typ = ${$ra}[$i][$OF_EXT]; $hcnt++ if ($typ == $FE_HTM); } prt("Found $hcnt HTML files, of total $max, to process from dir [$g_indir]...\n"); for ($i = 0; $i < $max; $i++) { $typ = ${$ra}[$i][$OF_EXT]; $lev = ${$ra}[$i][$OF_LEV]; if ($typ == $FE_HTM) { $ff = ${$ra}[$i][$OF_FFN]; $id = ${$ra}[$i][$OF_IND]; # is INDEX type $sff = sub_common_folder_unix($ff,$g_indir); $sff = unix_2_dos($sff) if (os_is_win()); $cnt++; # $hr = get_html_file_hash($ff,0,4); # 4 = show dropped closing elements warnings $hr = get_html_file_hash($ff,0,0); if (defined ${$hr}{$ff}) { $rca = ${$hr}{$ff}; $tit = get_html_title($rca,$sff); $rlnk = get_html_links_array($rca); $lcnt = scalar @{$rlnk}; ${$ra}[$i][$OF_CTA] = $rca; # store the file contents array ${$ra}[$i][$OF_TIT] = $tit; # add a 'title' (or short file name, if no title, or blank ${$ra}[$i][$OF_LNK] = $rlnk; # add links array (ref) } else { pgm_exit(1,"ERROR: What gives??? File name [$ff] NOT defined\n"); } prt("$cnt of $hcnt: Done [$sff]$lev $lcnt links "); prt("title=[$tit] ") if ($show_title); prt("INDEX") if ($id); prt("\n"); } } } sub mycmp_decend_stgi0 { return -1 if (lc(${$a}[0]) lt lc(${$b}[0])); return 1 if (lc(${$a}[0]) gt lc(${$b}[0])); return 0; } sub get_html_head() { my $head = <<EOF; <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"> <head> <meta http-equiv="Content-Type" content="text/html; charset=us-ascii" /> <meta name="Keywords" content= "free,gpl,flightgear,simgear,terragear,flight,simulator,simulation,world,scenery" /> <title>Site Index - FlightGear</title> <style type="text/css"> /*<![CDATA[ page specific style */ /*]]>*/ </style> </head> <body> <h1 align="center">Site Map</h1> EOF return $head; } sub build_site_index($) { my ($ra) = shift; # reference to @g_file_array my $max = scalar @{$ra}; my ($i,$typ,$lev,$ff,$id,$sff,$rca,$tit,$lnk); my ($cnt,$j,$src,$tag,$styp); my $html = '<ul>'; for ($i = 0; $i < $max; $i++) { $typ = ${$ra}[$i][$OF_EXT]; $lev = ${$ra}[$i][$OF_LEV]; $ff = ${$ra}[$i][$OF_FFN]; if ($typ == $FE_HTM) { $id = ${$ra}[$i][$OF_IND]; # is INDEX type $sff = sub_common_folder_unix($ff,$g_indir); $sff = dos_2_unix($sff); $rca = ${$ra}[$i][$OF_CTA]; # get the file contents array $tit = ${$ra}[$i][$OF_TIT]; # get a 'title' (or short file name, if no title, or blank $lnk = ${$ra}[$i][$OF_LNK]; # get links array (ref) $cnt = scalar @{$lnk}; # push(@arr, [$src,$typ,$tag,'href']); $html .= "<li><a href=\"$sff\">$tit</a><br>\n"; # if ($cnt) { # for ($j = 0; $j < $cnt; $j++) { # $src = ${$lnk}[$j][0]; # $styp = ${$lnk}[$j][1]; # $tag = ${$lnk}[$j][2]; # if (($tag ne 'script')&&($tag ne 'link' )) { # $html .= "<a href=\"$src\">$src ($styp-$tag)</a>\n"; # } # } # } $html .= "</li>\n" } } $html .= "</ul>\n"; $html = get_html_head().$html; $html .= "</body>\n"; $html .= "</html>\n"; write2file($html,$out_html); prt("HTML written to $out_html...\n"); system($out_html); } ######################################### ### MAIN ### parse_args(@ARGV); prt( "$pgmname: in [$cwd]: In file [$in_file]...\n" ); if (-f $in_file) { ($g_infile,$g_indir) = fileparse($in_file); #prt("Got file [$infile], in directory [$indir]...\n"); $g_indir = $cwd if ($g_indir =~ /^\.(\\|\/)$/); } elsif (-d $in_file) { $g_indir = $in_file; $g_infile = ''; # none yet } else { pgm_exit(1,"ERROR: In file [$in_file] NOT valid!\n"); } get_all_files($in_file); prt("Processed $g_item_count items, $g_dir_count directories, for -\n"); prt( get_g_file_array_counts()."\n" ); process_html_files(\@g_file_array); # now have has reference of ALL files build_site_index(\@g_file_array); pgm_exit(0,"Normal exit(0)"); ######################################## sub parse_args { my (@av) = @_; while (@av) { $in_file = $av[0]; shift @av; } } # eof - gensiteindex.pl