Generated: Tue Feb 2 17:54:42 2010 from imgalt01.pl 2006/10/27 13.7 KB.
#!/Perl # imgalt01.pl - 2006.10.24 - geoff mclane (geoffmclane.com) # AIM: To extract the <img alt="..." atribute for translation # If $addtr is 1, then a search and load current 'tranlation' # which is added to the table ... # ===================================================================== use strict; require 'logfile.pl' or die "ERROR: Can NOT load logfile.pl ...\n"; require 'htmltools.pl' or die "ERROR: Can NOT load htmltools.pl ...\n"; # log file stuff my ($LF); my $outfile = 'temp.'.$0.'.txt'; open_log($outfile); prt( "$0 ... Hello, World ...\n" ); # user variable my $def_folder = 'C:\HOMEPAGE\P26\travel'; my $def_input = $def_folder . '\tunisia.htm'; my $def_output = 'tempalt2.htm'; my $addtr = 1; # from file my $tr_file = $def_folder . '\tempalt.htm'; my $dosubs = 1; # modify in3 file, changing the alt text, and write out3 my $def_in3 = $def_folder . '\tunisfr2.htm'; my $def_out3 = $def_folder . '\tempalt3.htm'; my @trtable = (); my @tlines = (); my @langarr = (); # debug my $dbg1 = 1; # show length after adjustments my $dbg2 = 0; # show 'other' tags my $dbg3 = 0; # show collections phase my $dbg4 = 0; # show sub collection phase my $dbg5 = 0; # show the text collection my $dbg6 = 0; # show substitution # program variables my $line = ''; my @lines = (); my @frlines = (); my $cnt = 0; my $txt = ''; my $ccnt = 0; my $newtxt = ''; my @attlist = (); my @altlist = (); my $in_file = $def_input; my $out_file = $def_output; my $htm_head = <<"EOF"; <html> <head> <title>Alt List</title> </head> <body> <table border="2"> EOF my $htm_tail = <<"EOF"; </table> </body> </html> EOF $in_file = pop @ARGV if (@ARGV); $out_file = pop @ARGV if (@ARGV); prt( "Got input from [$in_file], output to [$out_file] ...\n" ); if ( ! -f $in_file) { mydie("OOPS: Can NOT locate [$in_file] ...\n"); } if ($addtr) { load_existing_table($tr_file); } open IF, "<$in_file" or mydie("OOPS: Can NOT open [$in_file] ...\n"); @lines = <IF>; # slurp it all in close IF; $cnt = scalar @lines; prt("Processing $cnt lines from [$in_file] ...\n"); $txt = join("\n", @lines); $ccnt = length($txt); prt("Or $ccnt characters from [$in_file] ...\n"); extract_img_alts( $txt ); show_att_list(); out_alt_list( $out_file ); if ($dosubs && @langarr) { open IFF, "<$def_in3" or mydie( "OOPS: Can not open file $def_in3 ... $! ...\n" ); @frlines = <IFF>; close IFF; prt( "Process " . scalar @frlines . " lines from [$def_in3] ...\n" ); $txt = do_substitution(); open OFF, ">$def_out3" or mydie( "YEEK! Unable to create [$def_out3] ... $! ...\n" ); print OFF $txt; close OFF; system( $def_out3 ); } #$ccnt = length($newtxt); #write_out_file($newtxt, $out_file); #system($out_file); close_log($outfile,1); exit(0); # ############################################### # all subs below # ############## sub do_substitution { my $lc = scalar @langarr; my ($i, $img, $eng, $fr, $j, $c, $d, $imtag, $im2); my $frhtm = join('', @frlines); my $tl = length($frhtm); prt( "Attempting $lc substitutions ... in $tl htm chars...\n" ); my $fnd = 0; my $newfr = ''; # accumulate into here for ($i = 0; $i < $lc; $i++) { $img = $langarr[$i][0]; $eng = $langarr[$i][1]; $fr = $langarr[$i][2]; $imtag = ''; $d = ''; $fnd = 0; $newfr = ''; $tl = length($frhtm); prt( "\nText length now $tl characters ...\n" ) if ($dbg6); for ($j = 0; $j < $tl; $j++) { $c = substr($frhtm,$j,1); if ($d eq '<') { if ($c eq "\n") { if (substr($imtag,-1) =~ /\s/) { $c = ''; } else { $c = ' '; } } $imtag .= $c; if ($c eq '>') { $d = $c; if ($imtag =~ /^<img.+/) { $imtag = trimall($imtag); if ($imtag =~ /src=['"](.+?)['"]/i) { $im2 = $1; if ($im2 eq $img) { if ($imtag =~ /alt=['"](.+?)['"]/i) { substr($imtag, index($imtag,$1),length($1),$fr); prt( "Change [$1] to [$fr] ..\n" ) if ($dbg6); prt( "$imtag\n" ) if ($dbg6); $fnd = 1; } $newfr .= $imtag; # add in this block last; } } } $newfr .= $imtag; # add in this block } } elsif ($c eq '<') { $imtag = $c; $d = $c; } else { $newfr .= $c; } } ############################################################## if (!$fnd) { prt( "Did not find [$img] ...\n" ); } else { $j++ if ($j < $tl); $newfr .= substr($frhtm, $j) if ($j < $tl); # use the NEW text $frhtm = $newfr; } } return $frhtm; } sub get_table_block { my ($tn) = shift; # table number my $lc = scalar @tlines; my ($l, $i, $c, $tg, $d, $ln, $ll); my $tbl = ''; my $tc = 0; my $in_tbl = 0; $d = ''; for ($l = 0; $l < $lc; $l++) { $ln = $tlines[$l]; # entract a line $ln = trimall($ln); # clean it up $ll = length($ln); if ($ll && $in_tbl && (length($tbl))) { $c = substr($tbl,-1); if ( !(($c =~ /\s/)||($c eq '>')) ) { $tbl .= ' '; } } for ($i = 0; $i < $ll; $i++) { $c = substr($ln,$i,1); $tbl .= $c if ($in_tbl); if ($d eq '<') { $tg .= $c; if ($c eq '>') { # got a tag if ($tg =~ /<table.*?>/i) { $tc++; if ($tn == $tc) { $in_tbl = 1; } } elsif ($tg =~ /<\/table>/i) { if ($in_tbl) { $tbl = substr($tbl, 0, length($tbl) - length($tg)); } $in_tbl = 0; } $d = ''; } } elsif ($c eq '<') { $tg = $c; $d = $c; } } } return $tbl; } sub load_existing_table { my ($fil) = shift; my $ln = ''; my $rows = 0; my $cols = 0; my $in_row = 0; my $in_td = 0; my $img = ''; my $eng = ''; my $fr = ''; if ( ! -f $fil) { mydie( "ERROR: Unable to locate exisitng [$fil] file ... $! ...\n" ); } open INF, "<$fil" or mydie( "ERROR: Unable to OPEN exisitng [$fil] file ... $! ...\n" ); @tlines = <INF>; close INF; prt( "Got " . scalar @tlines . " lines from file [$fil] ...\n" ); my $tt = get_table_block(1); ##prt( "Table block = [$tt]\n" ); #$tt = tag2newline($tt, 'caption'); #$tt = tag2newline($tt, 'tr'); #$tt = tag2newline($tt, 'th'); #$tt = tag2newline($tt, 'td'); #$tt = trimblanklines($tt); #prt( "\nTable block 2 = \n[$tt]\n" ); $tt = alltags2newline($tt); ##prt( "\nTable block 3 = \n[$tt]\n" ); @tlines = split("\n",$tt); prt( "Got " . scalar @tlines . " table lines ...\n" ); foreach $ln (@tlines) { $ln = trimall($ln); if ($ln =~ /<tr.*>/i) { $rows++; $in_row = 1; $cols = 0; } elsif ($ln =~ /<th.*>/i) { # ignore these $cols = 0; } elsif ($ln =~ /<caption.*>/i) { # ignore $cols = 0; } elsif ($ln =~ /<td.*>/i) { $cols++; $in_td = 1; } elsif ($ln =~ /<\/caption>/i) { # ignore this $cols = 0; } elsif ($ln =~ /<\/th>/i) { # ignore $cols = 0; } elsif ($ln =~ /<\/tr>/i) { $in_row = 0; $cols = 0; } elsif ($ln =~ /<\/td>/i) { $in_td = 0; } else { # should be a text entry if ($in_td) { if ($cols == 1) { $img = $ln; prt( "img=[$ln]\n" ) if ($dbg5); } elsif ($cols == 2) { $eng = $ln; prt( "eng=[$ln]\n" ) if ($dbg5); } elsif ($cols == 3) { $fr = $ln; prt( "fr=[$ln]\n" ) if ($dbg5); push(@langarr, [$img, $eng, $fr]); } } } } } sub alltags2newline { my ($tx) = shift; my $tl = length($tx); my ($i, $c, $d); my $nt = ''; $d = ''; for ($i = 0; $i < $tl; $i++) { $c = substr($tx,$i,1); if ($c eq '<') { if (length($nt) && (substr($nt,-1) ne "\n")) { $nt .= "\n"; } } if (($d eq '>')&&($c ne "\n")) { if (length($nt) && (substr($nt,-1) ne "\n")) { $nt .= "\n"; } } $nt .= $c; $d = $c; } return $nt; } sub short_text { my ($tx, $len) = @_; my $ln = length($tx); my $ntx = $tx; if ($ln > ($len + 3)) { my $hl = int( $len / 2 ); $ntx = substr($tx,0,$hl); $ntx .= '...'; $hl = $len - $hl; $ntx .= substr($tx, $ln - $hl); } return $ntx; } sub write_out_file { my ($tx, $fil) = @_; open OF, ">$fil" or mydie("YEEK! Can NOT create [$fil] ...\n"); print OF $tx; close OF; prt("Written " . length($tx) . " characters to [$fil]...\n"); } sub get_tag { my ($t) = shift; my $m = length($t); my ($j, $c); my $tg = ''; for ($j = 0; $j < $m; $j++) { $c = substr($t,$j,1); if ($c eq '<') { $tg = $c; $j++; for ( ; $j < $m; $j++) { $c = substr($t,$j,1); $tg .= $c; if ($c eq '>') { last; } } last; } } return $tg; } sub get_att_hash { my ($tg) = shift; $tg =~ s/\n/ /gm; $tg =~ s/\r/ /gm; my $ml = length($tg); my ($i, $c, $d); my $tag = ''; my $att = ''; my $val = ''; my %h = (); for ($i = 0; $i < $ml; $i++) { $c = substr($tg,$i,1); if ($c eq '<') { $i++; for ( ; $i < $ml; $i++) { $c = substr($tg,$i,1); if (($c =~ /\s/)||($c eq '>')) { last; } $tag .= $c; } # got the tag, now the attributes, if any prt( "tag=[$tag]\n" ) if ($dbg4); while (($c =~ /\s/)&&(($i + 1) < $ml)) { while (($c =~ /\s/)&&(($i + 1) < $ml)) { $i++; $c = substr($tg,$i,1); } $att = ''; $val = ''; if ( !($c =~ /\s/) && ($c ne '>')) { $att = $c; # start attribute $i++; for ( ; $i < $ml; $i++) { $c = substr($tg,$i,1); if ($c eq '=') { last; } $att .= $c; } if (($c eq '=')&&(($i + 1) < $ml)) { $i++; $d = substr($tg,$i,1); if (($d eq '"')||($d eq "'")) { $val = $d; # keep the inverted comma } else { $val = $d; # keep first item $d = ' '; } $i++; for ( ; $i < $ml; $i++) { $c = substr($tg,$i,1); if ($c eq '>') { last; } elsif ($c eq $d) { if ($c ne ' ') { $val .= $c; if (($i + 1) < $ml) { $i++; $c = substr($tg,$i,1); } } last; } $val .= $c; } } if (length($att) && length($val)) { prt( "att=[$att] value=[$val] c=[$c]\n" ) if ($dbg4); if (defined $h{$att}) { prt("Duplicate attribute!!! [$att] val1=[" . $h{$att} . "] adding [$val] ...\n" ); if ($h{$att} ne $val) { $h{$att} .= '|' . $val; } } else { $h{$att} = $val; } } else { prt( "Warning: failed to get att=[$att] value=[$val] c=[$c]\n" ); } } } # end while ############################# push(@attlist, [$tag, \%h]); } } } sub trim_tail { my ($ln) = shift; while ($ln =~ /\s$/m) { $ln = substr($ln,0, length($ln) - 1); } return $ln; } sub strip_quotes { my ($tx) = shift; $tx =~ s/^('|")//; $tx =~ s/('|")$//; return $tx; } sub show_att_list { my $ac = scalar @attlist; prt( "Got $ac entries in attlist ...\n" ); my ($i, $src, $alt); for ($i = 0; $i < $ac; $i++) { my $tg = $attlist[$i][0]; my %th = $attlist[$i][1]; prt( "TAG=[$tg]\n" ) if ($dbg4); ##foreach my $k (keys(%th)) { ## my $v = $th{$k}; ## prt( "k=[$k] v=[$v]\n" ); ##} $src = ''; $alt = ''; foreach my $k (keys(%{$attlist[$i][1]})) { my $v = ${$attlist[$i][1]}{$k}; prt( "k=[$k] v=[$v]\n" ) if ($dbg4); if ($k =~ /^src$/i) { $src = strip_quotes($v); } elsif ($k =~ /^alt$/) { $alt = strip_quotes($v); } } if (length($src) && length($alt)) { push(@altlist, [$src, $alt]); } else { prt( "WARNING: Failed to find src and alt ...\n" ); } } } sub get_fr { my ($ig) = shift; my ($img, $eng, $fr, $i); my $icnt = scalar @langarr; for ($i = 0; $i < $icnt; $i++) { $img = $langarr[$i][0]; $eng = $langarr[$i][1]; $fr = $langarr[$i][2]; if ($img eq $ig) { return $fr; } } return ' '; } sub out_alt_list { my ($fil) = shift; my $ct = scalar @altlist; if ($ct) { my ($i, $sr, $at, $msg); prt( "Outputting $ct alt list entries to $fil ...\n" ); open OTF, ">$fil" or mydie( "ERROR: Unable to open $fil file ... $! \n" ); print OTF $htm_head; for ($i = 0; $i < $ct; $i++) { $sr = $altlist[$i][0]; $at = $altlist[$i][1]; $msg = "<tr>\n"; $msg .= "<td>\n"; ##$msg .= $sr; $msg .= '<img src="' . $def_folder . '/' . $sr . '" width="60" height="40">'; $msg .= "</td>\n"; $msg .= "<td>\n"; $msg .= $at; $msg .= "</td>\n"; $msg .= "<td>\n"; $msg .= get_fr($sr); $msg .= "</td>\n"; $msg .= "</tr>\n"; print OTF $msg; } print OTF $htm_tail; close OTF; ###system($fil); } else { prt( "WARNING: Did not find any src/alt sets ...\n" ); } } sub extract_img_alts { my ($tx) = shift; my $tl = length($tx); my ($i); my $ch = ''; my $nt = ''; my $tag = ''; my $att = ''; my $tgl = ''; my $intd = 0; my $ntag = ''; for ($i = 0; $i < $tl; $i++) { $ch = substr($tx,$i,1); if ($ch eq '<') { $tag = get_tag( substr($tx,$i) ); $i += (length($tag) - 1) if (length($tag)); $tgl = $tag; $tgl =~ s/\n/ /g; $tgl =~ s/\r/ /g; if ($tgl =~ /<img(.*)>/im) { $att = $1; prt( "IMG tag [$tag]...\n" ) if ($dbg3); get_att_hash($tag); } elsif ((length($tag) > 4)&&(substr($tag,0,4) eq '<!--')) { prt( "Got comment ...\n" ) if ($dbg2); } else { prt( "other tag [$tag] ...\n" ) if ($dbg2); } $nt .= $tag; } else { $nt .= $ch; } } $tl = length($nt); prt("Now returning $tl characters ...\n") if $dbg1; return $nt; } sub trimall { my ($ln) = shift; chomp $ln; $ln =~ s/\r$//; $ln =~ s/\t/ /g; while ($ln =~ /\s\s/) { $ln =~ s/\s\s/ /g; } while ($ln =~ /^\s/) { $ln = substr($ln,1); } while ($ln =~ /\s$/) { $ln = substr($ln,0, length($ln) - 1); } return $ln; } # eof - imgalt01.pl