Generated: Tue Feb 2 17:54:22 2010 from autoexch02.pl 2006/08/26 7.7 KB.
#!/Perl use LWP::Simple; require "logfile.pl" or die "Missing logfile.pl ...\n"; # my simple log file and some other utility subs require "htmltools.pl" or die "Missing htmltools.pl ...\n"; # log file stuff my ($LF); my $outfile = 'temp'.$0.'.txt'; my $outfil1 = 'tempcurr02.htm'; my $outfil2 = 'tempcurr02.csv'; # program variables my $URL = 'http://www.exchangerate.com/'; my @hrefs = (); my @imgs = (); my @currency = (); my $m_date1 = ''; my $m_date2 = ''; my $test1 = 0; my $etext = ''; open_log($outfile); prt( "$0 ... Hello, World...\n" ); if ($test1) { prt("Fetching text from $outfil1 ...\n"); open CIF, "<$outfil1" or mydie("ERROR: Unable to open $outfil1 ...\n"); my @atext = <CIF>; # slurp it into a line array close CIF; prt( "From $outfil1, got ".scalar @atext." lines ...\n" ); $etext = join( "", @atext); prt( "len=".length($etext)."\n"); # "[$txt2]\n"); write2file($etext, 'tempcurr02a.htm'); } else { prt("Fetching text from $URL ...\n"); my $text = get("$URL"); my $tcnt = length($text); prt( "Got $tcnt characters from URL $URL ...\n"); $etext = htmlexpand($text); write2file($etext,$outfil1); } ##prt("[$text]\n"); ##my $ctext = $etext; my $ctext = htmlclean02($etext); prt( "len=".length($ctext)."\n"); # "[$txt2]\n"); my $ccnt = extractcurrencies02($ctext); ###my $ccnt = scalar @currency; prt( "Got $ccnt currencies ...\n" ); my $msg = ''; my $txt = ''; # AUSTRALIA Dollar AUD 1.310649 ### + 1.311001 -0.0268% + 1 if ($ccnt) { open OF, ">$outfil2" or mydie("ERROR: Unable to open CSV out file!\n"); prt("Writing CSV file $outfil2 ...\n"); $msg = "Country,Currency,ISO,Rate $m_date1,Rate $m_date2"; prt("$msg\n"); print OF $msg."\n"; for (my $i = 0; $i < $ccnt; $i++) { $msg = "".($currency[$i][0]).",".($currency[$i][1]).",".($currency[$i][2]).",".($currency[$i][3]).",".($currency[$i][4]); prt( "$msg\n" ); print OF $msg."\n"; } $msg = "From [$URL] on ".(scalar localtime); prt("$msg\n"); print OF $msg."\n"; close(OF); } else { prt( "WARNING: FAILED TO EXTRACT ANY CURRENCIES!!!\n" ); } prt( "All done ...\n" ); close_log($outfile,1); ###system( $outfil1 ); ###system( $outfil2 ); exit(0); # <TH COLSPAN=2> Country</TH> # <TH> Currency </TH> # <TH> ISO </TH> # <TH> 08/24/06 </TH> # <TH> 08/23/06 </TH> # <TH COLSPAN=2> Change </TH> my $m_stat1 = '^Country'; my $m_stat2 = '^Currency'; my $m_stat3 = '^ISO'; my $m_stat4 = '^\d{2}\D{1}\d{2}\D{1}\d{2}'; # <TH> 08/24/06 </TH> my $m_stat5 = '^\d{2}\/\d{2}\/\d{2}'; # <TH> 08/23/06 </TH> my $m_stat6 = ''; # <TH COLSPAN=2> Change </TH> my $m_stat7 = ''; # sub extractcurrencies02 { my ($txt) = shift; my $len = length($txt); my $i = 0; my $tag = ''; my $tt = ''; my $st = 0; my $cnt = 0; my @arr = (); my $rcnt = 0; my $isth = 0; prt( "Processing $len characters ...\n" ); for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $tag = $ch; $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $tag .= $ch; if ($ch eq '>') { ###prt( "Have tag [$tag] ...\n" ); last; } } $ch = ''; if (($tag =~ /^<td.*>/i)||($tag =~ /^<th.*>/i)) { $i++; $tt = ''; if ($tag =~ /^<td.*>/i) { ###prt( "Got TD tag [$tag] ...\n" ); $isth = 0; } else { ###prt( "Got TH tag [$tag] ...\n" ); $isth = 1; } for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $tt = trimbothends($tt); if (length($tt)) { ###$i--; # back up to this char last; } else { # eat this next - assume another tag # note this 'eats' though the first <td><img ...> etc tag $i++; for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '>') { last; } } } } else { $tt .= $ch } } ###prt( "Got TH/TD tag [$tag] ...txt=[$tt]\n" ); if ($st == 7) { ##prt( "Got TH/TD tag [$tag] ...txt=[$tt]\n" ); if ($cnt == 0) { $cnt = 1; @arr = (); push(@arr, $tt); } elsif ($cnt == 1) { $cnt = 2; push(@arr, $tt); } elsif ($cnt == 2) { $cnt = 3; push(@arr, $tt); } elsif ($cnt == 3) { $cnt = 4; push(@arr, $tt); } elsif ($cnt == 4) { $cnt = 5; push(@arr, $tt); } elsif ($cnt == 5) { $cnt = 6; push(@arr, $tt); } elsif ($cnt == 6) { push(@arr, $tt); $cnt = 0; } } elsif ($isth) { if ($st == 0) { ##if ($tt =~ /$m_stat1/) { if ($tt =~ /^Country/) { prt("At state 1 ...\n"); $st = 1; } } elsif ($st == 1) { ##if ($tt =~ /$m_stat2/) { if ($tt =~ /^Currency/) { prt("At state 2 ...\n"); $st = 2; } else { $st = 0; } } elsif ($st == 2) { ##if ($tt =~ /$m_stat3/) { if ($tt =~ /^ISO/) { prt("At state 3 ...\n"); $st = 3; } else { $st = 0; } } elsif ($st == 3) { ##if ($tt =~ /$m_stat3/) { if ($tt =~ /^(\d{2}\D{1}\d{2}\D{1}\d{2})/) { $m_date1 = $1; prt("At state 4 ... $m_date1 ...\n"); $st = 4; } else { $st = 0; } } elsif ($st == 4) { ##if ($tt =~ /$m_stat4/) { if ($tt =~ /^(\d{2}\D{1}\d{2}\D{1}\d{2})/) { $m_date2 = $1; prt("At state 5 ... $m_date2 ...\n"); $st = 5; } else { $st = 0; } } elsif ($st == 5) { prt("At state 6/7 ...\n"); $st = 7; } elsif ($st == 6) { prt("At state 7 ...\n"); $st = 7; } } } elsif ($tag =~ /^<\/tr.*/i) { if (($st == 7) && ($cnt >= 5)) { ###prt("End of row $cnt - return to cnt 0 ...\n"); # AUSTRALIA Dollar AUD 1.310649 1.311001 -0.0268% + <something> prt( "Arr=[" ); foreach my $itm (@arr) { prt("$itm,"); } prt("]\n"); push(@currency, [$arr[0], $arr[1], $arr[2], $arr[3], $arr[4]]); $rcnt++; } $cnt = 0; } elsif ($tag =~ /^<\/table.*/i) { if ($st > 0) { ###prt("End table - return to state 0 ...\n"); $st = 0; } } } } return $rcnt; } sub htmlclean02 { my ($rtxt) = shift; prt( "len=".length($rtxt)." Drop comments <!--...--> ...\n"); $rtxt = dropcomments($rtxt); prt( "len=".length($rtxt)." Strip <HEAD>...</HEAD> tag ...\n"); $rtxt = striptag($rtxt, 'HEAD'); prt( "len=".length($rtxt)." Strip <script>...</script> tag ...\n"); $rtxt = striptag($rtxt,'script'); prt( "len=".length($rtxt)." Strip <noscript>...</noscript> tag ...\n"); $rtxt = striptag($rtxt,'noscript'); prt( "len=".length($rtxt)." Strip <SELECT>...</SELECT> tag ...\n"); $rtxt = striptag($rtxt,'select'); prt( "len=".length($rtxt)." Remove <font ...> tags ...\n"); $rtxt = removefont($rtxt); prt( "len=".length($rtxt)." Remove <b> tags ...\n"); $rtxt = removetag($rtxt,'b'); prt( "len=".length($rtxt)." Remove <nobr> tags ...\n"); $rtxt = removetag($rtxt,'nobr'); prt( "len=".length($rtxt)." Remove th attributes ...\n"); $rtxt = removetagattrib($rtxt,'th'); prt( "len=".length($rtxt)." Remove tr attributes ...\n"); $rtxt = removetrattrib($rtxt); prt( "len=".length($rtxt)." Remove td attributes ...\n"); $rtxt = removetdattrib($rtxt); prt( "len=".length($rtxt)." Delete <a...> & </a>\n"); $rtxt = collecthrefs($rtxt,1); prt( "len=".length($rtxt)." Delete <img...>\n"); $rtxt = collectimgs($rtxt,1); prt( "len=".length($rtxt)." Do substitutions ...\n"); $rtxt = substitutions($rtxt); prt( "len=".length($rtxt)." Trim blank lines ...\n"); $rtxt = trimblanklines($rtxt); prt( "len=".length($rtxt)." Trim inline td ...\n"); $rtxt = triminlinetd($rtxt); return $rtxt; } # eof - autoexch01.pl