getfgxlist.pl to HTML.

index -|- end

Generated: Sun Aug 21 11:11:04 2011 from getfgxlist.pl 2011/07/09 15.7 KB.

#!/usr/bin/perl -w
# NAME: getfgxlist.pl
# AIM: Get google FGx list...
# 01/07/2011 geoff mclane http://geoffair.net/mperl
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use LWP::Simple;
use Cwd;
# Directory that holds the helper libraries loaded below; it is also where the
# log file and the two temporary download files are written.
my $perl_dir = 'C:\GTools\perl';
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' ... Check \@INC values...\n";
require 'lib_html.pl' or die "Unable to load 'lib_html.pl' ... Check \@INC values...\n";
# log file stuff
# NOTE(review): $LF is not referenced in this file - presumably the log file
# handle used by lib_utils.pl; confirm before removing.
our ($LF);
my $pgmname = $0;
# Reduce the script path to its last component (either slash style).
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $perl_dir."\\temp.$pgmname.txt";
open_log($outfile);

# user variables
my $load_log = 1;   # passed to close_log() by pgm_exit()
my $in_file = '';   # set by parse_args() from the command line
my $out_url2 = $perl_dir."\\tempurl2.txt";  # issue list page: written by get_url(), read by get_html()
my $out_url3 = $perl_dir."\\tempurl3.txt";  # last issue detail page: written by get_this_page()

my $use_html_lib = 1;   # not referenced elsewhere in this file
my $get_new_file = 1; # force to fetch a NEW file from the WEB

my $debug_on = 0;               # when set, parse_args() falls back to $def_file
my $def_file = 'def_file';

### program variables
my @warnings = ();  # filled by prtw(), reported by show_warnings()
my $cwd = cwd();
my $os = $^O;

# Former local tag-type constants, kept for reference only; the code below
# obtains these values through the get_tag_*_value() / get_attr_no_value()
# accessor calls instead.
#my $TAG_NORM    = 0;
#my $TAG_CLOSE   = 1;
#my $TAG_CLOSED  = 2;
#my $TAG_CLOSEA  = 3;
#my $TAG_SPECIAL = 4;
#my $TAG_COMMENT = 5;
#my $TAG_TEXT    = 6;
#my $ATT_NV = '<no_value>';

# URL pieces: get_url() fetches $git.$fgx, get_this_page() fetches
# $git.$fgx_base.<href>.
my $git = 'http://code.google.com';
my $fgx = '/p/fgx/issues/list';
my $fgx_base = '/p/fgx/issues/';

# Sample detail page, used only by the disabled get_this_page() call in MAIN.
my $chk_page = 'detail?id=42';

# debug
my $dbg_01 = 0; # show tags as decoded
my $dbg_02 = 0; # extra debug output

# show_warnings( $val )
# Print every message collected in @warnings (via prtw), preceded by a count
# line and followed by a blank line. Prints nothing when no warnings were
# collected. $val is accepted for call compatibility but is not used.
sub show_warnings($) {
    my ($val) = @_;
    if (!@warnings) {
        ###prt( "\nNo warnings issued.\n\n" );
        return;
    }
    my $count = scalar @warnings;
    prt( "\nGot ".$count." WARNINGS...\n" );
    foreach my $text (@warnings) {
        prt("$text\n");
    }
    prt("\n");
}

# pgm_exit( $val, $msg )
# Common exit path: print $msg (a newline is appended when it does not already
# end in one), report collected warnings, close the log, then exit with $val.
# An empty $msg prints nothing.
sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        my $text = ($msg =~ /\n$/) ? $msg : $msg."\n";
        prt($text);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


# prtw( $tx )
# Print a warning line now and remember it in @warnings so show_warnings()
# can repeat it at exit. A single trailing newline on $tx is removed before
# storing, and re-added for printing.
sub prtw($) {
    my ($tx) = @_;
    $tx =~ s/\n$//;
    prt($tx."\n");
    push(@warnings,$tx);
}

# process_in_file( $inf )
# Read the file $inf and print, with its 1-based line number, the remainder of
# every line that contains a '#include <something>' directive.
# Exits through pgm_exit(1, ...) when the file cannot be opened.
sub process_in_file($) {
    my ($inf) = @_;
    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as an open mode, and the handle is not a shared global.
    open(my $fh, '<', $inf)
        or pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    my @lines = <$fh>;
    close($fh);
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my $lnn = 0;
    foreach my $line (@lines) {
        chomp $line;
        $lnn++;
        if ($line =~ /\s*#\s*include\s+(.+)$/) {
            my $inc = $1;
            prt("$lnn: $inc\n");
        }
    }
}

# get_attr_rhash( $txt )
# Split an attribute string into name/value pairs and return them as a hash
# reference. The string is first divided with space_split(); each piece is
# then split on '='.
#   name=value      -> value
#   name=a=b        -> 'a=b' (pieces re-joined with '=')
#   name            -> the marker returned by get_attr_no_value()
#   /               -> skipped
sub get_attr_rhash($) {
    my ($txt) = @_;
    my %attrs = ();
    foreach my $pair (space_split($txt)) {
        my @parts = split("=",$pair);
        my $name = trim_all($parts[0]);
        my $value = '';
        if (@parts > 2) {
            # Re-join everything after the name. Empty pieces directly after
            # the name contribute nothing, so drop them before joining.
            my @rest = @parts[1 .. $#parts];
            shift(@rest) while (@rest && !length($rest[0]));
            $value = join('=',@rest);
        } elsif (@parts == 2) {
            $value = $parts[1];
        } else {
            next if ($name eq '/');
            $value = get_attr_no_value(); # $ATT_NV;
        }
        $attrs{$name} = $value;
    }
    return \%attrs;
}

# get_url()
# Download the issue list page ($git.$fgx) and store it, plus a trailing
# newline, in the file $out_url2. Exits through pgm_exit(1, ...) when the
# download returns nothing.
sub get_url() {
    my $address = $git.$fgx;
    prt("Fetching content form [$address]...\n");
    my $page = get($address);
    if (!defined $page) {
        pgm_exit(1,"ERROR: could not get $address\n");
    }
    write2file("$page\n",$out_url2);
}

# get_html()
# Return the parsed form of the issue list page, as produced by
# get_html_refarray(). The page is downloaded first (get_url) when
# $get_new_file is set or no cached copy exists in $out_url2.
# Exits through pgm_exit(1, ...) when the cached file is missing or unreadable.
sub get_html() {
    if ($get_new_file || (! -f $out_url2)) {
        get_url();
    }
    if (! -f $out_url2) {
        pgm_exit(1,"ERROR: No URL file [$out_url2]!\n");
    }
    # Three-argument open with a lexical handle: the path is never parsed for
    # mode characters and no global bareword handle is left behind.
    open(my $fh, '<', $out_url2)
        or pgm_exit(1,"ERROR: Failed to open file [$out_url2]!\n");
    my @lines = <$fh>;
    close($fh);
    my $cnt = scalar @lines;
    my $content = join("",@lines);
    my $ra = get_html_refarray($content);
    prt("Done $cnt lines, from [$out_url2] file...\n");
    return $ra;
}

# Tag names that show_html_ra() skips outright.
my @ignore_tags = qw( div script tr th td span u img small form input tbody option table pre );

# ignore_tag( $tag )
# Return 1 when $tag is one of the names in @ignore_tags (compared
# case-insensitively), otherwise 0.
# The comparison is on the whole name. A prefix match would also swallow
# unrelated names such as 'ul' (via 'u') or 'thead' (via 'th'), and any text
# that merely starts with one of the listed names.
sub ignore_tag($) {
    my ($tag) = @_;
    my $name = lc($tag);
    foreach my $tt (@ignore_tags) {
        return 1 if ($name eq $tt);
    }
    return 0;
}

# Single-word texts (column headings and common cell values of the issue
# list) that carry no information for the report.
my @ignore_text = qw(
    ID Type Status Priority Platform Branch Milestone Owner Area BlockedOn
    Enhancement Accepted High OSX Master 2.4.0 y...@sablonier.ch Defect All
    Medium p...@freeflightsim.org
);

# ignore_text( $tag )
# Return 1 when the text $tag is exactly (case-sensitively) one of the known
# uninteresting strings: the two multi-word headings below or any entry of
# @ignore_text. Otherwise return 0.
# Example of a skipped entry: 427: text [Summary + Labels]
sub ignore_text($) {
    my ($tag) = @_;
    my @headings = ('Summary + Labels', 'My favorites');
    foreach my $skip (@headings, @ignore_text) {
        return 1 if ($tag eq $skip);
    }
    return 0;
}

# remove_html_entities( $txt )
# Return a copy of $txt with the basic HTML character entities decoded:
#   &quot; -> "    &lt; -> <    &gt; -> >    &amp; -> &
# &amp; is decoded last so that an escaped entity such as '&amp;lt;' becomes
# the literal text '&lt;' and is not decoded a second time.
sub remove_html_entities($) {
    my ($txt) = @_;
    $txt =~ s/&quot;/"/g;
    $txt =~ s/&lt;/</g;     # less-than (was wrongly mapped to '>')
    $txt =~ s/&gt;/>/g;     # greater-than (was wrongly mapped to '<')
    $txt =~ s/&amp;/&/g;
    return $txt;
}

# get_text_in_tag( $ra, $txt )
# From the parsed HTML array $ra, collect every text entry found between an
# opening tag named $txt and its closing tag (name match is case-insensitive).
# Each entry of $ra is read as [ type, tag-or-text, attributes, line ] and the
# collected entries are returned, in order, as a reference to a new array of
# the same four-element form.
# Text containing 'No comment was entered for this change' is left out.
# A warning is recorded through prtw() when no opening $txt tag was seen.
sub get_text_in_tag($$) {
    my ($ra,$txt) = @_;
    my $total = scalar @{$ra};
    my $close_type = get_tag_close_value();
    my $text_type = get_tag_text_value();
    my $norm_type = get_tag_normal_value();
    my @collected = ();
    my %seen = ();      # every distinct tag/text value met, for the warning
    my $inside = 0;     # true while between <$txt> and </$txt>
    my $opened = 0;     # number of opening $txt tags met
    foreach my $entry (@{$ra}) {
        my $typ = $entry->[0];
        my $tag = $entry->[1];
        my $rha = $entry->[2];
        my $lnn = $entry->[3];
        $seen{$tag} = 1;
        if (!$inside) {
            # Outside: only an opening tag of the wanted name matters.
            if (($typ == $norm_type) && ($tag =~ /^$txt$/i)) {
                $inside = 1;
                $opened++;
            }
            next;
        }
        if (($typ == $close_type) && ($tag =~ /^$txt$/i)) {
            $inside = 0;
            next;
        }
        next if ($typ != $text_type);
        next if ($tag =~ /No\s+comment\s+was\s+entered\s+for\s+this\s+change/i);
        push(@collected,[$typ,$tag,$rha,$lnn]);
    }
    if ($opened == 0) {
        my $distinct = scalar keys(%seen);
        prtw("WARNING: Tag [$txt] NOT found in array of $total items... $distinct tags...\n");
    }
    return \@collected;
}

# show_text_item( $ra )
# Print the text entries of $ra (element [1] of each entry) as one block.
# Entries whose text was already seen - exactly, or after trim_all() and
# lower-casing - are skipped. For the rest, entities are decoded, runs of
# blanks are collapsed to one space, newlines in the text start a new output
# line, and a line is broken at the first blank found once it is longer than
# 76 characters.
sub show_text_item($) {
    my ($ra) = @_;
    my $width = 76;
    my $output = '';
    my %seen_raw = ();
    my %seen_norm = ();
    foreach my $entry (@{$ra}) {
        my $text = $entry->[1];
        next if (defined $seen_raw{$text});
        $seen_raw{$text} = 1;
        my $norm = lc(trim_all($text));
        next if (defined $seen_norm{$norm});
        $seen_norm{$norm} = 1;
        $text = remove_html_entities($text);
        my $current = '';   # output line being assembled
        foreach my $char (split(//,$text)) {
            if ($char !~ /\s/) {
                $current .= $char;
                next;
            }
            if ($char =~ /\n/) {
                # hard line break in the source text
                $output .= "$current\n" if (length($current));
                $current = '';
                next;
            }
            # any other white space
            next if ($current =~ /\s$/);    # discard extra spaces
            next if (!length($current));    # no leading space
            if (length($current) > $width) {
                $output .= "$current\n";
                $current = '';
            } else {
                $current .= ' ';
            }
        }
        $output .= "$current\n" if (length($current));
    }
    $output =~ s/\n$//;
    prt("$output\n");
}

# process_page_content( $content )
# Parse the HTML text $content, keep only the body part, pick out the text
# held inside 'pre' tags and print it with show_text_item().
# (show_html_refarray / drop_div_tag / drop_html_tags were earlier
# alternatives for the filtering step and are no longer called here.)
sub process_page_content($) {
    my ($content) = @_;
    my $parsed = get_html_refarray($content);
    my $body = get_html_body_only($parsed);
    my $pre_text = get_text_in_tag($body,"pre");
    show_text_item($pre_text);
}

# get_this_page( $href )
# Download the issue page $git.$fgx_base.$href, save a copy (plus a trailing
# newline) in $out_url3 and print its text via process_page_content().
# Exits through pgm_exit(1, ...) when the download returns nothing.
sub get_this_page($) {
    my ($href) = @_;
    my $address = $git.$fgx_base.$href;
    my $page = get($address);
    if (!defined $page) {
        pgm_exit(1,"ERROR: could not get $address\n");
    }
    write2file("$page\n",$out_url3);
    process_page_content($page);
}

# check_a_page( $file )
# Run process_page_content() on a page previously saved to $file instead of
# downloading it. Exits through pgm_exit(1, ...) when the file cannot be
# opened.
sub check_a_page($) {
    my ($file) = @_;
    # Three-argument open with a lexical handle: the file name is never
    # parsed for mode characters and no global bareword handle is shared
    # with other subs.
    open(my $fh, '<', $file)
        or pgm_exit(1,"ERROR: Can NOT open file [$file]\n");
    my @lines = <$fh>;
    close($fh);
    my $content = join("",@lines);
    process_page_content($content);
}

# show_html_ra( $ra )
# Walk the parsed issue list page and report each issue found.
#
# $ra is a reference to an array whose entries are read as
# [ type, tag-or-text, attribute hash ref, line ]; the line value is only
# used as a prefix in messages.
#
# Nothing is processed until an opening 'body' tag has been met AND the text
# 'Loading...' has been seen; the walk stops at the text 'CSV'. Within that
# range, whenever a new <a href=...> is accepted and the previously accepted
# href starts with 'detail' and has collected some text, a banner is printed,
# the detail page is fetched and printed with get_this_page(), and a closing
# banner follows.
sub show_html_ra($) {
    my ($ra) = @_;
    my ($cnt,$typ,$tag,$rha,$i,$lnn);
    my ($key,$val,$att,$isnorm);
    my ($lckey,$isclose,$istext,$msg);
    my $inbody = 0;     # true between <body> and </body>
    my $phref = '';     # href of the most recently accepted <a>
    my $hrtext = '';    # text collected since that <a>
    my $hadload = 0;    # true once the text 'Loading...' was seen
    my %lchash = ();    # attributes of the current entry, lower-case keys
    my $max = 85;       # banner lines are padded with '=' to this width
    $cnt = scalar @{$ra};
    prt("HTML ref array had $cnt items\n");
    for ($i = 0; $i < $cnt; $i++) {
        $typ = ${$ra}[$i][0];
        $tag = ${$ra}[$i][1];
        $rha = ${$ra}[$i][2];
        $lnn = ${$ra}[$i][3];
        $att = '';
        $isnorm = ($typ == get_tag_normal_value()) ? 1 : 0;
        $isclose = ($typ == get_tag_close_value()) ? 1 : 0;
        $istext = ($typ == get_tag_text_value()) ? 1 : 0;
        # The tag-name filter is for tags only. Applying it to text entries
        # would throw away text that merely resembles a tag name, so that
        # text would never reach $hrtext.
        next if (!$istext && ignore_tag($tag));
        if ($istext) {
            $tag = trim_all($tag);
            $tag =~ s/\&nbsp;/ /g;
            $tag = trim_all($tag);
            next if (length($tag) == 0);
            next if ($tag =~ /^\&.+;$/);    # nothing but an entity
            next if ($tag =~ /^\W+$/);      # punctuation only
            next if (ignore_text($tag));
            next if ($tag =~ /^\d+$/); # ignore all digit text
            last if ($tag eq 'CSV');
            $hadload = 1 if ($tag eq 'Loading...');
        }
        if ($inbody && $hadload) {
            # Build a printable attribute string and a lower-case lookup
            # hash, leaving out presentation-only attributes.
            %lchash = ();
            foreach $key (keys %{$rha}) {
                $lckey = lc($key);
                next if ($key =~ /^onclick$/i);
                next if ($key =~ /^style$/i);
                next if ($key =~ /^class$/i);
                $val = ${$rha}{$key};
                $att .= " " if (length($att));
                if ($val eq get_attr_no_value()) {
                    $att .= $key;
                } else {
                    $att .= "$key=$val";
                }
                $lchash{$lckey} = $val;
            }
            # if ($typ == get_tag_normal_value()) {
            if ($isnorm) {
                if ($tag =~ /^a$/i) {
                    if (defined $lchash{'href'}) {
                        $val = strip_quotes($lchash{'href'});
                        next if ($val eq '#');
                        next if ($val =~ /^\#/);
                        next if ($val =~ /^\/p\/fgx/i);
                        next if ($val eq $phref); # only show NEW HREFS
                        # A new link closes the previous one: report the
                        # previous link now if it was an issue detail link.
                        if (length($hrtext) && length($phref) && ($phref =~ /^detail/) ) {
                            $msg = "==bug== [$phref] [$hrtext] ";
                            $msg .= '=' while (length($msg) < $max);
                            prt("$msg\n");
                            get_this_page($phref);
                            $msg = "=== HREF DONE [$phref] [$hrtext] ===";
                            $msg .= '=' while (length($msg) < $max);
                            prt("$msg\n\n");
                        }
                        $phref = $val;
                        $hrtext = '';
                    }
                }
                if ($dbg_02) {
                    prt("$lnn: norm [$tag]");
                } else {
                    next;
                }
            # } elsif ($typ == get_tag_close_value()) {
            } elsif ($isclose) {
                if ($tag =~ /^body/i) {
                    $inbody = 0;
                    prt("$lnn: exit BODY\n");
                    next;
                }
                #prt("$lnn: close [$tag]");
                next;
            } elsif ($typ == get_tag_closed_value()) {
                prt("$lnn: closed [$tag]");
            } elsif ($typ == get_tag_closea_value()) {
                prt("$lnn: closea [$tag]");
            } elsif ($typ == get_tag_special_value()) {
                prt("$lnn: spl [$tag]");
            } elsif ($typ == get_tag_comment_value()) {
                # prt("$lnn: comm [$tag]");
                next;
            #} elsif ($typ == get_tag_text_value()) {
            } elsif ($istext) {
                # Text belongs to the most recently accepted link.
                $hrtext .= ' ' if (length($hrtext));
                $hrtext .= remove_html_entities($tag);
                if ($dbg_02) {
                    prt("$lnn: text [$tag]");
                } else {
                    next;
                }
            } else {
                prt("$lnn: unknown [$tag]");
            }
            # Only reached by the branches that printed a partial line above.
            prt(" attr [$att]") if (length($att));
            prt("\n");
        } else {
            # not yet in body
            if ($isnorm && ($tag =~ /^body/i)) {
                $inbody = 1;
                prt("$lnn: entered BODY\n");
            }
        }
    }
}

#########################################
### MAIN ###
# The commented-out calls below are alternative entry points kept for manual
# testing: command-line parsing, the '#include' scan, and fetching or
# re-reading a single issue page.
#parse_args(@ARGV);
#prt( "$pgmname: in [$cwd]: Hello, World...\n" );
#process_in_file($in_file);
#get_this_page($chk_page);
#check_a_page($out_url3);
#pgm_exit(0,"");
# Active flow: announce, load and parse the issue list page, walk it (which
# fetches each issue's detail page), then exit through the common exit path.
prt("$pgmname: get FGx issues list as of ".get_YYYYMMDD(time())."\n");
my $ref_arr = get_html();
show_html_ra($ref_arr);
pgm_exit(0,"");
########################################
# give_help()
# Print the version line and the command-line usage summary.
sub give_help {
    my @help_lines = (
        "$pgmname: version 0.0.1 2010-09-11\n",
        "Usage: $pgmname [options] in-file\n",
        "Options:\n",
        " --help (-h or -?) = This help, and exit 0.\n",
    );
    foreach my $text (@help_lines) {
        prt($text);
    }
}
# need_arg( $arg, @av )
# Abort through pgm_exit(1, ...) when nothing was passed after option $arg.
sub need_arg {
    my $arg = shift;
    my @following = @_;
    unless (@following) {
        pgm_exit(1,"ERROR: [$arg] must have following argument!\n");
    }
}

# parse_args( @av )
# Process the command line. An argument starting with '-' is an option: any
# option beginning with 'h' (either case) or equal to '?' prints the help and
# exits 0; every other option is an error. Any other argument becomes the
# input file (the last one wins).
# Afterwards the input file must be set - falling back to $def_file when
# $debug_on is set - and must exist, otherwise the program exits with 1.
sub parse_args {
    my (@av) = @_;
    foreach my $arg (@av) {
        if ($arg !~ /^-/) {
            $in_file = $arg;
            prt("Set input to [$in_file]\n");
            next;
        }
        # option: strip all leading dashes
        (my $sarg = $arg) =~ s/^-+//;
        if (($sarg =~ /^h/i)||($sarg eq '?')) {
            give_help();
            pgm_exit(0,"Help exit(0)");
        } else {
            pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
        }
    }

    $in_file = $def_file if ((length($in_file) ==  0) && $debug_on);
    pgm_exit(1,"ERROR: No input files found in command!\n")
        if (length($in_file) ==  0);
    pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n")
        if (! -f $in_file);
}

# eof - getfgxlist.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional