tidyword.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:58 2010 from tidyword.pl 2007/05/06 5.7 KB.

#!/perl -w
# NAME: tidyword.pl
# AIM: Take a tidied, Word 'filtered' HTML file and remove the MsoNormal / MsoPlainText class paragraphs.
# That is, no-margin paragraphs separated by a 'blank' paragraph become 'normal' paragraphs, and
# paragraphs following each other become one paragraph with <br> separating the lines.
# 06/05/2007 - geoff mclane - geoffmclane.com
use strict;
use warnings;
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
# Usage: tidyword.pl [input.htm [output.htm]]
# With no arguments the built-in default input and 'tempnew.htm' are used.
# log file stuff
my ($LF);
# Name the log after the script's base name only, so that an absolute or
# relative path in $0 (with either '\' or '/' separators) does not end up
# inside the log file name.
my $script = $0;
$script =~ s{^.*[\\/]}{};
my $outfile = 'temp.'.$script.'.txt';
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
my $def_file = "C:\\Documents and Settings\\Geoff McLane\\My Documents\\Louis\\tempout.htm";
# Optional command line overrides; the defaults keep the historical behaviour.
my $in_file = (@ARGV && length($ARGV[0])) ? $ARGV[0] : $def_file;
my $out_file = (@ARGV > 1 && length($ARGV[1])) ? $ARGV[1] : "tempnew.htm";
# Working state shared by the passes further down the file.
my $lncnt = 0;      # number of input lines
my @lines = ();     # input file, one element per line
my $line = '';      # line (or paragraph text) currently being worked on
my $tag = '';       # tag most recently collected by the scan pass
my $ch = '';
my $boff = 0;       # offset of the '<' of the current tag
my $bln = 0;        # line index of the current tag
my $bpoff = 0;      # offset of the '<' that opened the current paragraph
my $bpln = 0;       # line index where the current paragraph opened
my $epoff = 0;      # offset of the '>' that closed the current paragraph
my $epln = 0;       # line index where the current paragraph closed
my @paras = ();     # one [begin line, begin off, end line, end off, flag, content] per paragraph
my $inpara = 0;     # true while inside an opened paragraph
# debug switches
my $dbg1 = 0;   # show line collection
# Read the whole input up front. A missing or unreadable input is fatal:
# carrying on with zero lines would only overwrite the output with nothing.
open my $inf, '<', $in_file
   or mydie( "ERROR: Unable to open $in_file ... $! ...\n" );
@lines = <$inf>;
close $inf;
$lncnt = scalar @lines;
prt( "Processing $lncnt lines from $in_file ...\n" );
# Pass 1: scan every input line for tags and record, in @paras, the span of
# each paragraph opened by a <p ...> tag that carries attributes.
# Each entry is [begin line, begin offset, end line, end offset, 1, "content"];
# the last two slots are placeholders that are filled in later.
# A tag is only recognised when its '<' and '>' are on the same line.
for (my $i = 0; $i < $lncnt; $i++) {
   $line = $lines[$i];
   while ($line =~ /(<[^>]*>)/g) {
      $tag = $1;
      # Take the offsets now: the matches on $tag below reset @- and @+.
      $boff = $-[1];            # offset of the '<'
      $bln = $i;
      my $eoff = $+[1] - 1;     # offset of the '>'
      if ($tag =~ /^<p\s+(.*)>/i) {
         prt( "$tag [$1] line $i:$boff\n" ) if ($dbg1);
         $bpoff = $boff;
         $bpln = $bln;
         prt( "WARNING: Already in paragraph!\n" ) if ($inpara);
         $inpara = 1;
      } elsif ($tag =~ /^<\/p\s*>/i) {
         if ($inpara) {
            prt( "$tag CLOSED line $i:$eoff para: $bpln:$bpoff to $i:$eoff\n" ) if ($dbg1);
            push(@paras, [$bpln, $bpoff, $i, $eoff, 1, "content"] );
            $inpara = 0;
         } else {
            # This </p> closes a paragraph whose opening tag was not recorded
            # (e.g. a bare <p>). Recording it would reuse the start position
            # of an earlier paragraph, so it is left alone.
            prt( "$tag line $i:$eoff closes an untracked paragraph - skipped\n" ) if ($dbg1);
         }
      }
   }
}
my $pcnt = scalar @paras;
prt( "Looking at $pcnt paragraphs ...\n" );
# Pass 2: pull the text of every recorded paragraph and classify it.
# Slot 4 becomes the content length (0 for a blank paragraph) and slot 5 the
# content itself ("" for a blank paragraph).
foreach my $para (@paras) {
   ($bpln, $bpoff, $epln, $epoff) = @{$para}[0 .. 3];
   prt( "Paragraph: $bpln:$bpoff to $epln:$epoff\n" ) if ($dbg1);
   $line = getpara( $bpln, $bpoff, $epln, $epoff, @lines );
   my $text = getcontent( $line );
   if ($text =~ /\S/) {
      # at least one non-space character: a real paragraph
      prt( "content: $text [$bpln, $bpoff, $epln, $epoff]\n" );
      @{$para}[4, 5] = ( length($text), $text );
   } else {
      prt( "$line (BLANK) [$bpln, $bpoff, $epln, $epoff]\n" );
      @{$para}[4, 5] = ( 0, "" );
   }
}
# Pass 3: write the output file.
# Lines that belong to no recorded paragraph are copied through unchanged.
# For a line that starts (or lies inside) a recorded paragraph only the
# paragraph content is written: non-blank paragraphs are emitted as
# <p>content</p>, or as content<br> when the next recorded paragraph is also
# non-blank, so that runs of consecutive paragraphs collapse into one;
# blank paragraphs are dropped.
# NOTE(review): any text on the same line outside the paragraph span, and any
# second paragraph starting on the same line, is not written - presumably the
# tidied input always has one paragraph per line; confirm.
open my $out_fh, '>', $out_file
   or mydie( "ERROR: Unable to create $out_file ... $! ...\n" );
my $lastbr = 0;   # true when the previous write ended in <br> (paragraph still open)
for (my $i = 0; $i < $lncnt; $i++) {
   $line = $lines[$i];
   my $i2 = lineinparas($i);
   $bpln = 0;
   if ($i2 < $pcnt) {
      ($bpln, $bpoff, $epln, $epoff) = @{ $paras[$i2] }[0 .. 3];
      my $flg = $paras[$i2][4];    # content length; 0 means blank
      my $ln2 = $paras[$i2][5];    # paragraph content
      # content length of the following paragraph, 0 if there is none
      my $flg2 = (($i2 + 1) < $pcnt) ? $paras[$i2+1][4] : 0;
      if ($flg) {
         my $endp = $flg2 ? '<br>' : '</p>';
         if ($bpln == $epln) {
            prt( "DEAL WITH LINE $i ...[$bpln, $bpoff, $epln, $epoff]\n<p>$ln2$endp ($lastbr)\n" );
         } else {
            prt( "DEAL WITH LINES $i-$epln ...[$bpln, $bpoff, $epln, $epoff]\n$ln2$endp ($lastbr)\n" );
         }
         # open a new paragraph unless we are continuing one after a <br>
         print {$out_fh} "<p>" if ($lastbr == 0);
         print {$out_fh} $ln2.$endp;
         $lastbr = ($endp eq '<br>') ? 1 : 0;
      } else {
         prt( "KILL LINE $i ...[$bpln, $bpoff, $epln, $epoff] ($ln2) ($lastbr)\n" );
      }
      # a multi-line paragraph has been handled as a whole: skip its other lines
      $i = $epln if ($epln > $bpln);
   } else {
      print {$out_fh} $line;
      chomp $line;
      prt( "$line ($i)\n" );
   }
}
# Buffered write errors only surface at close, so check it.
close $out_fh
   or mydie( "ERROR: Unable to close $out_file ... $! ...\n" );
close_log($outfile,1);
exit(0);
# lineinparas( $line_index )
# Find the entry of @paras whose line span (slot 0 .. slot 2) covers the given
# line index. Returns the index of that entry, or $pcnt + 1 when no entry
# covers the line. The search stops at the first entry that starts after the
# wanted line, which relies on @paras being in ascending start-line order.
sub lineinparas {
   my ($wanted) = @_;
   foreach my $idx (0 .. $pcnt - 1) {
      my ($start_ln, $end_ln) = @{ $paras[$idx] }[0, 2];
      return $idx if ($start_ln == $wanted);    # paragraph starts on this line
      last if ($start_ln > $wanted);            # gone past the wanted line
      # from here on the paragraph started before the wanted line
      return $idx if ($end_ln > $start_ln && $end_ln >= $wanted);
   }
   return $pcnt + 1;
}
# getcontent( $para )
# Return the content of a paragraph string with every &nbsp; turned into a
# plain space.
#  - For "<p ...>inner</p>" (any case, may span several lines) the result is
#    everything between the opening tag and the last </p>, inner markup
#    included. The opening tag is matched with [^>]* so that it stops at its
#    own '>' instead of greedily swallowing inner tags.
#  - If that inner part holds nothing but markup, white space and &nbsp;, the
#    markup is stripped as well, so the result contains no non-space
#    character and callers testing /\S/ see the paragraph as blank.
#  - For any other string the result is the text between the first '>' and
#    the following '<'; when there is no such text the string is returned as
#    given (apart from the &nbsp; replacement).
sub getcontent {
   my ($ln) = shift;
   if ($ln =~ /^<p(?:\s[^>]*)?>(.*)<\/p>/is) {
      $ln = $1;
      my $visible = $ln;
      $visible =~ s/<[^>]*>//g;
      $visible =~ s/&nbsp;/ /g;
      return $visible if ($visible !~ /\S/);   # markup-only: report as blank
   } elsif ($ln =~ /^[^>]*>([^<]+)/) {
      $ln = $1;
   }
   $ln =~ s/&nbsp;/ /g;
   return $ln;
}
# getpara( $bpl, $bpo, $epl, $epo, @lns )
# Cut one span of text out of the list of lines @lns: it starts at offset
# $bpo of line $bpl and ends with (and includes) the character at offset $epo
# of line $epl. Lines in between are taken whole, newlines included.
sub getpara {
   my ( $bpl, $bpo, $epl, $epo, @lns ) = @_;
   my $text = $lns[$bpl];
   $text = substr( $text, $bpo ) if ($bpo);
   # span confined to one line: just trim the tail
   return substr( $text, 0, $epo - $bpo + 1 ) if ($bpl == $epl);
   # end before begin: nothing more to append
   return $text if ($bpl > $epl);
   # whole lines strictly between the first and the last one ...
   $text .= join( '', @lns[ $bpl + 1 .. $epl - 1 ] );
   # ... then the head of the last line, up to and including offset $epo
   $text .= substr( $lns[$epl], 0, $epo + 1 );
   return $text;
}
# eof

index -|- top

checked by tidy  Valid HTML 4.01 Transitional