#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'html2tab' dumps contents of HTML tables into nice tab-delimited plaintext.
# Requires HTML::TableContentParser.
#
# $Id: html2tab,v 1.3 2009-11-16 21:07:52 bojar Exp $
#

use HTML::TableContentParser;
use strict;
use warnings;

# Main script: read HTML from the files named on the command line (or from
# STDIN), and print every table found as tab-delimited text -- one line per
# row, with an empty line after each table.

my $p = HTML::TableContentParser->new();

# Collect the whole input into a single string.  List-context <> reads every
# line of every input file, and join() yields '' (not undef) on empty input.
my $html = join '', <>;

# Rewrite each <th ...>/<td ...> opening or closing tag as a bare <td>/</td>,
# dropping any attributes.  The \b after t[hd] keeps <thead> and </thead>
# from being mistaken for cell tags.
# NOTE(review): presumably <th> is folded into <td> so that header cells show
# up among the row cells -- confirm against HTML::TableContentParser.
$html =~ s/<(\/?)\s*t[hd]\b[^>]*>/<$1td>/gi;

my $tables = $p->parse($html);

# The "|| []" fallbacks keep a missing table list, row list or cell list
# equivalent to an empty one under "use strict".
for my $table (@{ $tables || [] }) {
  for my $row (@{ $table->{rows} || [] }) {
    # A cell without data is printed as an empty field.
    print join("\t",
       map { fixdata(defined $_->{data} ? $_->{data} : '') }
           @{ $row->{cells} || [] });
    print "\n";
  }
  print "\n";
}

# fixdata($data): tidy up the text of one table cell.
#
# When the cell consists solely of whitespace, non-breaking spaces (\xa0) and
# digits, optionally followed by a single "," or "." and more of the same,
# all whitespace and \xa0 characters are removed and a "," is turned into ".".
# A cell made only of whitespace therefore comes back as the empty string.
# Any other value is returned unchanged.
sub fixdata {
  my ($cell) = @_;

  # Not a blank/digit-only value: hand it back as it came in.
  return $cell
    unless $cell =~ /^[\s\xa00-9]+([,.][\xa0\s0-9]+)?$/;

  # Strip the blanks (e.g. thousands separators) ...
  $cell =~ s/[\s\xa0]//g;
  # ... and switch a decimal comma to a decimal point.
  $cell =~ s/,/./;

  return $cell;
}


#$HTML::TableContentParser::DEBUG=1;
#$tables = $p->parse($html);
#for $t (@$tables) {
#
#      foreach my $k (keys %{$t}) {
#        print "$k: ".$t->{$k}."\n";
#      }
#    for $r (@{$t->{rows}}) {
#			print "Row: ";
#      foreach my $k (keys %{$r}) {
#        print "$k: ".$r->{$k};
#      }
#      for $c (@{$r->{cells}}) {
#        print "[$c->{data}] ";				
#      }				
#      print "\n";			
#    }
#  }

