#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'html2tab' dumps contents of HTML tables into nice tab-delimited plaintext.
# Requires HTML::TableContentParser.
#
# Input is read from the files named on the command line, or from standard
# input when none are given.  For every table found, each row is printed as
# one line of tab-separated cells, and each table is followed by an empty
# line.
#
# $Id: html2tab,v 1.3 2009-11-16 21:07:52 bojar Exp $
#

use strict;
use warnings;

use HTML::TableContentParser;

# Slurp the whole input.  '<>' in list context returns every line of every
# input file, so several files on the command line are concatenated.
my $html = join '', <>;

# Rewrite every <th ...>/<td ...> tag (opening or closing) to a bare
# <td>/</td>, dropping any attributes.
# NOTE(review): presumably done so that header cells are reported among the
# row cells by HTML::TableContentParser -- confirm against the module docs.
# The \b keeps longer tag names that merely start with "th"/"td" (such as
# <thead>) from being rewritten into bogus cells.
$html =~ s{<(/?)\s*t[hd]\b[^>]*>}{<$1td>}gi;

my $parser = HTML::TableContentParser->new();
my $tables = $parser->parse($html);

# The '|| []' fallbacks keep a missing table list, row list or cell list
# from being fatal under 'use strict'; such entries simply print nothing.
for my $table (@{ $tables || [] }) {
    for my $row (@{ $table->{rows} || [] }) {
        my @cells = map { fixdata($_->{data}) } @{ $row->{cells} || [] };
        print join("\t", @cells), "\n";
    }
    print "\n";    # empty line terminates each table
}

# fixdata($data)
#
# Normalises the text of a single cell.  When the text consists only of
# digits, whitespace and \xa0 characters, optionally followed by one ',' or
# '.' and more of the same, all whitespace and \xa0 characters are removed
# and a ',' separator is replaced by '.'.  Any other text is returned
# unchanged.  An undefined value is returned as the empty string so that the
# caller can join and print it without warnings.
#
# NOTE(review): the input is never decoded, so \xa0 is matched as a single
# byte here -- verify that this suits the encodings you feed in.
sub fixdata {
    my $data = shift;
    return '' unless defined $data;

    if ($data =~ /^[0-9\s\xa0]+(?:[,.][0-9\s\xa0]+)?$/) {
        $data =~ s/[\s\xa0]//g;
        $data =~ s/,/./;
    }
    return $data;
}

# Debugging aid, left disabled: dumps the raw structure returned by the
# parser.
#
#$HTML::TableContentParser::DEBUG=1;
#$tables = $parser->parse($html);
#for my $t (@$tables) {
#    # foreach my $k (keys %{$t}) {
#    #     print "$k: ".$t->{$k}."\n";
#    # }
#    for my $r (@{$t->{rows}}) {
#        print "Row: ";
#        foreach my $k (keys %{$r}) {
#            print "$k: ".$r->{$k};
#        }
#        for my $c (@{$r->{cells}}) {
#            print "[$c->{data}] ";
#        }
#        print "\n";
#    }
#}