#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'xml_to_tdf' converts <table></table> elements to a plain tab-delimited text
# file.  If several <table> elements are found, a blank line is put between
# them.  Neither nested tables nor cell spans (colspan/rowspan) are supported.
# Tabs in input are replaced with space.
# All XML tags are deleted.
# Basic XML entities are expanded.
#
# $Id: xml_to_tdf,v 1.3 2010-11-02 16:07:41 bojar Exp $

use strict;
use warnings;

# Read all input (files named on the command line, or STDIN) into a single
# string.  Line breaks are replaced by spaces so that a table may span
# several input lines.
my $s = "";
while (my $line = <>) {
  chomp $line;
  $s .= " " . $line;
}

print xml_to_tdf($s);

# xml_to_tdf($text)
#   Finds every <table>...</table> element in $text and returns the
#   tab-delimited rendering of all of them as one string.  Each table is
#   terminated by a newline and consecutive tables are separated by one blank
#   line.  Returns the empty string if $text contains no table.
#   Tables are matched non-greedily, so nested tables are not supported.
sub xml_to_tdf {
  my ($text) = @_;
  # A single global match collects the inner markup of all tables in one pass
  # instead of repeatedly matching and cutting the input string.
  my @bodies = $text =~ m{<table\b[^>]*>(.*?)</table[^>]*>}gis;
  return join("\n", map { table_to_tdf($_) . "\n" } @bodies);
}

# table_to_tdf($markup)
#   Converts the inner markup of one table (everything between <table> and
#   </table>) to tab-delimited text: one line per <tr>, cells (<td>/<th>)
#   separated by tabs.  All other tags are dropped and the five predefined
#   XML entities are expanded.  The result has no trailing newline.
sub table_to_tdf {
  my ($d) = @_;

  # Tabs and newlines in the input must not be confused with the delimiters
  # produced below.
  $d =~ s/[\t\n]/ /g;
  # Trim whitespace adjacent to tags.
  $d =~ s/\s*</</g;
  $d =~ s/>\s*/>/g;

  # Rows: the first <tr> just disappears, every following one starts a new
  # output line.  \b keeps the pattern from matching longer tag names.
  $d =~ s/<tr\b[^>]*>//i;
  $d =~ s/<tr\b[^>]*>/\n/ig;

  # Cells: the first cell tag on every line disappears, the remaining ones
  # become tabs.  Anchoring with ^ under /m handles the first line and all
  # later lines uniformly, and \b prevents <thead> from being taken for <th>.
  $d =~ s/^(.*?)<t[dh]\b[^>]*>/$1/mgi;
  $d =~ s/<t[dh]\b[^>]*>/\t/ig;

  # Drop every remaining tag.
  $d =~ s/<[^>]*>//g;

  # Expand the predefined XML entities.  &amp; must come last, otherwise
  # input such as "&amp;lt;" would be unescaped twice.
  $d =~ s/&quot;/"/g;
  $d =~ s/&apos;/'/g;
  $d =~ s/&lt;/</g;
  $d =~ s/&gt;/>/g;
  $d =~ s/&amp;/&/g;

  # No stray spaces around the column delimiter.
  $d =~ s/ *\t */\t/g;
  return $d;
}

#use Encode 'decode';
#use Encode 'encode';
#use strict;
#use XML::Parser;
