#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'xml_to_tdf' converts <table> elements to plain tab-delimited text.
# If several <table> elements are found, a blank line is put in between.
# Of course, neither nested tables nor span are supported.
# Tabs in input are replaced with space.
# All XML tags are deleted.
# Basic XML entities are expanded.
#
# $Id: xml_to_tdf,v 1.3 2010-11-02 16:07:41 bojar Exp $
use strict;
use warnings;

# Expand the basic XML entities in $text and return the result.
#
# "&amp;" is expanded last on purpose: expanding it first would turn the
# escaped text "&amp;lt;" into "&lt;" and then, wrongly, into "<".
sub expand_entities {
    my ($text) = @_;
    $text =~ s/&quot;/"/g;
    $text =~ s/&apos?;/'/g;    # also accept the historical misspelling "&apo;"
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&amp;/&/g;
    return $text;
}

# Convert the inner markup of one <table> element (everything between the
# opening and the closing table tag) to tab-delimited text: one line per
# <tr>, cells (<td> or <th>) separated by tabs.  The returned string has no
# trailing newline.  Nested tables and row/column spans are not supported.
sub table_to_tdf {
    my ($d) = @_;

    # Tabs and newlines in the input must not be confused with the
    # separators generated below.
    $d =~ s/[\t\n]/ /g;

    # Drop whitespace around angle brackets so that "< td >" becomes "<td>"
    # and cell contents are trimmed.
    $d =~ s/\s*<\s*/</g;
    $d =~ s/\s*>\s*/>/g;

    # Rows: the first <tr> is simply deleted, every further one starts a
    # new output line.  \b keeps look-alike tag names from matching.
    $d =~ s/<tr\b[^>]*>//i;
    $d =~ s/<tr\b[^>]*>/\n/ig;

    # Cells: the first cell of each line is deleted, every further one is
    # preceded by a tab.  \b prevents <thead>/<tbody> from counting as cells.
    $d =~ s/<t[dh]\b[^>]*>//i;                 # first cell of the first line
    $d =~ s/(\n.*?)<t[dh]\b[^>]*>/$1/ig;       # first cell of every other line
    $d =~ s/<t[dh]\b[^>]*>/\t/ig;

    # Delete all remaining tags, then expand entities (in this order, so
    # that an escaped "&lt;b&gt;" is not mistaken for a tag).
    $d =~ s/<[^>]*>//g;
    $d = expand_entities($d);

    $d =~ s/ *\t */\t/g;
    return $d;
}

# Convert every <table> element found in $text and return the complete
# output: each table is followed by a newline and consecutive tables are
# separated by one blank line.  Returns the empty string when $text
# contains no table.
sub xml_to_tdf {
    my ($text) = @_;
    my @chunks;
    while ($text =~ m{<table\b[^>]*>(.*?)</table[^>]*>}gis) {
        my $inner = $1;    # copy before further regex matches clobber $1
        push @chunks, table_to_tdf($inner) . "\n";
    }
    return join "\n", @chunks;
}

# Read all input (files named on the command line, or STDIN), joining the
# lines with spaces so that tags broken across lines still match, and
# print the converted tables.
sub main {
    my $s = "";
    while (my $line = <>) {
        chomp $line;
        $s .= " " . $line;
    }
    print xml_to_tdf($s);
    return;
}

# Run only when executed as a script, so the subs above can be loaded
# (e.g. by a test) without consuming STDIN.
main() unless caller;
#use Encode 'decode';
#use Encode 'encode';
#use strict;
#use XML::Parser;