#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'xml_to_tdf' converts <table> elements to a plain tab-delimited text
# file. If several <table> elements are found, a blank line is put in between.
# Of course, neither nested tables nor spans are supported.
# Tabs in input are replaced with space.
# All XML tags are deleted.
# Basic XML entities are expanded.
#
# $Id: xml_to_tdf,v 1.3 2010-11-02 16:07:41 bojar Exp $

use strict;
use warnings;

# Expand the five predefined XML entities in a string and return the result.
# '&amp;' is expanded last so that an escaped entity such as '&amp;lt;'
# yields the literal text '&lt;' instead of being decoded twice into '<'.
sub decode_basic_entities {
    my ($text) = @_;
    $text =~ s/&quot;/"/g;
    $text =~ s/&apos;/'/g;
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&amp;/&/g;
    return $text;
}

# Convert the inner markup of one <table> element (everything between the
# opening and the closing tag) to tab-delimited text: one output line per
# <tr>, cells separated by a single tab. The returned string carries no
# trailing newline.
sub table_to_tdf {
    my ($d) = @_;

    # Tabs and newlines are reserved as output separators.
    $d =~ s/[\t\n]/ /g;

    # Collapse whitespace between adjacent tags so that pretty-printed
    # markup does not leak blanks around row boundaries.
    # NOTE(review): the statement at this position was garbled beyond
    # recovery in the copy this file was restored from (it read
    # 's/\s*\s*/>/g'); this is a best-effort replacement -- confirm
    # against the upstream version.
    $d =~ s{>\s+<}{><}g;

    $d =~ s{<tr[^>]*>}{}i;                 # delete the first <tr>
    $d =~ s{<tr[^>]*>}{\n}ig;              # every further <tr> starts a new line
    $d =~ s{<td[^>]*>}{}i;                 # delete the first <td>
    $d =~ s{(\n.*?)<td[^>]*>}{$1}ig;       # delete the first <td> on every line
    $d =~ s{<td[^>]*>}{\t}ig;              # remaining <td>s separate cells
    $d =~ s{<[^>]*>}{}g;                   # drop all other tags

    $d = decode_basic_entities($d);

    # Blanks directly around a tab are padding, not cell content.
    $d =~ s/ *\t */\t/g;
    return $d;
}

# Convert every <table>...</table> element found in $s and return the
# complete output text: each table is followed by a newline and consecutive
# tables are separated by one blank line. Returns the empty string when $s
# contains no table.
sub xml_to_tdf {
    my ($s) = @_;
    my @tables;

    # A single m//g scan visits each table once; the match is non-greedy,
    # so a table ends at the first closing tag (nested tables unsupported).
    while ($s =~ m{<table[^>]*>(.*?)</table[^>]*>}ig) {
        my $inner = $1;
        push @tables, table_to_tdf($inner);
    }
    return join "\n", map { $_ . "\n" } @tables;
}

# Read all input (files named on the command line, or STDIN), join the
# lines with single spaces and print the converted tables to STDOUT.
sub main {
    my $s = "";
    while (my $line = <>) {
        chomp $line;
        $s .= " " . $line;
    }
    print xml_to_tdf($s);
    return;
}

# Run only when executed as a script, so the subs above can be loaded
# from another file without consuming input.
main() unless caller;

#use Encode 'decode';
#use Encode 'encode';
#use strict;
#use XML::Parser;