#!/usr/bin/perl # The Missing Textutils, Ondrej Bojar, obo@cuni.cz # http://www.cuni.cz/~obo/textutils # # 'pastetags' is the complement of picktags. It replaces the contents of a tag # by the input. # Please note that the script is rather picky and works well only on texts # originally generated by picktags. Avoid touching tab and newline characters! # # You'd probably find LT XML tools much more versatile: # http://www.ltg.ed.ac.uk/software/xml/index.html # # $Id: pastetags,v 1.3 2009/01/02 12:23:13 bojar Exp $ # use strict; use Getopt::Long; my $help = 0; my $pick = 0; GetOptions( "help"=>\$help, "pick"=>\$pick, ) or exit 1; my $matrixfn = shift; $help = 1 if !defined $matrixfn; my $tagcnt = 0; my @tags = (); while (my $tag = shift) { push @tags, $tag; $tagcnt++; } if ($help || !$tagcnt) { print STDERR "usage: pastetags matrixfile.xml \"MMt.*?\" ... < values-to-paste\n outputs tab-separated file of *first* values of matching tags. Tags assumed *non*pair. Beware using greedy *! It would eat up also the end of the tag. "; exit 1; } *INF = my_open($matrixfn); my $nr = 0; while (<>) { $nr++; chomp; my $values = $_; chomp $values; my @values = split /\t/, $values; my $got_some = 0; while (!$got_some) { my $matrix = ; die "$matrixfn too short!" if !defined $matrix; foreach my $tag (@tags) { if ($matrix =~ /(<$tag>)([^<]*)/) { $got_some = 1; last; } } if (!$pick || $got_some) { foreach my $tag (@tags) { if ($matrix =~ s/(<$tag>)([^<]*)/$1$values[0]/) { die "$nr:Too few values in line: $values\n$nr:to fill matrix line: $matrix\n" if !defined $values[0]; shift @values; } } } print $matrix; } } while () { foreach my $tag (@tags) { if (/(<$tag>)([^<]*)/) { die "$matrixfn too long!, got $&"; } } print; } close INF; sub my_open { my $f = shift; die "Not found: $f" if ! -e $f; my $opn; my $hdl; my $ft = `file $f`; # file might not recognize some files! if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) { $opn = "zcat $f |"; } elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) { $opn = "bzcat $f |"; } else { $opn = "$f"; } open $hdl, $opn or die "Can't open '$opn': $!"; binmode $hdl, ":utf8"; return $hdl; }