#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'pastetags' is the complement of picktags. It replaces the contents of a tag
# by the input.
# Please note that the script is rather picky and works well only on texts
# originally generated by picktags. Avoid touching tab and newline characters!
#
# You'd probably find LT XML tools much more versatile:
#   http://www.ltg.ed.ac.uk/software/xml/index.html
#
# $Id: pastetags,v 1.3 2009/01/02 12:23:13 bojar Exp $
#

use strict;
use Getopt::Long;

my $help = 0;
my $pick = 0;
GetOptions(
  "help"=>\$help,
  "pick"=>\$pick,
) or exit 1;

my $matrixfn = shift;

$help = 1 if !defined $matrixfn;

my $tagcnt = 0;
my @tags = ();
while (my $tag = shift) {
  push @tags, $tag;
  $tagcnt++;
}
if ($help || !$tagcnt) {
  print STDERR "usage: pastetags matrixfile.xml \"MMt.*?\" ... < values-to-paste\n
outputs tab-separated file of *first* values of matching tags.
Tags assumed *non*pair.
Beware using greedy *! It would eat up also the end of the tag.
";
  exit 1;
}

*INF = my_open($matrixfn);

my $nr = 0;
while (<>) {
  $nr++;
  chomp;
  my $values = $_;
  chomp $values;
  my @values = split /\t/, $values;

  my $got_some = 0;
  while (!$got_some) {
    my $matrix = <INF>;
    die "$matrixfn too short!" if !defined $matrix;

    foreach my $tag (@tags) {
      if ($matrix =~ /(<$tag>)([^<]*)/) {
        $got_some = 1;
        last;
      }
    }
    if (!$pick || $got_some) {
      foreach my $tag (@tags) {
        if ($matrix =~ s/(<$tag>)([^<]*)/$1$values[0]/) {
          die "$nr:Too few values in line: $values\n$nr:to fill matrix line: $matrix\n"
            if !defined $values[0];
          shift @values;
        }
      }
    }
    print $matrix;
  }
}
while (<INF>) {
  foreach my $tag (@tags) {
    if (/(<$tag>)([^<]*)/) {
      die "$matrixfn too long!, got $&";
    }
  }
  print;
}
close INF;


sub my_open {
  my $f = shift;
  die "Not found: $f" if ! -e $f;

  my $opn;
  my $hdl;
  my $ft = `file $f`;
  # file might not recognize some files!
  if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
    $opn = "zcat $f |";
  } elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
    $opn = "bzcat $f |";
  } else {
    $opn = "$f";
  }
  open $hdl, $opn or die "Can't open '$opn': $!";
  binmode $hdl, ":utf8";
  return $hdl;
}
