#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'update_cols' updates stdin and produces stdout according to the given update
# file.
# All files are expected to be tab-delimited.
#
# $Id: update_cols,v 1.5 2005/10/10 07:33:50 bojar Exp $
#

use Getopt::Long;

# Print the command-line help to STDERR and terminate with exit status 1.
# Takes no arguments and never returns.
sub usage {
  # The "\t" escape in the "Use 0" line emits the tab character that
  # the help text has always contained at that position.
  print STDERR <<"END_OF_USAGE";
update_cols update_file <stdin >stdout
Options:
  --paste= ... comma delimited list of indices of cols in the pasted (update)
               file, with respect to the columns in the main file
\t       Use 0 to ignore the column.
  --keys=  ... comma delimited set of col indices from pasted that serve as keys
               to decide whether the update is to be done on the current line
  --trim      ... strip whitespace from data before mapping
Limitations: Update file is read to the memory.
Example:
  --keys=1 --paste=0,3,4
  If the value of the column 1 in main input is equal to a value in the column 1
  of a line in the update file, then the line of the update file is used as
  follows: the first column is not used, the second column is used to replace
  the column 3 and the third column is used to replace the column 4 of the input
  line.
END_OF_USAGE
  exit 1;
}

# ---- Command-line handling and column bookkeeping ----
# The script does not run under 'use strict'; the variables assigned here
# without 'my' are package globals that the rest of the script reads directly.
$pastecols = 0;   # raw --paste value (comma/space separated, 1-based, 0 = ignore)
$keycols = 0;     # raw --keys value (comma/space separated, 1-based)
$usage = 0;       # --help
$trim = 0;        # --trim
$verbose = 0;     # --verbose: trace key lookups and replacements on STDERR

# GetOptions returns false when it meets an unknown or malformed option.
# Show the usage instead of carrying on with a half-parsed command line.
GetOptions("help" => \$usage, "trim" => \$trim, "verbose" => \$verbose,
           "paste=s" => \$pastecols, "keys=s" => \$keycols)
  or usage();
usage() if $usage;

# The first non-option argument is the update file; both column lists are
# mandatory.
$mapfile = shift;
usage() if !$mapfile || !$pastecols || !$keycols;

# A key column must be a real (1-based) column number.  Anything else would
# turn into index -1 below and silently select the LAST column of each line.
foreach my $spec (split /[, ]+/, $keycols) {
  die "update_cols: Bad key column '$spec' in --keys; expected a positive column number\n"
    unless $spec =~ /^[0-9]+$/ && $spec > 0;
}

# Convert the 1-based column numbers to 0-based array indices.  A paste column
# given as 0 becomes -1, which marks "ignore this column of the update file".
@pastecols = map { $_ - 1 } (split /[, ]+/, $pastecols);
@keycols = map { $_ - 1 } (split /[, ]+/, $keycols);

# Reverse lookup: main-file column index => position of that column in the
# update file (a later duplicate overrides an earlier one).  Only the
# commented-out debugging code refers to it.
%index_in_paste_file = ();
@index_in_paste_file{@pastecols} = (0 .. $#pastecols);

# Highest main-file column index that may be written (never below 0).
my $lastpastecol = 0;
foreach my $c (@pastecols) {
  $lastpastecol = $c if $c > $lastpastecol;
}
#print STDERR join(":", %index_in_paste_file)."\n";
#print STDERR join(":", @pastecols)."\n";
#print STDERR join(":", @keycols)."\n";

# ---- Load the update file into memory ----
# Fills the global %mapping: key string (the key columns joined by a tab)
# => reference to the array of fields of that update line.  Duplicate keys are
# reported on STDERR and make the script exit with status 1 once the whole
# file has been read.

# Three-argument open with a lexical handle: the file name is taken literally
# and can never be interpreted as a mode or a pipe.  $! tells why it failed.
open(my $mf, '<', $mapfile)
  or die("update_cols: Can't open update file: $mapfile: $!");
my $err = 0;
while (my $row = <$mf>) {
  my $lineno = $.;   # line number within the update file, for diagnostics
  chomp $row;
  # Limit -1 keeps trailing empty fields, so an empty last column stays an
  # empty string instead of vanishing from the list.
  my @line = split /\t/, $row, -1;
  if ($trim) {
    # Strips leading and trailing spaces (only the space character).
    s/^ *| *$//g foreach @line;
  }
  #print STDERR join("#", @line)."\n";
  # The key columns are looked up at the same indices as in the main input;
  # a missing column counts as an empty string.
  my $key = join("\t", map { defined $line[$_] ? $line[$_] : '' } @keycols);
  #print STDERR "Defining key: $key\n";
  if (exists $mapping{$key}) {
    print STDERR "$mapfile:$lineno:Key '$key' already defined.\n";
    $err = 1;
  } else {
    $mapping{$key} = \@line;
  }
}
close $mf;

exit 1 if $err;

# ---- Filter the main input ----
# Reads the main input (remaining file arguments, or STDIN), and for every
# line whose key columns match an entry of %mapping overwrites the columns
# listed in @pastecols with the corresponding fields of the update line.
# Every input line is written to STDOUT, updated or not.
while (my $row = <>) {
  chomp $row;
  # Limit -1 keeps trailing empty columns; without it every output line would
  # silently lose them, changing the column count of the data.
  my @fields = split /\t/, $row, -1;
  # A missing key column counts as an empty string.
  my $key = join("\t", map { defined $fields[$_] ? $fields[$_] : '' } @keycols);
  print STDERR "Using key: $key\n" if $verbose;
  if (defined $mapping{$key}) {
    my @update = @{$mapping{$key}};
    print STDERR "  update: @update\n" if $verbose;
    foreach my $i (0 .. $#pastecols) {
      my $target = $pastecols[$i];
      # -1 marks an update-file column that is to be ignored; any other
      # negative index would write relative to the end of the row, so skip
      # those as well.
      next if $target < 0;
      print STDERR "Replacing $target ($fields[$target]) with $update[$i]\n" if $verbose;
      # If the update line is shorter than the paste list, the target
      # column is blanked (undef prints as an empty string).
      $fields[$target] = $update[$i];
    }
    # Remember which update lines were actually applied.
    $used{$key}++;
  }
  # Columns created by writing past the end of a short row are undef;
  # print them as empty strings.
  print join("\t", map { defined $_ ? $_ : '' } @fields)."\n";
}

# Normal end of the script.
exit;

# NOTE: everything below the unconditional 'exit' above is unreachable.
# It would list on STDERR every update line whose key was never matched by
# a line of the main input.
for my $unused_key (grep { !$used{$_} } keys %mapping) {
  my $fields = $mapping{$unused_key};
  print STDERR "Unused update line: ", join("\t", @$fields), "\n";
}


