#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'map' processes stdin to stdout, changing specified columns or expressions
# according to a mapping given in a file or on the command line.
#
# $Id: map,v 1.17 2013-06-17 19:03:46 bojar Exp $
#

use strict;
use Getopt::Long;

sub usage {
  print STDERR "map mapping_file <stdin >stdout
Options:
  --srccol=X   ... source of the mapping, a column number of the mapping file
               ... default is the first column
  --tgtcol=X   ... destination of the mapping, a column number of the m. file
               ... default is the second column (deprecated name: --destcol=X)
  --mapcols=X,Y... columns of stdin to be altered, default: all
  --pattern=RE ... map all occurrences of the given pattern
               ... default is to map exactly the given col
  --trim       ... strip whitespace from data before mapping
  --default=S  ... use this value, if the mapping doesn't define anything
  --quiet      ... no warnings to stderr
  --restrict   ... suppress the while input line, if something was not mapped!
  --map=PERLARRAY ... instead of mapping_file, one can specify the mapping
                   as perl array, such as: '\"green\"=>1,\"red\"=>2'
  --utf8       ... set binmode of all streams to utf-8
Limitations: Pattern can never contain <TAB>. Mapping file is all read.
";
  exit 1;
}

my $default = undef;
my $utf8 = 0;
my $restrict = 0;
my $mapcols = 0;
my $srccol = 1;
my $destcol = 2;
my $pattern = 0;
my $usage = 0;
my $quiet = 0;
my $trim = 0;
my $skip = 0;
my $must_use_all_mappings = 0;
my $debug = 0;
my $mapdata = 0;
GetOptions("help" => \$usage, "mapcols=s" => \$mapcols, "trim" => \$trim,
  "srccol=i" => \$srccol, "tgtcol|destcol=i" => \$destcol,
  "map=s" => \$mapdata, "default=s" => \$default,
  "skip=i" => \$skip,
  "restrict" => \$restrict,
  "utf8" => \$utf8,
  "must-use-all-mappings" => \$must_use_all_mappings,
  "pattern=s" => \$pattern, "quiet" => \$quiet,
) or exit 1;
usage() if $usage;
$srccol --; $destcol --;
my @mapcols = map { $_ -1 } (split /[, ]+/, $mapcols) if $mapcols;

if ($utf8) {
  binmode(STDIN, ":utf8");
  binmode(STDOUT, ":utf8");
  binmode(STDERR, ":utf8");
}

my %is_mapped;
my %mapping;

if (!$mapdata) {
  my $mapfile = shift;
  usage() if !$mapfile;
  my $fh = my_open($mapfile);
  my $nl=0;
  while (<$fh>) {
    $nl++;
    chomp;
    my @line = split /\t/;
    $line[$srccol] =~ s/^ *| *$//g if $trim;
    $line[$destcol] =~ s/^ *| *$//g if $trim;
    if ($is_mapped{$line[$srccol]} && $mapping{$line[$srccol]} ne $line[$destcol]) {
      print STDERR "$mapfile:$nl:Ambigous mapping for >$line[$srccol]<\n"
        if !$quiet;
    } else {
      $is_mapped{$line[$srccol]} = 1;
      $mapping{$line[$srccol]} = $line[$destcol];
    }
  }
  close $fh;
} else {
  eval '%mapping = ('.$mapdata.')';
  foreach my $k (keys %mapping) {
    $is_mapped{$k} = 1;
  }
}

my $nl = 0;
while ($skip > 0) {
  my $l = <>;
  print $l;
  $skip--;
  $nl++;
}
my $line_mapped;
my %mapping_used;
while (<>) {
  $nl++;
  $line_mapped = 1;
  chomp;
  my @line = split /\t/;

  if ($mapcols) {
    for my $col (@mapcols) {
      $line[$col] =~ s/^ *| *$//g if $trim;
      $line[$col] = do_map($line[$col]);
    }
  } else {
    for my $col (0 .. $#line) {
      $line[$col] =~ s/^ *| *$//g if $trim;
      $line[$col] = do_map($line[$col]);
    }
  }
  print join("\t", @line)."\n"
    if !$restrict || $line_mapped;
}

if ($must_use_all_mappings) {
  my $err = 0;
  foreach my $s (sort keys %mapping) {
    next if $mapping_used{$s};
    print STDERR "Unused mapping: $s\t$mapping{$s}\n";
    $err++;
  }
  exit 1 if $err;
}

sub perform_map {
  my $s = shift;
  print "MAPPING: $s  ..>  $mapping{$s}\n" if $debug;
  if ($is_mapped{$s}) {
    $mapping_used{$s}++;
    return $mapping{$s};
  }
  $line_mapped = 0;
  if (!$quiet) {
    print STDERR "$nl:W:No mapping for >$s<"
      .(defined $default ? ", using the default >$default<" : "")
      ."\n";
  }
  return $default if defined $default;
  return $s;
}

sub do_map {
  my $s = shift;
  
  if ($pattern) {
    if ($pattern =~ /(^|[^\\])\(/) {
      # pattern contains selector
      $s =~ s/$pattern/perform_map $1/ge;
    } else {
      # no selector in pattern, select the whole result
      $s =~ s/($pattern)/perform_map $1/ge;
    }
    return $s;
  } else {
    return perform_map($s);
  }
}


sub my_open {
  my $f = shift;
  if ($f eq "-") {
    binmode(STDIN, ":utf8");
    return *STDIN;
  }

  die "Not found: $f" if ! -e $f;

  my $opn;
  my $hdl;
  my $ft = `file '$f'`;
  # file might not recognize some files!
  if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
    $opn = "zcat '$f' |";
  } elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
    $opn = "bzcat '$f' |";
  } else {
    $opn = "$f";
  }
  open $hdl, $opn or die "Can't open '$opn': $!";
  binmode $hdl, ":utf8";
  return $hdl;
}

