#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'pickre' is extremely useful to collect specific information from every line.
# The $1 output of a given regexp is prepended (tab-delimited) at the beginning
# of every line.
#
# $Id: pickre,v 1.7 2011-11-24 16:55:21 bojar Exp $
#
use strict;
use Getopt::Long;

my $re = 0;
my $collect = 0;
my $uniq = 0;
my $pick = 0;
my $delim = " ";
my $cut = 0;
my $utf = 0;
my $col = undef;
GetOptions(
  "re=s"=>\$re,
  "delim=s"=>\$delim,
  "collect"=>\$collect,
  "uniq"=>\$uniq,
  "pick"=>\$pick,
  "cut"=>\$cut,
  "utf8"=>\$utf,
  "col=i"=>\$col,
) or exit 1;
$collect = 1 if $uniq;

usage() if !$re;
sub usage {
  print STDERR "pickre --re=what_to_pick
  Searches for a token on every line and precedes the line with an extra column
containing the token (if found).
Options:
  --collect  ... delimit with space all collected tokens
  --uniq     ... ignore number of occurrences and order of found tokens
                 (implies collect)
  --pick     ... print only lines where something was indeed found
  --delim=' ' ... join multiple tokens of output with this delimiter
  --cut ... don't append the original line (used to be called just-output
  --col=N    ... pick only from column N (numbered from 1)
  --utf8     ... assume input in utf-8
";
  exit 1;
}
$re = "($re)" if $re !~ /[^\\]\(|^\(/;

if ($utf) {
  binmode(STDIN, ":utf8");
  binmode(STDOUT, ":utf8");
  binmode(STDERR, ":utf8");
}

while (<>) {
  my $line = $_;
  chomp $line;
  my @toks = ();
  my $done = 0;
  my $haystack;
  if (defined $col) {
    my @cols = split /\t/, $line;
    $haystack = $cols[$col-1];
  } else {
    $haystack = $line;
  }
  while (($#toks < 0 || $collect) && !$done) {
    if ($haystack =~ /$re/o) {
      push @toks, $1;
      $haystack =~ s/$re//o;
    } else {
      $done = 1;
    }
  }
  my @usetoks;
  if ($uniq) {
    my %tmp = map {($_,1)} @toks;
    @usetoks = sort keys %tmp;
  } else {
    @usetoks = @toks;
  }
  my $info = join($delim, @usetoks);
  if (!$pick || scalar @usetoks) {
    print "$info";
    if ($cut) {
      print "\n";
    } else {
      print "\t$line\n";
    }
  }
}
