#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'quantize' is a first step to histogram. It replaces every value in the
# specified column with the label of the "box" where the value fits.
#
# $Id: quantize,v 1.5 2013-07-25 23:19:18 bojar Exp $

use strict;
use Getopt::Long;

my $collapse = 1; # show 3-3 as "3"
my $discrete = '';
my $discrete_leftist = 0;
my $seq;
my $seqprec = 0;
my $min = "-inf";
my $max = "+inf";
my $verbose = 0;
my $skip = 0; # skip any line
my $help = 0;
my $method = "manual"; # the user should provide boxbounds
my $boxes = 10; # if $method ne "manual", how many boxes to use
GetOptions(
  "skip=i" => \$skip,
  "discrete" => \$discrete,
  "leftist" => \$discrete_leftist,
  "verbose" => \$verbose,
  "min=s" => \$min, "max=s" => \$max,
  "seq=s" => \$seq,
  "seqprec=i" => \$seqprec,
  "help" => \$help,
  "method=s" => \$method,
  "boxes=i" => \$boxes,
) or exit 1;

my $colindex = shift;
$colindex -- if defined $colindex;

my $bars;
if (defined $seq) {
  my ($a, $step, $b) = split /,/, $seq;
  $min = $a if $min eq "-inf";
  $max = $b if $max eq "+inf";
  $a+=$step;
  my @bounds = ();
  while ($a < $b) {
    push @bounds, $a;
    $a+=$step;
  }
  $bars = join(",", map {sprintf("%.$seqprec"."f", $_)} @bounds);
  # print "$boxeslabel\n";
} else {
  if ($method eq "even" || $method eq "histogram") {
    # need to preload the data and estimate boxes
    $bars = "AUTO";
  } elsif ($method eq "manual") {
    $bars = shift;
  } else {
    die "Unknown method $method";
  }
}
if (!$bars || $help || !defined $colindex) {
  print STDERR "usage: quantize colindex boxesdesc < infile > outfile
Options:  
--skip=N    ... dump first N lines without quantizing
--min=X     ... make the label of the first box to to start at X
--max=X     ... make the label of the last box to end at X
--seq=A,STEP,B  ... set boxes delimiters to start at A and stop at B, stepping
                    by STEP. Eg.: --seq=1,3,20 is equivalent to specifying
                    boxesdesc as: 1,4,7,10,13,16,19,20
--seqprec=P ... the automatic boxes should be labelled with the specified
                precision, e.g.:
                --seq=0,0.1,1 --seqprec=1  produces 0.1,0.2,...0.9,1.0
--discrete  ... the values are discrete, so the boxes should be labelled
                1 - 10, 11 - 20, 21 - 30 and not 1 - 10, 10 - 20, 20 - 30
--leftist   ... the values are discrete, so the boxes should be labelled
                0 - 9, 10 - 19, 20 - 29 and not 1 - 10, 10 - 20, 20 - 30
--method=even|histogram  ... automatically guess box boundaries
--boxes=N   ... number of boxes to use when automatically guessing
";
  exit 1;
}



while ($skip > 0) {
  $_ = <>;
  $skip--;
  print;
}


my $preloaded = undef;
if ($bars eq "AUTO") {
  # need to preload data and choose the bars positions
  my @nums = ();
  while (<>) {
    chomp;
    my @line = split /\t/;
    push @nums, $line[$colindex];
    push @$preloaded, [@line];
  }

  my @sortednums = sort { $b <=> $a } @nums;
  my $lowbound = $sortednums[-1];
  my $highbound = $sortednums[0];
  $min = $lowbound;
  $max = $highbound;
  if ($method eq "even") {
    my $step = ($highbound-$lowbound)/$boxes;
    my @bars = ();
    for my $i (1..$boxes) {
      my $bar = $lowbound+$i*$step;
      $bar = int($bar) if $discrete || $discrete_leftist;
      push @bars, $bar;
    }
    $bars = join(",", @bars);
  } elsif ($method eq "histogram") {
    my $sum = 0;
    grep {$sum+=$_} @sortednums;
    my $boxlimit = $sum/$boxes;
    my $sumsofar = 0;
    my $boxempty = 1;
    my $should_go_in_this_box = undef;
    my $boxes_assigned = 1;
    my @bars = ();
    foreach my $num (@sortednums) {
      if ($sumsofar + $num <= $boxlimit*$boxes_assigned || $boxempty
          || $should_go_in_this_box == $num) {
        # extending the current box (the number fits in, or the box is still
	# empty or a same number was already in the box)
	# this behaviour is greedy, it'd be better to choose the between the two
	# bar options on the left or right side of a sequence of equal numbers
	$should_go_in_this_box = $num;
        $boxempty = 0;
      } else {
        # starting a new box,
	# ie. setting bar between $should_go_in_this_box and $num
	$boxes_assigned++;
	if ($discrete || $discrete_leftist) {
	  push @bars, $should_go_in_this_box;
	} else {
	  push @bars, ($should_go_in_this_box+$num)/2;
	}
	$boxempty = 1;
      }
      $sumsofar += $num;
    }
    $bars = join(",", reverse @bars);
    print STDERR "Using bars: $bars\n";
  }
}



my @boxbeg;
my @boxend;
my $boxcnt;

my @bars = split /,/, $bars;
my $i = 0;
$boxbeg[0] = $min - $discrete_leftist;
for my $bar (@bars) {
  $boxbeg[$i] = $boxend[$i-1] + $discrete + $discrete_leftist if $i >0;
  $boxend[$i] = $bar - $discrete_leftist;
  $i++;
}
$boxbeg[$i] = $boxend[$i-1] + $discrete;
$boxend[$i] = $max;
$i++;
$boxcnt = $i -1;


if (defined $preloaded) {
  foreach my $line (@$preloaded) {
    my @line = @$line;
    my $box = getbox($line[$colindex]);
    $line[$colindex] = "$boxbeg[$box] - $boxend[$box]";
    $line[$colindex] = $boxbeg[$box]
      if $collapse && $boxbeg[$box] eq $boxend[$box];
    print join("\t", @line)."\n";
  }
} else {
  while (<>) {
    chomp;
    my @line = split /\t/;
    my $box = getbox($line[$colindex]);
    $line[$colindex] = "$boxbeg[$box] - $boxend[$box]";
    $line[$colindex] = $boxbeg[$box]
      if $collapse && $boxbeg[$box] eq $boxend[$box];
    print join("\t", @line)."\n";
  }
}


sub getbox {
  my $val = shift;
  my $i;
  for($i =0; $i <= $boxcnt; $i++) {
    if ($max ne "+inf" && $val > $max) {
      print STDERR "$val higher than max $max, increasing max\n";
      $max = $val;
    }
    if ($min ne "+inf" && $val < $min) {
      print STDERR "$val lower than min $min, decreasing min\n";
      $min = $val;
    }
    if ($discrete_leftist) {
      last if $val < $boxend[$i];
    } else {
      last if $val <= $boxend[$i];
    }
  }
  $i = $boxcnt if $i > $boxcnt;
  return $i;
}
