#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'numsort' sorts stdin properly based on numeric values (float supported,
# unlike in vanilla 'sort'), alphabetic values or even frequency.
#
# $Id: numsort,v 1.16 2013-09-21 12:46:14 bojar Exp $
#

use strict;
use locale;
use Getopt::Long;
use FileHandle;
use IPC::Open2;

sub usage {
  print STDERR "numsort sorting-request <stdin >stdout
  --utf8  ... the input is UTF-8
  --delim=NEW_DELIMITER
  --skip=<number_of_lines_to_copy_without_sorting>
  --order=sorting-request  ... useful if sorting request starts with -
Example of sorting requests:
  1        ... sort numerically ascending by the value of column 1
  s1       ... sort numerically and interpret [kK][mM][gG][tT] as 1000
  g1       ... sort numerically and interpret [kK][mM][gG][tT] as 1024
  a-2      ... sort alphabetically desceding by the value of column 2
  d2       ... like 1 but skip value up to the first (minus followed by a) digit
  f-1,a1   ... sort by descending frequency of value of column 1 and then
               alphabetically by the same column. For instance if used on
	       a phone book, all Smiths come at the beginning but will come
	       after Simeons in the unlikely case that you phone book contained
	       the same number of Simeons as Smiths.
 it is possible to use 'n' instead of '-'
";
  exit 1;
}

my %units = qw( k 1 m 2 g 3 t 4 );

my $usage = 0;
my $delim = "\t";
my $order;
my $skip = 0;
my $utf8 = 0;
GetOptions("help" => \$usage, "delim=s" => \$delim, "order=s"=>\$order,
  "skip=i" => \$skip,
  "utf8" => \$utf8
) or exit 1;
$order = shift if !defined $order;
usage() if $usage || !defined $order;

if ($utf8) {
  binmode(STDIN, ":utf8");
  binmode(STDOUT, ":utf8");
  binmode(STDERR, ":utf8");
}

my @order = split /,/, $order;
my %sortcolname;
my @sortopts;
for(my$i=0; $i<=$#order; $i++) {
  my $colname = $order[$i];
  $colname =~ s/^([afdsg]?[-n]?)//;
  my $sortopts = $1;
  if ($colname !~ /^[0-9]+$/) {
    # this is a column name
    print STDERR "Need to read headers due to '$colname' ($order[$i])\n";
    $sortcolname{$colname} = $i;
    $sortopts[$i] = $sortopts;
  }
}
if (0<scalar keys %sortcolname) {
  $skip = 0 if $skip==1;
  my $l = <>;
  print $l;
  chomp $l;
  my @labels = split /\t/, $l;
  my %found = ();
  for(my $i=0; $i<scalar @labels; $i++) {
    my $lab = $labels[$i];
    if (defined $sortcolname{$lab}) {
      $found{$lab} = 1;
      $order[$sortcolname{$lab}] = $sortopts[$sortcolname{$lab}].($i+1);
      # print STDERR "Setting order $sortcolname{$lab} to: $order[$sortcolname{$lab}]\n";
    }
  }
  foreach my $lab (sort keys %sortcolname) {
    next if $found{$lab};
    die "Cannot sort by '$lab', because it is not in the headline.";
  }
}

while ($skip > 0) {
  my $l = <>;
  print $l;
  $skip --;
}

my @data;
my $freq; # $freq->[colnumber]->{key}  ... number of occurrences
while (<>) {
  my $line = $_;
  chomp;
  my @line = ($line, (split /$delim/o));
  # update frequencies; warning, @line iterates the elements from 1, freq from 0
  for(my $i=1; $i<=$#line; $i++) {
    $freq->[$i-1]->{$line[$i]}++;
  }
  push @data, \@line;
}

my @sorted_data = sort
    {
      my_compare($a, $b);
    }
    @data;

foreach my $l (@sorted_data) {
  my ($line, undef) = @{$l};
  print $line;
}

sub my_compare {
  my $a = shift;
  my $b = shift;
  my @a = @{$a}; shift @a;
  my @b = @{$b}; shift @b;
  my $cmp = 0;
  foreach my $origcol (@order) {
    my $col = $origcol;
    my $neg = 1;
    my $skiptilldigit = 0;
    my $type = undef;
    if ($col =~ /^([afsg])/) {
      die "Conflicting type '$type' vs. '$1' in '$origcol'" if defined $type;
      $type = $1;
      $col =~ s/^$type//;
    }
    if ($col =~ /^d/) {
      $col =~ s/^d//;
      $skiptilldigit = 1;
    }
    if ($col =~ /^[-n]/) {
      $col =~ s/^[-n]//;
      $neg = -1;
    }
    $col--;
    my $usea;
    my $useb;
    if ($skiptilldigit) {
      ($usea, $usea) = map { s/^.*?(-?[0-9])/$1/; $_ } ($a[$col], $b[$col]);
    } else {
      ($usea, $useb) = ($a[$col], $b[$col]);
    }
    $type eq "i" if ! defined $type;
    if ($type eq "a") {
      $cmp = $usea cmp $useb;
    } elsif ($type eq "f") {
      $cmp = $freq->[$col]->{$usea} <=> $freq->[$col]->{$useb};
    } elsif ($type eq "g") {
      ($usea, $useb) = map { interpret_units(1024, $_); } ($usea, $useb);
      $cmp = $usea <=> $useb;
    } elsif ($type eq "s") {
      ($usea, $useb) = map { interpret_units(1000, $_); } ($usea, $useb);
      $cmp = $usea <=> $useb;
    } else {
      $cmp = $usea <=> $useb;
    }
    $cmp = $cmp*$neg;
    last if $cmp != 0;
  }
  # print "  ".join(":", @a) ." <$cmp> ". join(":", @b)."\n";
  return $cmp;
}

sub interpret_units {
  my $mult = shift;
  my $val = shift;
  if ($val =~ /^\s*([-0-9.]+)([kmgt])/i) {
    my $power = $units{lc($2)};
    $val = $1 * $mult**$power;;
  }
  return $val;
}


