#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'tpr_tnr' reads one column of the tab-separated input and calculates the
# true positive rate and true negative rate at various cut-offs. The column is
# expected to contain 1 for a positive example and 0 for a negative example.
# If the lines are in perfect order, all positives come before all negatives.
# Use this script on imperfectly sorted files to examine the accuracy of
# the sorting compared to the golden positive-negative dichotomy.
#
# $Id: tpr_tnr,v 1.1 2006/04/09 22:40:16 bojar Exp $

use strict;
use Getopt::Long;

# Command-line handling.
#   --help        print the usage line and abort
#   --skip=N      copy the first N input lines to the output verbatim before
#                 any data line is read
#   --step=N      emit a statistics report before every N-th data line
#   <col_index>   1-based index of the tab-separated column that holds the
#                 gold label (1 = positive, 0 = negative)
my $usage = 0;
my $skip = 0;  # skip n lines before putting the headline
my $step = 30;  # produce stats every step lines
my $usage_msg = "usage: tpr_tnr <col_index>  < infile   > outfile";
# GetOptions returns false for unknown or malformed options; treat that as a
# usage error instead of carrying on with half-parsed arguments.
GetOptions(
  "help"=>\$usage,
  "skip=i"=>\$skip,
  "step=i"=>\$step,
) or die $usage_msg;
my $col = shift;
# The column must be a positive integer. A merely "true" value is not enough:
# strings such as "00" or "abc" would end up as index -1 and silently select
# the last column.
die $usage_msg
  if $usage || !defined $col || $col !~ /\A\d+\z/ || $col < 1;
# The reporting loop computes "$i % $step"; reject a non-positive step here so
# the failure is reported before any input is consumed.
die "tpr_tnr: --step must be a positive integer" if $step < 1;

# Copy the first $skip input lines to the output unchanged so that they are
# not treated as data. The explicit "> 0" test makes a negative --skip behave
# like 0; counting down to "== 0" would never terminate for a negative value
# and would echo the whole input, leaving no data lines. The loop also stops
# when the input has fewer than $skip lines.
while ($skip > 0) {
  my $header = <>;
  last unless defined $header;
  print $header;
  $skip--;
}

# The column index is 1-based on the command line; turn it into a 0-based
# array index.
$col -= 1;

# Read the remaining input into memory. Each entry pairs the gold label found
# in the selected column with the original line (newline included), so the
# line can be echoed verbatim later.
my @data;
while (defined(my $raw = <>)) {
  chomp(my $text = $raw);
  my @fields = split /\t/, $text;
  push @data, [ $fields[$col], $raw ];
}


# Echo every data line and, before every $step-th line, print a report for the
# cut-off at that position: all lines above the cut-off count as predicted
# positive (p1), all lines from the cut-off on as predicted negative (p0).
#
# The counts are kept incrementally: %total holds how often each gold label
# occurs in the whole input, %seen how often it occurred above the current
# cut-off. Each report is then derived from these two hashes without
# rescanning @data.
my %total;
$total{ defined $_->[0] ? $_->[0] : '' }++ for @data;
my %seen;

for my $i (0..$#data) {
  my ($gold, $line) = @{$data[$i]};
  $gold = '' unless defined $gold;  # a missing column counts as the empty label

  if ($i % $step == 0) {
    # Build the "p<pred>, g<gold>" counts; only non-zero counts are listed.
    my %stat;
    for my $label (keys %total) {
      my $above = $seen{$label} || 0;
      my $below = $total{$label} - $above;
      $stat{"p1, g$label"} = $above if $above;
      $stat{"p0, g$label"} = $below if $below;
    }

    # Sort the keys: hash order is unspecified and would otherwise make the
    # output differ between runs on identical input.
    print "$_: $stat{$_}   " for sort keys %stat;
    print "\n";

    my $tp = $stat{"p1, g1"} || 0;  # gold positive, above the cut-off
    my $fn = $stat{"p0, g1"} || 0;  # gold positive, below the cut-off
    my $fp = $stat{"p1, g0"} || 0;  # gold negative, above the cut-off
    my $tn = $stat{"p0, g0"} || 0;  # gold negative, below the cut-off

    my $p = $fn + $tp;
    my $n = $tn + $fp;

    # Each ratio is printed only when its denominator is non-zero.
    printf "TPR: %.2f  ", $tp/$p*100 if $p>0;
    printf "TNR: %.2f  ", $tn/$n*100 if $n>0;

    my $predp = $fp + $tp;
    printf "Prec: %.2f  ", $tp/$predp*100 if $predp>0;
    printf "Rec: %.2f  ", $tp/$p*100 if $p>0;
    print "\n";
  }

  print $line;
  # The current line lies above every later cut-off.
  $seen{$gold}++;
}

