#!/usr/bin/perl # The Missing Textutils, Ondrej Bojar, obo@cuni.cz # http://www.cuni.cz/~obo/textutils # # 'quantize' is a first step to histogram. It replaces every value in the # specified column with the label of the "box" where the value fits. # # $Id: quantize,v 1.4 2006/10/11 22:20:08 bojar Exp $ use strict; use Getopt::Long; my $discrete = ''; my $seq; my $seqprec = 0; my $min = "-inf"; my $max = "+inf"; my $verbose = 0; my $skip = 0; # skip any line my $help = 0; my $method = "manual"; # the user should provide boxbounds my $boxes = 10; # if $method ne "manual", how many boxes to use GetOptions( "skip=i" => \$skip, "discrete" => \$discrete, "verbose" => \$verbose, "min=s" => \$min, "max=s" => \$max, "seq=s" => \$seq, "seqprec=i" => \$seqprec, "help" => \$help, "method=s" => \$method, "boxes=i" => \$boxes, ) or exit 1; my $colindex = shift; $colindex -- if defined $colindex; my $bars; if (defined $seq) { my ($a, $step, $b) = split /,/, $seq; $min = $a if $min eq "-inf"; $max = $b if $max eq "+inf"; $a+=$step; my @bounds = (); while ($a < $b) { push @bounds, $a; $a+=$step; } $bars = join(",", map {sprintf("%.$seqprec"."f", $_)} @bounds); # print "$boxeslabel\n"; } else { if ($method eq "even" || $method eq "histogram") { # need to preload the data and estimate boxes $bars = "AUTO"; } elsif ($method eq "manual") { $bars = shift; } else { die "Unknown method $method"; } } if (!$bars || $help || !defined $colindex) { print STDERR "usage: quantize colindex boxesdesc < infile > outfile Options: --skip=N ... dump first N lines without quantizing --min=X ... make the label of the first box to to start at X --max=X ... make the label of the last box to end at X --seq=A,STEP,B ... set boxes delimiters to start at A and stop at B, stepping by STEP. Eg.: --seq=1,3,20 is equivalent to specifying boxesdesc as: 1,4,7,10,13,16,19,20 --seqprec=P ... the automatic boxes should be labelled with the specified precision, e.g.: --seq=0,0.1,1 --seqprec=1 produces 0.1,0.2,...0.9,1.0 --discrete ... the values are discrete, so the boxes should be labelled 1 - 10, 11 - 20, 21 - 30 and not 1 - 10, 10 - 20, 20 - 30 --method=even|histogram ... automatically guess box boundaries --boxes=N ... number of boxes to use when automatically guessing "; exit 1; } while ($skip > 0) { $_ = <>; $skip--; print; } my $preloaded = undef; if ($bars eq "AUTO") { # need to preload data and choose the bars positions my @nums = (); while (<>) { chomp; my @line = split /\t/; push @nums, $line[$colindex]; push @$preloaded, [@line]; } my @sortednums = sort { $b <=> $a } @nums; my $lowbound = $sortednums[-1]; my $highbound = $sortednums[0]; $min = $lowbound; $max = $highbound; if ($method eq "even") { my $step = ($highbound-$lowbound)/$boxes; my @bars = (); for my $i (1..$boxes) { my $bar = $lowbound+$i*$step; $bar = int($bar) if $discrete; push @bars, $bar; } $bars = join(",", @bars); } elsif ($method eq "histogram") { my $sum = 0; grep {$sum+=$_} @sortednums; my $boxlimit = $sum/$boxes; my $sumsofar = 0; my $boxempty = 1; my $should_go_in_this_box = undef; my $boxes_assigned = 1; my @bars = (); foreach my $num (@sortednums) { if ($sumsofar + $num <= $boxlimit*$boxes_assigned || $boxempty || $should_go_in_this_box == $num) { # extending the current box (the number fits in, or the box is still # empty or a same number was already in the box) # this behaviour is greedy, it'd be better to choose the between the two # bar options on the left or right side of a sequence of equal numbers $should_go_in_this_box = $num; $boxempty = 0; } else { # starting a new box, # ie. setting bar between $should_go_in_this_box and $num $boxes_assigned++; if ($discrete) { push @bars, $should_go_in_this_box; } else { push @bars, ($should_go_in_this_box+$num)/2; } $boxempty = 1; } $sumsofar += $num; } $bars = join(",", reverse @bars); print STDERR "Using bars: $bars\n"; } } my @boxbeg; my @boxend; my $boxcnt; my @bars = split /,/, $bars; my $i = 0; $boxbeg[0] = $min; for my $bar (@bars) { $boxbeg[$i] = $boxend[$i-1] + $discrete if $i >0; $boxend[$i] = $bar; $i++; } $boxbeg[$i] = $boxend[$i-1] + $discrete; $boxend[$i] = $max; $i++; $boxcnt = $i -1; if (defined $preloaded) { foreach my $line (@$preloaded) { my @line = @$line; my $box = getbox($line[$colindex]); $line[$colindex] = "$boxbeg[$box] - $boxend[$box]"; print join("\t", @line)."\n"; } } else { while (<>) { chomp; my @line = split /\t/; my $box = getbox($line[$colindex]); $line[$colindex] = "$boxbeg[$box] - $boxend[$box]"; print join("\t", @line)."\n"; } } sub getbox { my $val = shift; my $i; for($i =0; $i <= $boxcnt; $i++) { last if $val <= $boxend[$i]; } $i = $boxcnt if $i > $boxcnt; return $i; }