#!/usr/bin/perl # The Missing Textutils, Ondrej Bojar, obo@cuni.cz # http://www.cuni.cz/~obo/textutils # # 'numsort' sorts stdin properly based on numeric values (float supported, # unlike in vanilla 'sort'), alphabetic values or even frequency. # # $Id: numsort,v 1.13 2010-11-02 17:06:04 bojar Exp $ # use strict; use Getopt::Long; use FileHandle; use IPC::Open2; sub usage { print STDERR "numsort sorting-request stdout --utf8 ... the input is UTF-8 --delim=NEW_DELIMITER --skip= --order=sorting-request ... useful if sorting request starts with - Example of sorting requests: 1 ... sort numerically ascending by the value of column 1 a-2 ... sort alphabetically desceding by the value of column 2 d2 ... like 1 but skip value up to the first (minus followed by a) digit f-1,a1 ... sort by descending frequency of value of column 1 and then alphabetically by the same column. For instance if used on a phone book, all Smiths come at the beginning but will come after Simeons in the unlikely case that you phone book contained the same number of Simeons as Smiths. it is possible to use 'n' instead of '-' "; exit 1; } my $usage = 0; my $delim = "\t"; my $order; my $skip = 0; my $utf8 = 0; GetOptions("help" => \$usage, "delim=s" => \$delim, "order=s"=>\$order, "skip=i" => \$skip, "utf8" => \$utf8 ) or exit 1; $order = shift if !defined $order; usage() if $usage || !defined $order; if ($utf8) { binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); } my @order = split /,/, $order; my %sortcolname; my @sortopts; for(my$i=0; $i<=$#order; $i++) { my $colname = $order[$i]; $colname =~ s/^([afd]?[-n]?)//; my $sortopts = $1; if ($colname !~ /^[0-9]+$/) { # this is a column name print STDERR "Need to read headers due to '$colname' ($order[$i])\n"; $sortcolname{$colname} = $i; $sortopts[$i] = $sortopts; } } if (0; print $l; chomp $l; my @labels = split /\t/, $l; my %found = (); for(my $i=0; $i 0) { my $l = <>; print $l; $skip --; } my @data; my $freq; # $freq->[colnumber]->{key} ... number of occurrences while (<>) { my $line = $_; chomp; my @line = ($line, (split /$delim/o)); # update frequencies; warning, @line iterates the elements from 1, freq from 0 for(my $i=1; $i<=$#line; $i++) { $freq->[$i-1]->{$line[$i]}++; } push @data, \@line; } my @sorted_data = sort { my_compare($a, $b); } @data; foreach my $l (@sorted_data) { my ($line, undef) = @{$l}; print $line; } sub my_compare { my $a = shift; my $b = shift; my @a = @{$a}; shift @a; my @b = @{$b}; shift @b; my $cmp = 0; foreach my $origcol (@order) { my $col = $origcol; my $neg = 1; my $alph = 0; my $usefreq = 0; my $skiptilldigit = 0; if ($col =~ /^a/) { $col =~ s/^a//; $alph = 1; } if ($col =~ /^f/) { $col =~ s/^f//; $usefreq = 1; } if ($col =~ /^d/) { $col =~ s/^d//; $skiptilldigit = 1; } if ($col =~ /^[-n]/) { $col =~ s/^[-n]//; $neg = -1; } $col--; if ($alph) { $cmp = $a[$col] cmp $b[$col]; } elsif ($usefreq) { $cmp = $freq->[$col]->{$a[$col]} <=> $freq->[$col]->{$b[$col]}; } elsif ($skiptilldigit) { my ($tmpa, $tmpb) = map { s/^.*?(-?[0-9])/$1/; $_ } ($a[$col], $b[$col]); $cmp = $tmpa <=> $tmpb; } else { $cmp = $a[$col] <=> $b[$col]; } $cmp = $cmp*$neg; last if $cmp != 0; } # print " ".join(":", @a) ." <$cmp> ". join(":", @b)."\n"; return $cmp; }