#!/usr/bin/perl # The Missing Textutils, Ondrej Bojar, obo@cuni.cz # http://www.cuni.cz/~obo/textutils # # 'zwc' is a word counting utility (wc) on (normal and) compressed files. # Allows also counting words in each column separately. # Allows also grouping using a column value (but always within each file). # Allows also quick estimate based on the beginning of the file. # # $Id: zwc,v 1.2 2012-01-23 08:36:16 bojar Exp $ use strict; use Getopt::Long; my $fl = undef; my @glob = (); my $countw = 0; my $countc = 0; my $countb = 0; my $countl = 0; my $countcw = 0; my $ensure_input_col_count = undef; my $total_only = 0; my $groupcol = undef; my $utf = 0; my $estimate = 0; my $estimate_time_limit = 10; # estimate in 10 seconds my $estimate_at_perc = 50; # estimate once at 50% of bytes read GetOptions( "fl|filelist=s" => \$fl, "g|glob=s" => \@glob, "utf" => \$utf, # input is in utf8 "l" => \$countl, # lines "w" => \$countw, # words (delimited by white space) "c" => \$countc, # utf-8 characters if --utf or bytes by default, incl. \n "b" => \$countb, # bytes "cw|count-words-in-columns" => \$countcw, "ensure-input-col-count=i" => \$ensure_input_col_count, "tot|total-only" => \$total_only, "group=i" => \$groupcol, "estimate" => \$estimate, "estimate-time-limit=i" => \$estimate_time_limit, "estimate-at-perc=i" => \$estimate_at_perc, ) or exit 1; if (!$countw && !$countc && !$countl && !$countcw && !$countb) { # default is to count words, chars and lines $countw = $countc = $countl = 1; } $groupcol-- if defined $groupcol; my @fl = (); if (defined $fl) { if ($fl eq "-") { *IN = *STDIN; } else { open IN, $fl or die "Can't read $fl"; } while () { chomp; push @fl, $_; } if ($fl ne "-") { close(IN); } } else { push @ARGV, "-" if 0 == scalar @ARGV; } my $totc = 0; my $totb = 0; my $totw = 0; my $totl = 0; my @totcw = (); @totcw = map {0} (1..$ensure_input_col_count) if defined $ensure_input_col_count; foreach my $f (@ARGV, (map {(glob($_));} @glob), @fl) { runwc($f); } if ($total_only) { print "$totl\t" if $countl; print "$totw\t" if $countw; print "$totc\t" if $countc; print "$totb\t" if $countb; print join("\t", map { $_ || 0 } @totcw)."\t" if $countcw; print "\n"; } sub runwc { my $f = shift; my $totbytes = 0; my $opn; if ( -e $f ) { my $ft = `file '$f'`; # file might not recognize some files! if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) { $opn = "zcat '$f' |"; if ($estimate) { my $sizes = `gzip -l --quiet '$f'`; chomp $sizes; if ($sizes =~ /^\s*[0-9]+\s+([0-9]+)\s+([-0-9.]*)%/) { $totbytes = $1; my $ratio = $2; print STDERR "## Estimate very unreliable on poorly compressed files: $f; comp.rat. $ratio\n" if $ratio < 50; } else { die "Failed to get uncompressed size from $sizes"; } } } elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) { $opn = "bzcat '$f' |"; die "Cannot --estimate for bzipped files" if $estimate; } else { $opn = "$f"; $totbytes = -S $f; } } elsif ( $f eq "-" ) { $opn = "-"; } else { die "Not found: $f"; } open INF, $opn or die "Can't read: $opn"; binmode(INF, ":utf8") if $utf; my $l = 0; my $w = 0; my $c = 0; my $b = 0; my @cw = (); my $groupdata = undef; my $starttime = time(); my $early_stop = 0; while () { if ($estimate && $l % 1000 == 0 && $b > 0) { # consider stopping if ($estimate_time_limit && (time()-$starttime) > $estimate_time_limit) { $early_stop = "after $estimate_time_limit seconds"; last; } if ($estimate_at_perc && $b/$totbytes*100 > $estimate_at_perc) { $early_stop = sprintf("at %.1f%% of bytes", $b/$totbytes*100); last; } } $l++; my $neww = 0; my $newc = 0; my $newb = 0; my @newcw = (); if ($countw) { $neww = scalar(split /\s+/, trim($_)); $w += $neww; } if ($countc) { $newc = length($_); $c += $newc; } if ($countb || $estimate) { $newb = do { use bytes; length($_) }; $b += $newb; } if ($countcw || defined $groupcol) { my @line = split /\t/; chomp $line[-1]; if ($countcw) { die "$f:$l: Unexpected number of columns: got " .scalar(@line).", expected ".$ensure_input_col_count if defined $ensure_input_col_count && $ensure_input_col_count != scalar(@line); for(my $i=0; $i<=$#line; $i++) { $newcw[$i] = scalar(split /\s+/, trim($line[$i])); $cw[$i] += $newcw[$i]; } } if (defined $groupcol) { # collect counts to per-group statistics my $group = trim($line[$groupcol]); my @new = ( $countl ? (1) : (), $countw ? ($neww) : (), $countc ? ($newc) : (), $countb ? ($newb) : (), $countcw ? (@newcw) : () ); for(my $i=0; $i{$group}->[$i] += $new[$i]; } } } } close INF; my $scale = $totbytes / $b if $early_stop; if ($early_stop) { print STDERR "# stopped $early_stop, estimating the total\n"; $b = $totbytes; # this is exact $c = int($c*$scale); $l = int($l*$scale); $w = int($w*$scale); if ($countcw) { map { int($_*$scale) } @cw; } } if (!$total_only) { if (defined $groupcol) { foreach my $group (sort {$a cmp $::b} keys %$groupdata) { if ($early_stop) { print STDERR "# group information not reliable with --estimate\n"; map { int( ($_ || 0) * $scale ) } @{$groupdata->{$group}}; } print join("\t", (map {$_ || 0 } @{$groupdata->{$group}}, $group, $f)); print "\n"; } } else { print "$l\t" if $countl; print "$w\t" if $countw; print "$c\t" if $countc; print "$b\t" if $countb; print join("\t", map { $_ || 0 } @cw)."\t" if $countcw; print $f; print "\n"; } } $totc += $c; $totb += $b; $totl += $l; $totw += $w; for(my $i=0; $i<@cw; $i++) { $totcw[$i] += $cw[$i]; } } sub trim { my $s = shift; $s =~ s/^\s+//; $s =~ s/\s+$//; return $s; }