#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'parshuffle' shuffles lines in all input files simultaneously. All the input
# files thus have to share the number of lines.
#
# Usage: parshuffle [--limit=N] file1 [file2 ...]
#
# One random permutation of line numbers is drawn from the first file and then
# applied to every file, so line K of each output still corresponds to line K
# of every other output. Each input "X", "X.gz" or "X.bz2" produces "X.shuf".
# With --limit=N (N > 0) only the first N lines of the permutation are written.
#
# $Id: parshuffle,v 1.2 2009-04-02 11:58:45 bojar Exp $
#

use strict;
use warnings;
use Getopt::Long;
use List::Util qw(shuffle);

my $usage = "usage: $0 [--limit=N] file1 [file2 ...]\n";

my $limit = 0;    # 0 means "no limit"
GetOptions("limit=i" => \$limit) or die $usage;
die "--limit must not be negative\n$usage" if $limit < 0;

my $firstfilename = shift;
die $usage if !defined $firstfilename;

my @firstfile   = load_file($firstfilename);
my $total_lines = scalar(@firstfile);

# Draw the permutation once; it is reused unchanged for every input file.
my @outorder = shuffle(0 .. $#firstfile);
splice(@outorder, $limit) if $limit && $limit < @outorder;

save_reordered(\@outorder, $firstfilename, \@firstfile);
@firstfile = ();    # release memory

foreach my $infn (@ARGV) {
    my @data = load_file($infn);
    die "Bad number of lines in $infn: expected $total_lines, got "
        . scalar(@data)
        if scalar(@data) != $total_lines;
    save_reordered(\@outorder, $infn, \@data);
}

# save_reordered(\@order, $input_name, \@lines)
#
# Writes the elements of @lines selected by the indices in @order, in that
# order, to "<input_name minus a trailing .gz/.bz2>.shuf". The output is never
# compressed. Dies if the file cannot be opened, written or closed.
sub save_reordered {
    my ($outord, $infn, $inlines) = @_;

    (my $outfn = $infn) =~ s/\.(bz2|gz)$//;
    $outfn .= ".shuf";

    # Three-argument open: the name is taken literally, whatever it contains.
    open my $outh, '>', $outfn or die "Can't write $outfn: $!";
    foreach my $idx (@$outord) {
        print {$outh} $inlines->[$idx]
            or die "Can't write $outfn: $!";
    }
    # Buffered write errors (e.g. disk full) only surface at close.
    close $outh or die "Can't write $outfn: $!";
    return;
}

# load_file($name)
#
# Returns the lines of the file as a list, each including its line terminator.
# Files named *.gz / *.bz2, or reported as gzip/bzip2 data by file(1), are
# read through zcat / bzcat. Dies if the file cannot be opened or if the
# decompressor reports a failure.
sub load_file {
    my $fn = shift;

    my $ft = _describe_file($fn);    # file might not recognize some files!
    my @cmd;
    if ($fn =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
        @cmd = ('zcat', _safe_arg($fn));
    } elsif ($fn =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
        @cmd = ('bzcat', _safe_arg($fn));
    }

    my $hdl;
    if (@cmd) {
        # List-form pipe open runs the command without a shell, so spaces
        # and metacharacters in the file name are harmless.
        open $hdl, '-|', @cmd or die "Can't open '@cmd |': $!";
    } else {
        open $hdl, '<', $fn or die "Can't open '$fn': $!";
    }

    my @lines = <$hdl>;

    # For a pipe, close() also reports a failed decompressor; without this
    # check a corrupt archive would silently yield truncated data.
    close $hdl
        or die(@cmd
            ? "'@cmd' failed" . ($! ? ": $!" : " with status $?")
            : "Can't close '$fn': $!");

    # A final line without a newline would be glued to the line that follows
    # it after shuffling, changing the number of lines in the output.
    $lines[-1] .= "\n" if @lines && $lines[-1] !~ /\n\z/;

    return @lines;
}

# Returns the output of file(1) for $name, or '' when file(1) cannot be run.
# Detection is best effort: load_file() also looks at the file name suffix.
sub _describe_file {
    my $fn = shift;

    open my $ph, '-|', 'file', _safe_arg($fn) or return '';
    my $desc = do { local $/; <$ph> };
    close $ph;    # exit status deliberately ignored: best effort only

    return defined $desc ? $desc : '';
}

# Prefixes "./" to names starting with "-" so that external tools do not
# mistake the file name for a command-line option.
sub _safe_arg {
    my $fn = shift;
    return $fn =~ /^-/ ? "./$fn" : $fn;
}