#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'parshuffle' shuffles lines in all input files simultaneously. All the input
# files thus have to share the number of lines.
#
# $Id: parshuffle,v 1.2 2009-04-02 11:58:45 bojar Exp $
#

use strict;
use Getopt::Long;

# Maximum number of shuffled lines to write per file; 0 (the default) means
# "write all lines".
my $limit = 0;
my $usage = "usage: $0 [--limit=N] file1 [file2 ...]\n";

# GetOptions returns false on unknown or malformed options; treat that as a
# usage error instead of silently carrying on.
GetOptions("limit=i" => \$limit) or die $usage;
die "--limit must not be negative\n" if $limit < 0;

my $firstfilename = shift;
die $usage if ! defined $firstfilename;

# The first file determines the number of lines and thus the shuffled order
# that is then applied to every other file.
my @firstfile = load_file($firstfilename);
my $total_lines = scalar(@firstfile);

# Build the output order with a partial Fisher-Yates shuffle: position $i
# receives a uniformly chosen element from the not-yet-picked tail.  This is
# linear in the number of lines (repeatedly splicing a random element out of
# an array is quadratic) and we can stop as soon as --limit picks were made.
my @outorder = 0 .. $#firstfile;
my $wanted = ($limit && $limit < $total_lines) ? $limit : $total_lines;
for my $i (0 .. $wanted - 1) {
  my $j = $i + int(rand($total_lines - $i));
  @outorder[$i, $j] = @outorder[$j, $i];
}
$#outorder = $wanted - 1; # keep only the picked prefix

save_reordered(\@outorder, $firstfilename, \@firstfile);
@firstfile = (); # release memory

# All remaining files must be line-aligned with the first one; each is
# written out in exactly the same order.
foreach my $infn (@ARGV) {
  my @data = load_file($infn);
  die "Bad number of lines in $infn: expected $total_lines, got "
    .scalar(@data) if scalar(@data) != $total_lines;
  save_reordered(\@outorder, $infn, \@data);
}

# Write the lines of one input file in the given order.
#
# Arguments:
#   $outord  - array ref of line indices (into @$inlines) in output order
#   $infn    - name of the input file; a trailing ".gz" or ".bz2" is dropped
#              and ".shuf" is appended to form the output file name
#   $inlines - array ref holding the lines of the input file
#
# The output is always written uncompressed.  Dies if the output file cannot
# be opened, written or closed.  Returns 1 on success.
sub save_reordered {
  my $outord = shift;
  my $infn = shift;
  my $inlines = shift;

  $infn =~ s/\.(bz2|gz)$//;
  my $outfn = $infn.".shuf";

  # Three-argument open with a lexical handle: the file name is taken
  # literally (no mode characters or whitespace stripping) and no global
  # bareword handle is clobbered.
  open my $outfh, '>', $outfn or die "Can't write $outfn: $!";
  foreach my $idx (@$outord) {
    print {$outfh} $inlines->[$idx] or die "Error writing $outfn: $!";
  }
  # Buffered write errors (e.g. disk full) only show up at close time.
  close $outfh or die "Error closing $outfn: $!";
  return 1;
}

# Load all lines of a file into memory, transparently decompressing gzip and
# bzip2 files.
#
# Argument:
#   $fn - name of the file to read
#
# Compression is detected from the file name suffix (.gz / .bz2) or, failing
# that, from the output of file(1).  Compressed files are read through the
# external 'zcat' / 'bzcat' programs.
#
# Returns the list of lines (line terminators kept).  Dies if the file cannot
# be opened, or if reading / decompressing it fails.
sub load_file {
  my $fn = shift;

  # Ask file(1) about the content.  The list form of the pipe open bypasses
  # the shell, so file names with spaces or metacharacters are safe.  If
  # file(1) is unavailable we simply fall back to the suffix check.
  my $ft = '';
  if (open my $probe, '-|', 'file', '--', $fn) {
    local $/;
    $ft = <$probe>;
    $ft = '' if ! defined $ft;
    close $probe;
  }

  # file might not recognize some files!
  my $decompressor;
  if ($fn =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
    $decompressor = 'zcat';
  } elsif ($fn =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
    $decompressor = 'bzcat';
  }

  my $hdl;
  if (defined $decompressor) {
    open $hdl, '-|', $decompressor, '--', $fn
      or die "Can't run '$decompressor' on '$fn': $!";
  } else {
    # Explicit read mode: the name is never interpreted as a mode or command.
    open $hdl, '<', $fn or die "Can't open '$fn': $!";
  }

  my @lines = <$hdl>;

  # For a pipe, close fails when the decompressor exited with an error (e.g.
  # missing or corrupt input); without this check we would silently return
  # an empty or truncated list.
  if (! close $hdl) {
    die "Error reading '$fn'"
      . ($! ? ": $!" : " (" . ($decompressor || 'read') . " wait status $?)");
  }
  return @lines;
}

