#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'split_train_test' reads all input lines, shuffles them and
# then produces a training file and a test file
#
# $Id: split_train_test,v 1.1 2006/06/16 08:36:05 bojar Exp $
#

use strict;
use Getopt::Long;
use List::Util qw(shuffle);

# Command-line settings and their defaults.
my $limit = 0;   # keep at most this many shuffled lines; 0 means "no limit"
my $parts = 10;  # the test set receives 1/$parts of the lines that are kept
my $usage = 0;   # set by --usage to request the help text

# GetOptions returns false when it meets an unknown or malformed option.
# Remember that so the run is aborted instead of continuing with settings
# the user did not intend.
my $options_ok = GetOptions(
  "usage" => \$usage,
  "limit=i" => \$limit,
  "parts=i" => \$parts, # use 1/$parts of the data as the test set
);

# The two remaining positional arguments name the output files.
my $trainfile = shift;
my $testfile = shift;

if (!$options_ok || $usage || !defined $trainfile || !defined $testfile) {
  print STDERR "split_train_test outtrainingfile outtestfile < lines
Options:
  --limit=N  ... shuffle all lines, but use only first N
  --parts=N  ... use 1/N of lines as the test data
";
  exit 1;
}

# $parts is later used as a divisor: zero would abort with "Illegal division
# by zero" and a negative value would yield a nonsensical split.
die "split_train_test: --parts must be a positive integer (got $parts)\n"
  if $parts < 1;

# A negative line count has no sensible meaning either.
die "split_train_test: --limit must not be negative (got $limit)\n"
  if $limit < 0;

# Read every input line (from the files left in @ARGV, or STDIN when there
# are none) into memory so that the lines can be shuffled afterwards.
my @lines;
while (my $line = <>) {
  # A final line that lacks its terminating newline would be glued to
  # whichever line gets printed after it once the order is shuffled, so
  # make sure every stored line is properly terminated.
  $line .= "\n" unless $line =~ /\n\z/;
  push @lines, $line;
}

# Put the input lines into random order.  List::Util's shuffle replaces the
# former loop that spliced one randomly chosen element out of the middle of
# @lines per iteration, which got slow (quadratic) on large inputs.
my @uselines = shuffle @lines;

# The unshuffled copy is not needed any more; release its memory.
@lines = ();

# --limit=N keeps only the first N of the shuffled lines.  Zero (the
# default) or a negative value means "keep everything", and a limit at or
# beyond the number of available lines leaves nothing to cut off.
$#uselines = $limit - 1 if $limit > 0 && $limit < @uselines;

# Carve the test set off the front of the shuffled lines: the first
# 1/$parts of them (fraction truncated to a whole number of lines) move
# into @test, and whatever remains in @uselines is the training set.
my $test_count = int(@uselines / $parts);
my @test = splice(@uselines, 0, $test_count);

# Write the training lines to $trainfile and then the test lines to
# $testfile.  Three-argument open keeps a file name that starts with
# characters such as '>' from being interpreted as part of the open mode,
# and a lexical handle avoids the shared global OF handle.
foreach my $output ([$trainfile, \@uselines], [$testfile, \@test]) {
  my ($path, $contents) = @$output;

  open(my $out, '>', $path) or die "Can't write $path: $!";
  print {$out} @$contents or die "Can't write $path: $!";

  # Buffered write errors (e.g. a full disk) may only surface at close time.
  close($out) or die "Can't write $path: $!";
}
