#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'split_train_test' reads all input lines, shuffles them and
# then produces a training file and a test file
#
# Usage: split_train_test [--limit=N] [--parts=N] trainfile testfile < lines
#
# The first 1/parts of the shuffled (and optionally limited) lines go to
# the test file; the remaining lines go to the training file.
#
# $Id: split_train_test,v 1.1 2006/06/16 08:36:05 bojar Exp $
#

use strict;
use warnings;

use Getopt::Long;
use List::Util qw(shuffle);

my $limit = 0;     # 0 means "no limit": keep every shuffled line
my $parts = 10;    # the test set receives 1/$parts of the kept lines
my $usage = 0;

# A mistyped or malformed option is treated like --usage, so that a typo
# cannot lead to the output files being silently overwritten.
GetOptions(
    "usage"   => \$usage,
    "limit=i" => \$limit,
    "parts=i" => \$parts,    # use 1/N of the data as the test set
) or $usage = 1;

my $trainfile = shift;
my $testfile  = shift;

if ($usage || !defined $trainfile || !defined $testfile) {
    print STDERR "split_train_test outtrainingfile outtestfile < lines
Options:
  --limit=N  ... shuffle all lines, but use only first N
  --parts=N  ... use 1/N of lines as the test data
";
    exit 1;
}

# Reject values that would otherwise cause a division by zero or a
# negative splice length further down.
die "split_train_test: --parts must be a positive integer\n"
    if $parts < 1;
die "split_train_test: --limit must not be negative\n"
    if $limit < 0;

# Read every input line (from the remaining arguments, or from STDIN).
my @lines = <>;

# Uniformly permute the lines, then keep only the first $limit of them
# when a limit was requested.
my @uselines = shuffle @lines;
if ($limit && $limit < @uselines) {
    $#uselines = $limit - 1;
}

# The head of the shuffled list becomes the test set; what is left in
# @uselines afterwards is the training set. The count is rounded down.
my $test_count = int(scalar(@uselines) / $parts);
my @test = splice @uselines, 0, $test_count;

write_lines($trainfile, \@uselines);
write_lines($testfile,  \@test);

# write_lines($path, $lines_ref)
#
# Creates (or truncates) the file $path and writes every element of the
# array referenced by $lines_ref to it verbatim. Dies with the system
# error text if the file cannot be opened, written or closed.
sub write_lines {
    my ($path, $lines_ref) = @_;

    # Three-argument open keeps odd characters in $path from being
    # interpreted as part of the open mode.
    open my $out, '>', $path
        or die "Can't write $path: $!";
    for my $line (@{$lines_ref}) {
        print {$out} $line
            or die "Can't write $path: $!";
    }
    # Buffered write errors (e.g. a full disk) only surface at close time.
    close $out
        or die "Can't write $path: $!";
    return;
}