#!/usr/bin/perl # The Missing Textutils, Ondrej Bojar, obo@cuni.cz # http://www.cuni.cz/~obo/textutils # # 'tdf_to_c4.5' converts a tab-delimited file (from stdin) to input suitable # for c4.5. # The output is stored in the files ARG1.data and ARG1.names. # # $Id: tdf_to_c4.5,v 1.6 2006/05/28 20:19:51 bojar Exp $ # use Getopt::Long; use strict; my $test = 0; my $help = 0; my $baseline = 0; GetOptions( "test=s"=>\$test, "help" => \$help, "baseline" => \$baseline, ); my $baseoutname=shift; if (!$baseoutname || $help) { print STDERR "usage: tdf_to_c4.5 baseoutputfilename Converts tabbed input into data suitable for c4.5 Options: --test=filename ... build unseen test dataset --baseline ... for the independent data set estimate the baseline (assign most frequent) and oracle (if only never seen data are wrong) rates "; exit 1; } my $vals; # here are all the vals stored my $known_line_len = undef; sub convert { my $outsuffix = shift; my $instream = shift; my $infname = shift; my @answers = (); open OUTDATA, ">$baseoutname.$outsuffix" || die "Can't write to $baseoutname.$outsuffix"; my $nr = 0; while (<$instream>) { $nr++; chomp; my @line = split /\t/; if (defined $known_line_len) { if (scalar @line != $known_line_len) { print STDERR "$infname:" if $infname; print STDERR "$nr:Bad number of columns: expected $known_line_len, got " .(scalar @line).". Skipping.\n"; next; } } else { $known_line_len = scalar @line; } for (my $i=0; $i<=$#line; $i++) { ## Protect the value $line[$i] =~ s/&/\&/g; $line[$i] =~ s/ /\&space;/g; $line[$i] =~ s/\./\˙/g; $line[$i] =~ s/\:/\:/g; $line[$i] =~ s/,/\,/g; $line[$i] =~ s/\?/\?/g; $line[$i] =~ s/\\/\&backslash;/g; $line[$i] = "␣" if !$line[$i] || $line[$i] eq ""; $vals->[$i]->{$line[$i]} = 1; } print OUTDATA join(",", @line)."\n"; push @answers, $line[$known_line_len-1]; } close OUTDATA; return \@answers; } my $goldanswers = convert("data", *STDIN); if ($test) { open TESTF, "$test" or die "Can't read '$test'"; my $needanswers = convert("test", *TESTF, $test); close TESTF; if ($baseline) { # Estimate the basic baseline and oracle rates my $n = scalar @$needanswers; my %answer; foreach my $a (@$goldanswers) { $answer{$a}++; } my @sorted_answers = sort{$answer{$b}<=>$answer{$a}}keys%answer; my $bestanswer = @sorted_answers[0]; my $oracl = 0; my $basel = 0; foreach my $e (@$needanswers) { $oracl++ if $answer{$e}; $basel++ if $e eq $bestanswer; } printf "Oracle: %.8f\n", $oracl/$n*100; printf "Baseline: %.8f\n", $basel/$n*100; } } open OUTDATA, ">$baseoutname.names" || die "Can't write to $baseoutname.names"; print OUTDATA "| Generated by tdf_to_c4.5\n"; print OUTDATA "\n"; print OUTDATA join(",", sort {$a <=> $b} keys %{$vals->[$known_line_len-1]}) .".\n"; print OUTDATA "\n"; for (my $i = 0; $i < $known_line_len-1; $i++) { print OUTDATA "COL".($i+1).":\t"; if (1 == scalar keys %{$vals->[$i]}) { print OUTDATA "ignore"; } else { print OUTDATA join(",", sort {$a <=> $b} keys %{$vals->[$i]}); } print OUTDATA ".\n"; } close OUTDATA;