#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'sparse_to_c4.5' converts a sparse matrix representation (from stdin) to
# input suitable for c4.5.
# The output is stored in the files ARG1.data and ARG1.names.
#
# $Id: sparse_to_c4.5,v 1.4 2007/09/25 04:45:43 bojar Exp $
#

use Getopt::Long;
use strict;
use warnings;
no strict 'subs';

# Name of the pseudo-column that holds the first field of every input line
# (the class/answer attribute) unless --nondirected is given.
my $ANSWER = "A_N_S_W_E_R";

# Command-line options and their defaults.
my $ignore_duplicit_values = 0;
my $blankval = "-";
my $test = 0;
my $help = 0;
my $coldelim = "\t";
my $valdelim = ":";
my $default_value = undef;
my $groupdelim = "/";
my $outdelim = ",";
my $usemore = "";
my $no_default_output = 0;
my @use = ();

GetOptions(
    "test=s"       => \$test,
    "help"         => \$help,
    "coldelim=s"   => \$coldelim,
    "outdelim=s"   => \$outdelim,
    "valdelim=s"   => \$valdelim,
    "groupdelim=s" => \$groupdelim,
    "nondirected"  => \$no_default_output,
    "use=s@"       => \@use,
    "usemore=s"    => \$usemore,
    "defvalue=s"   => \$default_value,
    "blankvalue=s" => \$blankval,
    "ignore-duplicit-values" => \$ignore_duplicit_values,
);

push @use, split(/,/, $usemore);

# Allow the output delimiter to be given with shell-friendly escapes:
# a literal backslash-t becomes a tab, a doubled backslash a single one.
$outdelim =~ s/\\t/\t/g;
$outdelim =~ s/\\\\/\\/g;

my $baseoutname = shift;

if (!$baseoutname || $help) {
    print STDERR <<'END_USAGE';
usage: sparse_to_c4.5 baseoutputfilename
Converts sparse matrix input into data suitable for c4.5
Input line sample:
  answer group1/var1:value1 group2/var3:valueB
Options:
  --test=filename ... build unseen test dataset
  --coldelim=str ... the delimiter between colums/items on each line
  --outdelim=str ... the delimiter to use in output.data file
  --valdelim=str ... the delimiter between varname and value
  --groupdelim=str ... the delimiter between groupname and varname
  --use=group1 --use=group2 ... list of group names of attributes to be used
  --usemore=group1,group2 ... same as --use=group1 --use=group2
  --nondirected ... the first column is no special 'answer' attribute
  --defvalue=str ... the default value if there is no valdelim found in an item
  --blankvalue=str ... the value if the column is not metioned in a line
  --ignore-duplicit-values ... keep silent if more (equal) values are assigned to a column
END_USAGE
    exit 1;
}

# Set of group names selected with --use/--usemore (empty = use everything).
my %use = map { ($_, 1) } @use;

# $cols->{column name}->{value} = 1 for every column/value pair seen in any
# loaded stream (training data and test data share this table so both output
# files get the same columns).
my $cols = {};

# Print a progress indicator to STDERR: a dot every 10000 records and the
# running count every 100000 records.
sub report_progress {
    my $count = shift;
    print STDERR "." if $count % 10000 == 0;
    print STDERR "($count)" if $count % 100000 == 0;
    return;
}

# Read sparse records from a filehandle.
#
# Parameters:
#   $instream - filehandle (glob reference or lexical handle) to read from.
# Returns:
#   array reference indexed by 1-based input line number; each defined entry
#   is a hash reference mapping column name to value.  Index 0 and lines that
#   contributed no items stay undefined.
# Side effects:
#   records every column/value pair in the file-level $cols; prints progress
#   to STDERR.
# Dies:
#   when a column gets two values on one line (unless the values are equal
#   and --ignore-duplicit-values was given), or when an item has no value and
#   no --defvalue was supplied (see the review note below).
sub load {
    my $instream = shift;
    my $vals = [];
    my $nr = 0;
    while (my $record = <$instream>) {
        $nr++;
        report_progress($nr);
        chomp $record;
        my @line = split /$coldelim/, $record;
        for (my $i = 0; $i <= $#line; $i++) {
            # ignore blank fields
            next if !defined $line[$i] || $line[$i] eq "";
            my $name;
            my $value;
            if ($i == 0 && !$no_default_output) {
                # The first field is the answer (class) attribute.
                $name = $ANSWER;
                $value = $line[$i];
            }
            else {
                ($name, $value) = split /$valdelim/, $line[$i];
                # NOTE(review): the statements from here to the end of this
                # else-branch were corrupted in the source this file was
                # recovered from.  They are reconstructed from the documented
                # --use/--groupdelim/--defvalue options; please confirm
                # against the upstream script.
                if (0 < scalar @use) {
                    # Keep only attributes whose group was requested.
                    my ($group) = split /$groupdelim/, $name;
                    next if !$use{$group};
                }
                $value = $default_value if !defined $value;
                die "Line $nr: no value for $name and no --defvalue given"
                    if !defined $value;
            }
            $cols->{$name}->{$value} = 1;
            my $previous = $vals->[$nr]->{$name};
            if (defined $previous) {
                die "Multiply defined $name: $value vs. $previous"
                    if $value ne $previous || !$ignore_duplicit_values;
            }
            $vals->[$nr]->{$name} = $value;
        }
    }
    print STDERR "Done.\n";
    return $vals;
}

# Write loaded records as a dense table to "$baseoutname.$outsuffix".
#
# Parameters:
#   $outsuffix  - file name suffix ("data" or "test").
#   $vals       - array reference as returned by load().
#   $needsblank - hash reference (or undef) of columns already known to need
#                 the blank value; extended in place.
# Returns:
#   the $needsblank hash reference, with every column that was missing in at
#   least one written row set to 1.
# Dies:
#   when the output file cannot be opened or closed.
sub store {
    my ($outsuffix, $vals, $needsblank) = @_;
    $needsblank ||= {};

    my $outname = "$baseoutname.$outsuffix";
    # 'or' (not '||'): with '||' the die was bound to the file name string
    # and a failed open went unnoticed.
    open my $out, '>', $outname
        or die "Can't write to $outname: $!";

    # All columns except the answer, in a stable (sorted) order.
    my @columns = grep { $_ ne $ANSWER } sort keys %$cols;

    my $nr = 0;
    foreach my $row (@$vals) {
        next if !defined $row;    # line number without any items
        my @line;
        foreach my $k (@columns) {
            my $val = $row->{$k};
            if (!defined $val) {
                $val = $blankval;
                $needsblank->{$k} = 1;
            }
            push @line, $val;
        }
        # c4.5 expects the class attribute as the last field.
        push @line, $row->{$ANSWER} if !$no_default_output;
        print {$out} join($outdelim, @line) . "\n";
        $nr++;
        report_progress($nr);
    }
    print STDERR "Done.\n";
    close $out or die "Can't write to $outname: $!";
    return $needsblank;
}

print STDERR "Loading: ";
my $datavals = load(\*STDIN);

my $needsblank = undef;
if ($test) {
    # The test set is loaded before anything is written so that its columns
    # and values are included in $cols as well.
    open my $testfh, '<', $test or die "Can't read '$test': $!";
    my $testvals = load($testfh);
    close $testfh;
    $needsblank = store("test", $testvals, undef);
}

print STDERR "Saving: ";
$needsblank = store("data", $datavals, $needsblank);

# Write the c4.5 .names file: the class values first, then one line per
# attribute listing its possible values.
my $namesfile = "$baseoutname.names";
open my $names, '>', $namesfile
    or die "Can't write to $namesfile: $!";
print {$names} "| Generated by sparse_to_c4.5\n";
print {$names} "\n";
if (!$no_default_output) {
    my @answers = sort { $a cmp $b } keys %{ $cols->{$ANSWER} || {} };
    print {$names} join(",", @answers) . ".\n";
}
else {
    print {$names} "| all the columns are equal, there is no 'default output' attribute\n";
}
print {$names} "\n";
foreach my $k (sort keys %$cols) {
    next if $k eq $ANSWER;
    print {$names} "$k:\t";
    my @vals = keys %{ $cols->{$k} };
    push @vals, $blankval if $needsblank->{$k};
    if (1 == scalar @vals) {
        # An attribute with a single possible value carries no information.
        print {$names} "ignore";
    }
    else {
        print {$names} join(",", sort { $a cmp $b } @vals);
    }
    print {$names} ".\n";
}
close $names or die "Can't write to $namesfile: $!";