#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# Given any number of files as args, 'setcompare' analyzes the intersections
# of lines.
#
# $Id: setcompare,v 1.2 2013-10-28 22:52:51 bojar Exp $
#

use strict; use warnings;
use IO::File;
use Getopt::Long;

# my $showname = 0; # show file name as the first column
# GetOptions(
#   "showname" => \$showname,
# ) or exit 1;

# Read one input and count its lines.
#   $path - name of the input.  Names ending in ".gz" are decompressed
#           through zcat; "-" means standard input.
# Returns ($total_lines, \%count_of), where %count_of maps every distinct
# (chomped) line to the number of times it occurred.
# Dies if the input cannot be opened; warns if zcat reports a failure.
sub count_lines {
  my ($path) = @_;

  my $inh;
  my $is_pipe = 0;
  if ($path eq '-') {
    # The former two-argument open treated "-" as STDIN; keep that working.
    $inh = \*STDIN;
  }
  elsif ($path =~ /\.gz$/) {
    # List-form pipe open: the name reaches zcat as one argument and is never
    # parsed by a shell, so spaces and metacharacters in it are harmless.
    open($inh, '-|', 'zcat', '--', $path)
      or die "Can't run zcat on '$path': $!";
    $is_pipe = 1;
  }
  else {
    # Three-argument open takes the name literally (no whitespace stripping,
    # no mode characters taken from the name).
    open($inh, '<', $path) or die "Can't open '$path': $!";
  }

  my $nr = 0;
  my %unique = ();
  while (my $line = <$inh>) {
    chomp $line;
    $nr++;
    $unique{$line}++;
  }

  # STDIN is left open.  For zcat, a false close() means the command failed,
  # so the counts above may be incomplete; report it instead of hiding it.
  if ($path ne '-' and not close($inh)) {
    if ($is_pipe) {
      warn "zcat failed on '$path' (wait status $?)\n";
    }
    else {
      warn "Error closing '$path': $!\n";
    }
  }

  return ($nr, \%unique);
}

# For every file named on the command line print its line statistics and
# record, per distinct line, how often each file contains it:
#   $data->{LINE}->{FILE} = number of occurrences of LINE in FILE
my @files = @ARGV;
my $data = {};
foreach my $inf (@files) {
  my ($nr, $unique) = count_lines($inf);
  print $inf, "\t", $nr, " total lines\t", scalar(keys %$unique), " unique lines",
    "\n";
  foreach my $line (keys %$unique) {
    $data->{$line}->{$inf} = $unique->{$line};
  }
}

print STDERR "Checking intersections...\n";

# Build a histogram over the collected lines:
#   $hist{N} = number of distinct lines that occur in exactly N files.
# $data maps each line to a hash keyed by the files containing it; an
# undefined $data (nothing collected) is treated as empty.
my %hist = ();
my $files_of = $data || {};
foreach my $line (keys %$files_of) {
  my $files_with_the_line = $files_of->{$line};
  $hist{scalar(keys(%$files_with_the_line))} ++;
}

print "This many lines\tAre seen in this many files\n";
# Largest bucket first.  Buckets with equal counts are ordered by ascending
# number of files; without the tie-breaker they would come out in hash order,
# which Perl randomizes, making the report differ from run to run.
foreach my $num_files (
  sort { $hist{$b} <=> $hist{$a} or $a <=> $b } keys %hist
) {
  print $hist{$num_files}, "\t", $num_files, "\n";
}
