#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'ngrams' reads tokenized plain text and returns n-grams of the given order.
#
# $Id: ngrams,v 1.3 2010-11-02 16:07:41 bojar Exp $

use strict;
use warnings;
use Getopt::Long;

# Command-line options and their defaults.
my $ngrams = 2;         # --ngrams=N: order of the n-grams to print
my $bos_token = undef;  # --bos-token=S: token prepended to every input line
my $eos_token = undef;  # --eos-token=S: token appended to every input line
GetOptions("ngrams=i" => \$ngrams,
  "bos-token=s" => \$bos_token,
  "eos-token=s" => \$eos_token
) or exit 1;

# Getopt's "=i" accepts zero and negative integers.  Such an order is
# meaningless, and without this check the first input line would make the
# window loop below spin forever printing empty lines, so reject it up front.
if ($ngrams < 1) {
  print STDERR "ngrams: --ngrams must be a positive integer (got $ngrams)\n";
  exit 1;
}

# Read the input (files named on the command line, or stdin) line by line
# and print every n-gram of each line on its own output line.
while (my $line = <>) {
  chomp $line;

  # Tokens are separated by exactly one space: consecutive or leading spaces
  # yield empty tokens, and tabs are kept inside a token.
  my @words = split / /, $line;
  unshift @words, $bos_token if defined $bos_token;
  push @words, $eos_token if defined $eos_token;

  # Slide a window of $ngrams tokens across the line.  When the line has
  # fewer than $ngrams tokens the range is empty and nothing is printed.
  for my $start (0 .. @words - $ngrams) {
    print join(" ", @words[$start .. $start + $ngrams - 1])."\n";
  }
}

