#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'ngrams' reads tokenized plain text and returns n-grams of the given order.
#
# Options:
#   --ngrams=N       order of the n-grams to print (default 2, must be >= 1)
#   --bos-token=STR  token prepended to every input line before windowing
#   --eos-token=STR  token appended to every input line before windowing
#
# Input is read from the files named on the command line, or from standard
# input when none are given. Every n-gram is printed on its own line, with
# its tokens joined by single spaces.
#
# $Id: ngrams,v 1.3 2010-11-02 16:07:41 bojar Exp $

use strict;
use warnings;
use Getopt::Long;

# Run the command-line interface only when executed as a script, so that the
# helper below can also be loaded and exercised on its own.
main() unless caller;

# Parse the options, then print the n-grams of every input line.
# Exits with status 1 on invalid options or a non-positive n-gram order.
sub main {
    my $ngrams    = 2;
    my $bos_token = undef;
    my $eos_token = undef;

    GetOptions(
        "ngrams=i"    => \$ngrams,
        "bos-token=s" => \$bos_token,
        "eos-token=s" => \$eos_token,
    ) or exit 1;

    # An order below 1 has no meaningful n-grams; the previous shift-based
    # loop never terminated for such values, so reject them up front.
    if ($ngrams < 1) {
        print STDERR "ngrams: --ngrams must be at least 1 (got $ngrams)\n";
        exit 1;
    }

    while (my $line = <>) {
        chomp $line;

        # Tokens are separated by exactly one space; leading or repeated
        # spaces therefore yield empty tokens.
        my @words = split / /, $line;
        unshift @words, $bos_token if defined $bos_token;
        push @words, $eos_token if defined $eos_token;

        print "$_\n" for ngrams_of($ngrams, @words);
    }

    return;
}

# Return all n-grams of order $order over @tokens, in left-to-right order,
# each as a string of tokens joined by single spaces.
# Returns an empty list when $order is below 1 or when there are fewer than
# $order tokens.
sub ngrams_of {
    my ($order, @tokens) = @_;

    return if $order < 1;

    # Index of the last position where a full window still fits; negative
    # when the line is too short, which makes the range below empty.
    my $last_start = @tokens - $order;

    return map { join(" ", @tokens[$_ .. $_ + $order - 1]) } 0 .. $last_start;
}