#!/usr/bin/perl # The Missing Textutils, Ondrej Bojar, obo@cuni.cz # http://www.cuni.cz/~obo/textutils # # 'antidocx' extracts raw text from docx (MS Office 2007) documents. Rather a # hack than a full solution. # # $Id: antidocx,v 1.2 2010-11-02 16:07:41 bojar Exp $ use strict; use File::Temp qw /tempdir/; use File::Path; use Getopt::Long; binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); my $tempdir = "/tmp"; my $keep = 0; my $debug = 0; GetOptions( "keep" => \$keep, "debug" => \$debug, ) or exit 1; my $fn; while ($fn = shift) { antidocx($fn); } sub antidocx { my $fn = shift; my $tmp = tempdir("antidocxXXXX", CLEANUP=>0, DIR=>$tempdir); print STDERR "My tempdir: $tmp\n"; # unzip the file my $fullfn = `readlink -f $fn`; chomp $fullfn; die "Confused by the filename $fn; Full pathname $fullfn" if ! -e $fullfn; safesystem("cd $tmp; unzip $fullfn >&2"); my $mainfile = "$tmp/word/document.xml"; die "Not a MS Word 2007 file?" if ! -e $mainfile; open INF, $mainfile or die "Failed to open $mainfile"; binmode(INF, ":utf8"); my $text = ""; while () { chomp; $text .= " ".$_; } close INF; my $printtext = 0; while ($text =~ /^(<.*?>|[^<]*)/) { last if $text eq ""; my $this = $1; print STDERR "GOT: $this\n" if $debug; $text =~ s/^\Q$this//; if ($this =~ / ]/) { print "\n\n"; next; } if ($this =~ / ]/) { print "\t"; next; } if ($this =~ / ]/) { $printtext = 1; next; } if ($this =~ /<\/w:t[> ]/) { $printtext = 0; next; } next if $this =~ /^> 8; print STDERR "Exit code: $exitcode\n" if $exitcode; return ! $exitcode; } }