#!/usr/bin/perl
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'antidocx' extracts raw text from docx (MS Office 2007) documents. Rather a
# hack than a full solution.
#
# $Id: antidocx,v 1.2 2010-11-02 16:07:41 bojar Exp $

use strict;
use File::Temp qw /tempdir/;
use File::Path;
use Getopt::Long;

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $tempdir = "/tmp";
my $keep = 0;
my $debug = 0;

GetOptions(
  "keep" => \$keep,
  "debug" => \$debug,
) or exit 1;

my $fn;
while ($fn = shift) {
  antidocx($fn);
}

sub antidocx {
  my $fn = shift;

  my $tmp = tempdir("antidocxXXXX", CLEANUP=>0, DIR=>$tempdir);
  print STDERR "My tempdir: $tmp\n";


  # unzip the file
  my $fullfn = `readlink -f $fn`; chomp $fullfn;
  die "Confused by the filename $fn; Full pathname $fullfn" if ! -e $fullfn;

  safesystem("cd $tmp; unzip $fullfn >&2");

  my $mainfile = "$tmp/word/document.xml";
  die "Not a MS Word 2007 file?"
    if ! -e $mainfile;

  open INF, $mainfile or die "Failed to open $mainfile";
  binmode(INF, ":utf8");
  my $text = "";
  while (<INF>) {
    chomp;
    $text .= " ".$_;
  }
  close INF;
  my $printtext = 0;
  while ($text =~ /^(<.*?>|[^<]*)/) {
    last if $text eq "";
    my $this = $1;
    print STDERR "GOT: $this\n" if $debug;
    $text =~ s/^\Q$this//;
    if ($this =~ /<w:p[> ]/) {
      print "\n\n";
      next;
    }
    if ($this =~ /<w:tab[\/> ]/) {
      print "\t";
      next;
    }
    if ($this =~ /<w:t[> ]/) {
      $printtext = 1;
      next;
    }
    if ($this =~ /<\/w:t[> ]/) {
      $printtext = 0;
      next;
    }
    next if $this =~ /^</;
    print $this if $printtext || $this !~ /^\s*$/;
  }

  if ($keep) {
    print STDERR "Keeping $tmp, delete yourself.\n";
  } else {
    rmtree($tmp);
  }
}

sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  if ($? == -1) {
      print STDERR "Failed to execute: @_\n  $!\n";
      exit(1);
  }
  elsif ($? & 127) {
      printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
          ($? & 127),  ($? & 128) ? 'with' : 'without';
      exit(1);
  }
  else {
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}

