#!/usr/bin/perl -CDS
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'ua' strips off all accents over all latin letters. The name stands for 
# 'unicode to ascii'.
#
# $Id: ua,v 1.5 2014-01-07 09:56:01 bojar Exp $
#

use strict;
use utf8;
require 5.008;

require Encode;
use Unicode::Normalize;

# Convert one line of text to a (mostly) ASCII approximation.
#
# Argument: a character string (the -CDS switch on the #! line makes the
#           standard streams and the files read through <> UTF-8).
# Returns:  a copy of the string in which
#             * every combining mark left after canonical decomposition
#               has been removed, and
#             * a few letters and typographic symbols that have no
#               decomposition are replaced by ASCII look-alikes.
#           All other characters are passed through unchanged.
sub strip_accents {
  my ($text) = @_;

  $text = NFD($text);        ##  decompose, e.g. "é" becomes "e" + U+0301
  $text =~ s/\pM//g;         ##  strip combining characters
  # NFD also splits Hangul syllables into conjoining jamo; those are letters,
  # not marks, so they survive the strip above.  Recompose them so that such
  # text is not left mangled.  For Latin text this is a no-op, because the
  # marks that could be recomposed are already gone.
  $text = NFC($text);
  #$text =~ s/[^\0-\x80]//g;  ##  clear everything else
  $text =~ tr/ıłŁ/ilL/;      ##  letters without a decomposition (both cases of l-stroke)
  $text =~ s/[“”„«»]/"/g;    ##  simplify double quotes, including the low opening one
  $text =~ s/[‘’]/'/g;       ##  simplify apostrophes / single quotes (opening and closing)
  $text =~ s/[–—]/-/g;       ##  simplify dashes (en and em)
  $text =~ s/±/+-/g;         ##  simplify plusminus

  return $text;
}

# Filter: read STDIN or the files named on the command line line by line and
# print the simplified text to STDOUT.
while ( my $line = <> ) {
  print strip_accents($line);
}


# Stop here.  Everything below this line is never executed; it is kept only
# as a record of an earlier approach (perl still compiles it, though).
exit;

## Alternative method: build, for every base letter, the set of precomposed
# accented variants and replace them directly.  It does not work on
# normalized (decomposed) writing where the accents are separate combining
# characters.

# Lines of the Unicode name table that contain a tab followed by
# "LATIN <word> LETTER".
# NOTE(review): unicore/Name.pl is a perl-internal file.  This assumes it
# evaluates to one entry per line, with the code point in hex at the start of
# the line and a tab before the name -- confirm for the perl version in use.
my @charnames = grep /\tLATIN \S+ LETTER/, split( /^/, do 'unicore/Name.pl' );

# Base letter => string made of all characters whose name is
# "LATIN {SMALL|CAPITAL} LETTER <that letter> WITH ...".
my %accents;

# NOTE(review): capital 'Q' is absent from the letter list below although 'q'
# is present -- this looks like a typo; confirm before reviving this code.
for my $c ( split //, "ABCDEFGHIJKLMNOPRSTUVWXYZabcdefghijklmnopqrstuvwxyz" ) {
    # A letter that equals its own lowercase form is a SMALL letter.
    my $case = ( $c eq lc $c ) ?  'SMALL' : 'CAPITAL';
    # The first four characters of each matching line are read as the hex
    # code point (so longer code points would be truncated); \U upper-cases
    # the letter because the names in the table are written in capitals.
    $accents{$c} =
          join( '', map { chr hex( substr $_, 0, 4 ) }
                grep /\tLATIN $case LETTER \U$c WITH/, @charnames );
}

# now use each element of %accents as a character class:
# NOTE(review): if a letter had no accented variants its string would be
# empty and the class below would read "[]" -- verify that cannot happen.

while (<>) {
    for my $c ( keys %accents ) {
        # Replace any accented variant of $c by the plain letter itself.
        s/[$accents{$c}]/$c/g;
    }
    print;
}


