#!/usr/bin/perl -CDS
# The Missing Textutils, Ondrej Bojar, obo@cuni.cz
# http://www.cuni.cz/~obo/textutils
#
# 'ua' strips off all accents over all latin letters. The name stands for
# 'unicode to ascii'.
#
# Usage: ua [FILE...]
#   Reads the named files (or standard input when none are given) line by
#   line and prints every line with combining marks removed and a handful of
#   typographic characters replaced by ASCII look-alikes.
#
# The -CDS switch on the first line makes STDIN/STDOUT/STDERR (S) and the
# default layers of streams opened by the script, including <> (D), UTF-8.
# NOTE: per perlrun, since perl 5.10.1 a -C given on the #! line must also be
# given on the command line when the script is started as `perl ua`.
#
# $Id: ua,v 1.3 2010-05-27 20:56:42 bojar Exp $
#

use strict;
use warnings;
use utf8;    # the substitution patterns below contain UTF-8 literals
require 5.008;
require Encode;
use Unicode::Normalize;

# strip_accents($text)
#
# Returns a copy of $text in which
#   * every character has been canonically decomposed (NFD) and all
#     combining marks (\pM) have been deleted, so both precomposed and
#     already-decomposed accented letters end up as their base letter;
#   * the quotes “ ” « » become ", the apostrophe ’ becomes ', the dash —
#     becomes - and ± becomes +-.
# Any other non-ASCII character is passed through unchanged.
# The argument itself is not modified.
sub strip_accents {
    my ($text) = @_;

    $text = NFD($text);    ## decompose
    $text =~ s/\pM//g;     ## strip combining characters

    # Blanket removal of the remaining non-ASCII characters was disabled by
    # the original author and stays disabled:
    #s/[^\0-\x80]//g;      ## clear everything else

    $text =~ s/[“”«»]/"/g;    ## simplify quotes
    $text =~ s/[’]/'/g;       ## simplify apostrophes
    $text =~ s/[—]/-/g;       ## simplify dashes
    $text =~ s/±/+-/g;        ## simplify plusminus

    return $text;
}

# main()
#
# Filters the input given via <> (files named in @ARGV, or STDIN) to STDOUT,
# one line at a time.
sub main {
    while ( my $line = <> ) {
        print strip_accents($line);
    }
    return;
}

# Run as a filter only when executed as a script, so that the file can also
# be loaded with require/do to reach strip_accents() directly.
main() unless caller;

# NOTE: earlier revisions carried an "alternative method" after an
# unconditional exit, so it never ran. It built one character class per ASCII
# letter from the "LATIN ... LETTER x WITH ..." entries of unicore/Name.pl and
# substituted the base letter for each class. By its own comment it did not
# work on normalized writing where accents are separate characters, and its
# capital-letter list omitted Q. The NFD-based approach above handles both
# forms, so the unreachable code has been removed.

1;