use strict;
use warnings;
use locale;

# Rule-based tokeniser (the rules target French elisions and hyphenated
# clitics).
#
# Usage: SCRIPT EXCEPTIONS < text > tokens
#
#   EXCEPTIONS  file listing, one per line, strings that must always be kept
#               as a single token (they may contain spaces or punctuation).
#
# Each line read from STDIN is split into tokens, printed one per line on
# STDOUT.  Lines that yield no token print nothing.
#
# NOTE(review): no I/O layer is set here, so character semantics of \pL, \pP,
# etc. depend on how the script is invoked (e.g. -C / PERL_UNICODE) -- confirm
# with the calling pipeline.

# Sentinels inserted temporarily into the text.  They are control characters:
# neither whitespace, punctuation nor symbols, so no splitting rule below
# touches them.
my $EXCEP_OPEN  = "\x{02}";    # start of a protected exception
my $EXCEP_CLOSE = "\x{03}";    # end of a protected exception
my $DIGIT_GAP   = "\x{01}";    # stands for a space between two digits

# User-defined character properties (see perlunicode).  The names must start
# with "In" or "Is" to be recognised by \p{...}.

# Punctuation and symbols, except '-' and '.'.
sub IsNonPT {
    return "+utf8::P\n+utf8::S\n-002D\n-002E";
}

# Punctuation and symbols that always form a token on their own: everything
# except "'", ',', '-', '.', '<' and '>', which need context to be split.
sub IsNonAmb {
    return "+utf8::P\n+utf8::S\n-0027\n-002C\n-002D\n-002E\n-003C\n-003E";
}

# Historical names, kept so that anything still referring to them works.
sub NonPT  { return IsNonPT() }
sub NonAmb { return IsNonAmb() }

# Split one unprotected stretch of text into tokens and return them as a
# list (possibly containing empty strings, filtered out by the caller).
sub tokenise_segment {
    my ($seg) = @_;
    my @tokens;

    # Hide a space between two digits ("10 000") so that the number is not
    # cut in two by the whitespace split; the space is restored further down.
    $seg =~ s/(\d)\s(\d)/$1$DIGIT_GAP$2/g;

    foreach my $m (split(/(\p{IsNonAmb}|\s|\.\.\.)/, $seg)) {
        if ($m eq "..." or $m =~ /^\p{IsNonAmb}$/) {
            # Ellipsis or unambiguous punctuation/symbol: a token by itself.
            push(@tokens, $m);
        }
        elsif ($m !~ /^\s*$/) {
            # Tabs are used as provisional token boundaries inside $m.

            # Elided forms: c' d' j' l' m' n' s' t' and ...qu'
            $m =~ s/(\PL|^)([cdjlmnst]\')/$1\t$2\t/ig;
            $m =~ s/(\PL|^)(\pL*[q][u]\')/$1\t$2\t/ig;
            # Dotted abbreviations such as "a.b." stay in one piece.
            $m =~ s/([\pL\d]+\.([\pL\d]+\.)+)/\t$1\t/g;
            # A final full stop is detached.
            $m =~ s/\.$/\t./;
            # A comma is detached unless it sits between two digits.
            $m =~ s/(\D|^),/$1\t,\t/g;
            $m =~ s/,($|\D)/\t,\t$1/g;
            # Hyphenated clitics, with or without euphonic -t-.
            $m =~ s/-t-(elle|elles|en|il|ils|on|y)($|\PL)/\t-t-\t$1\t$2/ig;
            $m =~ s/-(ce|elle|elles|en|il|ils|je|la|le|leur|les|lui|moi|m\'|nous|on|toi|tu|t\'|vous|y)($|\PL)/\t-\t$1\t$2/ig;
            # Two adjacent punctuation marks are separated.
            $m =~ s/(\pP)(\pP)/\t$1\t$2\t/g;
            # Give protected numbers their space back.
            $m =~ s/\Q$DIGIT_GAP\E/ /g;
            # Angle brackets become tokens of their own.
            $m =~ s/([<>])/\t$1\t/g;

            push(@tokens, split(/\t/, $m));
        }
    }
    return @tokens;
}

if (@ARGV != 1) {
    die "Usage : ", $0, " EXCEPTIONS\n";
}

# Load the exception list; blank lines are ignored because an empty
# alternative in the regex below would match at every boundary.
open(my $excep_fh, "<", $ARGV[0])
    or die "impossible d'ouvrir ", $ARGV[0], " : ", $!;
my %exception;
while (my $ligne = <$excep_fh>) {
    chomp($ligne);
    next if $ligne eq '';
    $exception{$ligne} = 1;
}
close($excep_fh);

# Longest exceptions first, so the alternation prefers the longest match.
my @excepqm = map { quotemeta($_) }
              sort { length($b) <=> length($a) } keys(%exception);

# An exception is recognised only when delimited on both sides by line edge,
# whitespace, "..." or punctuation/symbol other than '-' and '.'.  The right
# delimiter is a lookahead so it can serve as the left delimiter of the next
# exception.
my $protect_rx;
if (@excepqm) {
    my $excepre = join("|", @excepqm);
    $protect_rx = qr/(^|\p{IsNonPT}|\s|\.\.\.)(${excepre})(?=\p{IsNonPT}|\s|\.\.\.|$)/;
}
my $marker_rx = qr/\Q$EXCEP_OPEN\E|\Q$EXCEP_CLOSE\E/;

while (my $ligne = <STDIN>) {
    chomp($ligne);

    # Fence off exceptions so that the split below isolates them.
    if (defined($protect_rx)) {
        $ligne =~ s/$protect_rx/${1}${EXCEP_OPEN}${2}${EXCEP_CLOSE}/g;
    }

    my @res;
    foreach my $seg (split($marker_rx, $ligne)) {
        if (exists($exception{$seg})) {
            push(@res, $seg);
        }
        else {
            push(@res, tokenise_segment($seg));
        }
    }

    # Drop the empty strings produced by the various splits.
    my @res1 = grep { /./ } @res;
    if (@res1) {
        print join("\n", @res1), "\n";
    }
}