use strict; use locale; if ($#ARGV != 0){ die "Usage : ", $0, " EXCEPTIONS\n"; } open(EXCEP, "<", $ARGV[0]) or die "impossible d'ouvrir ", $ARGV[0]; my %exception; while (my $ligne = ){ chomp($ligne); $ligne=~s/\r//; $exception{$ligne} = 1; } close(EXCEP); my @excepqm; foreach my $e (sort {length($b) <=> length($a)} keys (%exception)){ push(@excepqm, quotemeta($e)); } my $excepre = join("|", @excepqm); #print $excepre,"\n"; while(my $ligne = ){ chomp($ligne); $ligne=~s/\r//; $ligne =~ s/qu\'/qu\' /g; $ligne =~ s/d\'/d\' /g; #$ligne =~ s/, /, /g; &handleLine ($ligne); } # while () close (FICHIER); sub handleLine () { my $ligne = shift; if ($ligne =~ /<.*?>/) { my $leftPart = $`; my $rightPart = $'; #' my $tag = $&; &handleLine ($leftPart); &handleTag ($tag); &handleLine ($rightPart); } # if ($line =~ /<.*?>/) else { &handleText ($ligne); } } # handleArticleLine () sub handleText () { my $ligne = shift; my $ligneseg; while($ligne =~ /(^|\p{InNonPT}|\s|\.\.\.)(${excepre})(\p{InNonPT}|\.|\s|\.\.\.|$)/ ) { $ligneseg .= $`.$1."".$2.""; $ligne = $3.$'; } $ligneseg .= $ligne; my @res; foreach my $seg (split(//, $ligneseg)){ if (defined($exception{$seg})){ push(@res, $seg); } else { $seg =~ s/(\d)\s(\d)/$1$2/g; foreach my $m (split(/(\p{InNonAmb}|\s|\.\.\.)/, $seg)){ if ($m eq "..." or $m =~ /^\p{InNonAmb}$/){ push(@res, $m); } elsif ($m !~ /^\s*$/){ $m =~ s/(-[nz]-)(\PL*)/\t$1\t$2/ig; #pas d'espace $m =~ s/(\PL|^)([dlmnst]\')/$1\t$2\t/ig; #espace avant $m =~ s/(\PL|^)(\pL*[qnv][us]\')/$1\t$2\t/ig; #espace avant $m =~ s/(\PL|^)(\pL*[q][u]\')/$1\t$2\t/ig; #espace avant $m =~ s/(\PL|^)(\pL*[e][n][t]\')/$1\t$2\t/ig; #espace avant $m =~ s/(\PL|^)(\pL*[çcbzu]\')/$1\t$2\t/ig; #espace avant $m =~ s/([\pL\d]+\.([\pL\d]+\.)+)/\t$1\t/g; #espace avant et après $m =~ s/\.($|\PL)/\t./; #??? $m =~ s/(\D|^),/$1\t,\t/g; #??? $m =~ s/,($|\D)/\t,\t$1/g; #??? $m =~ s/-(vos|ne|[st][eu]?'?|l[aoi']s?|me|d'|en|[nv]os|u)($|\PL)/\t-$1\t$2/ig; #espace après $m =~ s/\'([unv]\pL*)($|\PL)/\t\'$1\t$2/ig; #règle pour 'u 'us 'n 'v 'ns 'vs... #espace après $m =~ s/\'([dlmnsti])($|\PL)/\t\'$1\t$2/ig; #règle pour 'm 't 'i 's 'ac ... #espace après $m =~ s/(\pP)(\pP)/\t$1\t$2\t/g; #??? $m =~ s// /g; #??? $m =~ s/([<>])/\t$1\t/g; #??? #$m =~ s/\.([A-Z]\pL*)($|\PL)/\t\.\t$1\t$2/g; #règle pour . suivi d'une majuscule push(@res, (split(/\t/, $m))); } } } } my @res1 = grep(/./, @res); if (@res1){ print join("\n", @res1), "\n" ; } sub InNonPT{ return "+utf8::P\n+utf8::S\n-002D\n-002E" } sub InNonAmb{ return "+utf8::P\n+utf8::S\n-0027\n-002C\n-002D\n-002E\n-003C\n-003E" } } # handleText () sub handleTag () { my $tag = shift; print "$tag\n"; } # handleTag ()