use strict;
use locale;

if ($#ARGV != 0){ 
    die "Usage : ", $0, " EXCEPTIONS\n";
}
open(EXCEP, "<", $ARGV[0]) or die "impossible d'ouvrir ", $ARGV[0];
my %exception;
while (my $ligne = <EXCEP>){
    chomp($ligne);
    $ligne=~s/\r//;
    $exception{$ligne} = 1;
}
close(EXCEP);
my @excepqm;
foreach my $e (sort {length($b) <=> length($a)} keys (%exception)){
    push(@excepqm, quotemeta($e));
}

my $excepre = join("|", @excepqm);
#print $excepre,"\n";
while(my $ligne = <STDIN>){
    chomp($ligne);
    $ligne=~s/\r//;
    $ligne =~ s/qu\'/qu\' /g;
    $ligne =~ s/d\'/d\' /g;
    #$ligne =~ s/, /,  /g;
    &handleLine ($ligne);
} # while (<FICHIER_WIKI>)

close (FICHIER);

sub handleLine ()
{
    my $ligne = shift;

    if ($ligne =~ /<.*?>/)
    {
	my $leftPart = $`;
	my $rightPart = $'; #'
	my $tag = $&;

	&handleLine ($leftPart);
	&handleTag ($tag);
	&handleLine ($rightPart);
    } # if ($line =~ /<.*?>/)
    else
    {
	&handleText ($ligne);
    }
} # handleArticleLine ()


sub handleText ()
{
my $ligne = shift;

my $ligneseg;
    while($ligne =~ /(^|\p{InNonPT}|\s|\.\.\.)(${excepre})(\p{InNonPT}|\.|\s|\.\.\.|$)/ ) {
	$ligneseg .= $`.$1."<PPLcut>".$2."<PPLcut>";
	$ligne = $3.$';
    }
	$ligneseg .= $ligne;
    my @res;
    foreach my $seg (split(/<PPLcut>/, $ligneseg)){
	if (defined($exception{$seg})){
	    push(@res, $seg);
	}
	else {
	    $seg =~ s/(\d)\s(\d)/$1<PPLesp>$2/g;
	    foreach my $m (split(/(\p{InNonAmb}|\s|\.\.\.)/, $seg)){
		if ($m eq "..." or $m =~ /^\p{InNonAmb}$/){
		    push(@res, $m);
		}
		elsif ($m !~ /^\s*$/){
		$m =~ s/(-[nz]-)(\PL*)/\t$1\t$2/ig;		#pas d'espace
		$m =~ s/(\PL|^)([dlmnst]\')/$1\t$2\t/ig;	#espace avant
		$m =~ s/(\PL|^)(\pL*[qnv][us]\')/$1\t$2\t/ig;	#espace avant
		$m =~ s/(\PL|^)(\pL*[q][u]\')/$1\t$2\t/ig;	#espace avant
		$m =~ s/(\PL|^)(\pL*[e][n][t]\')/$1\t$2\t/ig;	#espace avant
		$m =~ s/(\PL|^)(\pL*[cbzu]\')/$1\t$2\t/ig;	#espace avant
		$m =~ s/([\pL\d]+\.([\pL\d]+\.)+)/\t$1\t/g;	#espace avant et aprs
		$m =~ s/\.($|\PL)/\t./;	#???
		$m =~ s/(\D|^),/$1\t,\t/g;	#???
		$m =~ s/,($|\D)/\t,\t$1/g;	#???
		$m =~ s/-(vos|ne|[st][eu]?'?|l[aoi']s?|me|d'|en|[nv]os|u)($|\PL)/\t-$1\t$2/ig;	#espace aprs    
		$m =~ s/\'([unv]\pL*)($|\PL)/\t\'$1\t$2/ig; #rgle pour 'u 'us 'n 'v 'ns 'vs...	#espace aprs
		$m =~ s/\'([dlmnsti])($|\PL)/\t\'$1\t$2/ig; #rgle pour 'm 't 'i 's 'ac ...	#espace aprs
		$m =~ s/(\pP)(\pP)/\t$1\t$2\t/g; #???
		$m =~ s/<PPLesp>/ /g;	#???
		$m =~ s/([<>])/\t$1\t/g;	#???
		#$m =~ s/\.([A-Z]\pL*)($|\PL)/\t\.\t$1\t$2/g; #rgle pour . suivi d'une majuscule
		    push(@res, (split(/\t/, $m)));
		}
	    }
	}
    }
    my @res1 = grep(/./, @res);
    if (@res1){
         print join("\n", @res1), "\n" ;
    }

sub InNonPT{
    return "+utf8::P\n+utf8::S\n-002D\n-002E"
}
sub InNonAmb{
    return "+utf8::P\n+utf8::S\n-0027\n-002C\n-002D\n-002E\n-003C\n-003E"
}

} # handleText ()


sub handleTag ()
{
    my $tag = shift;

    print "$tag\n";
} # handleTag ()
