Newer
Older
calcule-offsets / calculeOffsetsEN.pl
@besagni besagni on 14 Aug 2020 29 KB Premier dépôt, version 0.6.2
#!/usr/bin/perl

# TEST AVEC 5D991C2718453AF12941533C23C84AF4A8B0F399.tei !!!!!

# Declaration of pragmas
use strict;
use utf8;
use open qw/:std :utf8/;

# Call of basic external modules
use Encode qw(decode_utf8 encode_utf8 is_utf8);
use Getopt::Long;

# Call of modules specific to the application
use Digest::MD5;
use XML::TokeParser;
use XML::Twig;

my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
my  $substitut  = " " x length($programme);
my $usage = "Usage : \n" .
            "    $programme -i input_file -e entities_file -o output_file \n" .
            "    $substitut [ -n new_input_file ] [ -d ] \n" .
            "    $programme -h \n\n";

my $version     = "0.6.2";
my $dateModif   = "14 Août 2020";

# Initialisation of global variables
# needed for the options
my $debug      = 0;
my $entities   = "";
my $input      = "";
my $help       = 0;
my $newfile    = "";
my $output     = "";

eval    {
        $SIG{__WARN__} = sub {usage(1);};
        GetOptions(
                "debug"             => \$debug,
                "entities=s"        => \$entities,
                "input=s"           => \$input,
                "help"              => \$help,
                "newfile=s"         => \$newfile,
                "output=s"          => \$output,
                );
        };
$SIG{__WARN__} = sub {warn $_[0];};

if ( $help ) {
        print "\nProgramme : \n";
        print "   “$programme”, version $version ($dateModif)\n";
        print "   Permet de générer un fichier JSON avec les offsets des entités \n";
        print "   nommées dans le fichier en entrée en utilisant un fichier où \n";
        print "   ces entitées ont été balisées. \n";
        print "\n";
        print $usage;
        print "\nOptions : \n";
        print "    -d  active le mode “débogage” \n";
        print "    -e  indique le nom du fichier au format TEI où les entitées \n";
        print "        nommées sont balisées \n";
        print "    -h  affiche cette aide. \n";
        print "    -i  indique le nom du fichier d’entrée, au format TEI, qui doit \n";
        print "        être le fichier présent sur le serveur Istex \n";
        print "    -n  crée un nouveau fichier d’entrée, identique à celui indiqué \n";
        print "        par l’option “-i”, mais où les entitées nommées sont balisées \n";
        print "    -o  indique le nom du fichier de sortie,  au format JSON, avec \n";
        print "        les offsets des entitées nommées \n";
        print " \n";

        exit 0;
        }

usage(2) if not $input or not $entities or not $output;

# Variables needed for processing the data
my $doi    = "";
my $header = "";
my $istex  = "";
my $length = 0;
my $path   = "";
my $pii    = "";
my $pmid   = "";
my $raw    = "";
my $start  = 0;
my $status = 'out';
my $tei    = "";
my $text   = "";
my @lines  = ();
my @offset = ();
my %dejavu = ();
my %limite = ();

# Inirtializing MD5
my $md5 = Digest::MD5->new;

open(INP, "<:raw", $input) or die "$!,";
while(<INP>) {
        $md5->add($_);
        my $line = decode_utf8($_);
        push @lines, $line;
        push @offset, $length;
        $length += length($line);
        $line =~ s/[\s\r]/ /go;
        $text .= $line;
        }
push @offset, $length;
close INP;

my $digest = $md5->hexdigest;

if ( $text =~ /<TEI\s/i ) {
        $header = $`;
        $tei = $& . $';
        $start = length($header);
        }
else    {
        print STDERR "$programme: Warning: no “TEI” element in file “$input”\n";
        exit 3;
        }


# Parsing original file
my $parser = XML::TokeParser->new(\$tei);
while( my $token = $parser->get_token() ) {
        $raw .= $token->raw;

        if ( $token->is_start_tag ) {
                my $tag = $token->tag;
                # Updating the path
                $path .= "/$tag";
                # We get the attributes from a hashref
                my %attributes = %{$token->attr};

                if ( $tag eq "titleStmt" ) {
                        $limite{'title'}{'start'} = $start + length(decode_utf8($raw)) + 1;
                        print STDERR "Début titre  : $limite{'title'}{'start'}\n" if $debug;
                        $status = 'in';
                        }
                elsif ( $tag eq "abstract" ) {
                        $limite{'abstract'}{'start'} = $start + length(decode_utf8($raw)) - length($token->raw);
                        print STDERR "Début résumé : $limite{'abstract'}{'start'}\n" if $debug;
                        $status = 'in';
                        }
                elsif ( $tag eq "body" ) {
                        $limite{'body'}{'start'} = $start + length(decode_utf8($raw)) + 1;
                        print STDERR "Début texte  : $limite{'body'}{'start'}\n" if $debug;
                        $status = 'in';
                        }
                elsif ( $tag eq "idno" ) {
                        if ( defined $attributes{'type'} ) {
                                my $id = $parser->get_text();
                                $istex = $id if $attributes{'type'} eq 'istex';
                                $doi = $id if $attributes{'type'} eq 'DOI';
                                $pii = $id if $attributes{'type'} eq 'PII';
                                $pmid = $id if $attributes{'type'} eq 'PMID';
                                }
                        }
                elsif ( $status eq 'in' ) {
                        my $string = $token->raw;
                        my $position = $start + length($raw) - length($token->raw);
                        push(@{$dejavu{"S:$tag"}{$string}}, $position);
                        }
                }

        elsif ( $token->is_end_tag ) {
                my $tag = $token->tag;

                if ( $tag eq "titleStmt" ) {
                        $limite{'title'}{'end'} = $start + length(decode_utf8($raw)) - length($token->raw) + 1;
                        print STDERR "Fin titre    : $limite{'title'}{'end'}\n" if $debug;
                        $status = 'out';
                        }
                elsif ( $tag eq "abstract" ) {
                        $limite{'abstract'}{'end'} = $start + length(decode_utf8($raw)) + 1;
                        print STDERR "Fin résumé   : $limite{'abstract'}{'end'}\n" if $debug;
                        $status = 'out';
                        }
                elsif ( $tag eq "body" ) {
                        $limite{'body'}{'end'} = $start + length(decode_utf8($raw)) - length($token->raw) + 1;
                        print STDERR "Fin texte    : $limite{'body'}{'end'}\n" if $debug;
                        $status = 'out';
                        }
                elsif ( $status eq 'in' ) {
                        my $string = $token->raw;
                        my $position = $start + length($raw) - length($token->raw);
                        push(@{$dejavu{"E:$tag"}{$string}}, $position);
                        }

                $path =~ s|/\w+\z||o;
                }
        }

# Other variables
my $element  = "";
my $endtag   = "";
my $entity   = "";
my $erase    = "";
my $ok       = undef;
my $original = validateUtf8($text);;
my $starttag = "";
my $string   = "";
my $type     = "";
my @errors   = ();
my %offset   = ();

open(ORG, ">:utf8", "original.txt") or die "$!,";
print ORG "$original\n";
close ORG;

# Reinitializing variables
$path = $raw = $text = "";
$status = 'out';

# Getting the title
my $title = substr($original, $limite{'title'}{'start'} - 1, $limite{'title'}{'end'} - $limite{'title'}{'start'});
print STDERR "Titre : \"$title\"\n" if $debug;
my $twig = XML::Twig->new(
                   twig_handlers => {
                        'title[@type="main"]'         => sub {$string = $_->sprint;},
                        'title[@level="a"]'         => sub {$string = $_->sprint;},
                        },
                 );
$twig->parse("<text>$title</text>");
$twig->purge;

$title = $string;
$title =~ s|^<title [^>]+>||o;
$title =~ s|</title>||o;
$title =~ s/\s+/ /go;
$title =~ s/^\s+//o;
$title =~ s/\s+\z//o;

# Generating a simpler version of
# the original text
my $simpler = join(" ", map {s/<[^>]+>/<>/go; $_;} ($original =~ /(<[^>]+>|\w+)/go));

open(SMP, ">:utf8", "simpler.txt") or die "$!,";
print SMP "$simpler\n";
close SMP;

# List of named entity tag
my %entity = (
        'date'      => 1,
        'geogName'  => 1,
        'orgName'   => 1,
        'persName'  => 1,
        'placeName' => 1,
        'ref'       => {'type' => 'bibl'},
        'roleName'  => 1,
        );
my %position = ();

open(INP, "<:raw", $entities) or die "$!,";
while(<INP>) {
        $text .= $_;
        }
close INP;

# Parsing annotated file
$parser = XML::TokeParser->new(\$text);
while( my $token = $parser->get_token() ) {
        $string = $token->raw;
        $entity .= $string if $status eq 'in';
        if ( $token->is_start_tag ) {
                my $tag = $token->tag;
                # Updating the path
                $path .= "/$tag";

                if ( $path =~ m|/titleStmt/|o or
                     $path =~ m|/abstract/|o or
                     $path =~ m|/body/|o ) {
                        $ok = 1;
                        }
                else    {
                        $ok = 0;
                        }
                # We get the attributes from a hashref
                my %attributes = %{$token->attr};

                if ( $entity{$tag} and not ref($entity{$tag}) ) {
                        $element = $tag;
                        $starttag = $string;
                        $entity = "";
                        $status = 'in';
                        if ( defined $attributes{'type'} ) {
                                $type = $attributes{'type'};
                                }
                        else    {
                                $type = "";
                                }
                        }
                # Problème pour faire une condition qui ne se base que sur le hachage
                # “%entity” et qui permette de n'avoir que le cas souhaité pour ne pas
                # géner la condition suivante.
                # elsif ( $entity{$tag} and ref($entity{$tag}) eq 'HASH' ) {
                elsif ( $tag eq 'ref' and defined $attributes{'type'} and $attributes{'type'} eq 'bibl' ) {
                        $element = $tag;
                        $starttag = $string;
                        $entity = "";
                        $status = 'in';
                        $type = 'bibl';
                        }
                elsif ( $ok ) {
                        # print "Balive ouvrante \"$string\"\n";
                        my $position = $start + length($string) + 1;
                        if ( defined $dejavu{"S:$tag"} ) {
                                if ( defined $dejavu{"S:$tag"}{$string} ) {
                                        my @tmp = @{$dejavu{"S:$tag"}{$string}};
                                        my $match = 0;
                                        foreach my $item (@tmp) {
                                                if ( $item == $position ) {
                                                        # print "Balise \"$string\" trouvé à offset $position\n";
                                                        $match ++;
                                                        }
                                                }
                                        # print "Pas de correspondance pour la balise \"$string\"\n" if not $match;
                                        }
                                else    {
                                        my @tmp = ();
                                        my $match = 0;
                                        foreach my $item (keys %{$dejavu{"S:$tag"}}) {
                                                push(@tmp, @{$dejavu{"S:$tag"}{$item}});
                                                }
                                        foreach my $item (@tmp) {
                                                if ( $item == $position ) {
                                                        # print "Balise \"$string\" trouvé à offset $position\n";
                                                        $match ++;
                                                        }
                                                }
                                        # print "Pas de correspondance pour une balise \"$tag\"\n" if not $match;
                                        }

                                }
                        else    {
                                $erase = $path;
                                $string = "";
                                }
                        }
                }
        elsif ( $token->is_end_tag ) {
                my $tag = $token->tag;

                if ( $entity{$tag} and $tag eq $element ) {
                        $entity = substr($entity, 0, length($entity) - length($string));
                        $status = recherche();
                        $element = "";
                        $endtag = $string;
                        # $status = 'leaving';
                        }
                elsif ( $erase eq $path ) {
                        $erase = "";
                        $string = "";
                        }

                $path =~ s|/\w+\z||o;
                }
        if ( $status eq 'out' ) {
                $raw .= $string;
                }
        elsif ( $status eq 'leaving' ) {
                $raw .= $entity;
                $status = 'out';
                }
        }

# If errors ...
$input =~ s|^(?:.*/)?(.+)|$1|;

if ( @errors ) {
        print STDERR "Fichier \"$input\" : \n";
        foreach my $error (@errors) {
                print STDERR "\t -> $error \n";
                }
        print STDERR " \n";

        exit 4;
        }
else    {
        open(OUT, ">:utf8", $output) or die "$!,";
        print OUT "{\n";
        print OUT "  \"Fichier\": \"$input\",\n";
        print OUT "  \"Titre\": \"" . valid($title). "\",\n";
        my @tmp = ();
        if ( $istex ) {
                push(@tmp, "    \"Istex\": \"$istex\"");
                }
        if ( $doi ) {
                push(@tmp, "    \"DOI\": \"" . valid($doi). "\"");
                }
        if ( $pii ) {
                push(@tmp, "    \"PII\": \"" . valid($pii). "\"");
                }
        if ( $pmid ) {
                push(@tmp, "    \"PMID\": \"" . valid($pmid). "\"");
                }
        if ( @tmp ) {
                print OUT "  \"Identifiant\": \{\n";
                while ( my $item = shift @tmp ) {
                        print OUT $item, @tmp ? "," : "", "\n";
                        }
                print OUT "  \},\n";
                }
        print OUT "  \"Empreinte MD5\": \"$digest\",\n";
        if ( %offset ) {
                print OUT "  \"Entités nommées\": \[\n";
                my @entities = sort keys %offset;
                while ( $entity = shift @entities ) {
                        print OUT "    \{\n";
                        print OUT "      \"Type\": \"$entity\",\n";
                        print OUT "      \"Liste\": \[\n";
                        my @ne = @{$offset{$entity}};
                        while ( my $ne = shift @ne ) {
                                print OUT "        \{\n";
                                print OUT "          \"Expression\": \"" . valid($ne->{'ne'}). "\",\n";
                                if ( defined $ne->{'string'} ) {
                                        print OUT "          \"Texte\": \"" . valid($ne->{'string'}). "\",\n";
                                        }
                                print OUT "          \"Position\": \{\n";
                                print OUT "            \"Début\": \"$ne->{'debut'}\",\n";
                                print OUT "            \"Fin\": \"$ne->{'fin'}\"\n";
                                print OUT "          \},\n";
                                print OUT "          \"Offsets\": \{\n";
                                print OUT "            \"Caractères\": \"$ne->{'characters'}\",\n";
                                print OUT "            \"Octets\": \"$ne->{'octets'}\"\n";
                                print OUT "          \}\n";
                                print OUT "        \}", @ne ? "," : "", "\n";
                                }
                        print OUT "      \]\n";
                        print OUT "    \}", @entities ? "," : "", "\n";
                        }
                print OUT "  \]\n";
                }
        else    {
                print OUT "  \"Entités nommées\": \[\]\n";
                }
        print OUT "}\n";
        close OUT;
        }

# If necessary, create a new annotated file combining
# the original file and the file of named entities
if ( $newfile ) {
        my %position = "";
        foreach $entity (keys %offset) {
                $starttag = "<$entity>";
                $endtag = "</$entity>";
                if ( $entity =~ /^(\w+) type:(\w+)\z/o ) {
                        $starttag = "<$1 type=\"$2\">";
                        $endtag = "</$1>";
                        }
                foreach my $ne (@{$offset{$entity}}) {
                        my ($line, $char) = ($ne->{'debut'} =~ /^ligne (\d+), caractère (\d+)\z/o);
                        $position{$line}{$char} = $starttag;
                        ($line, $char) = $ne->{'fin'} =~ /^ligne (\d+), caractère (\d+)\z/o;
                        $position{$line}{$char + 1} = $endtag;
                        }
                }
        foreach my $line (keys %position) {
                my @chars = split(//, $lines[$line - 1]);
                foreach my $char (sort {$b <=> $a} keys %{$position{$line}}) {
                        splice(@chars, $char - 1, 0, $position{$line}{$char});
                        }
                $lines[$line - 1] = join("", @chars);
                }
        open(NEW, ">:utf8", $newfile) or die "$!,";
        foreach my $line (@lines) {
                print NEW $line;
                }
        close NEW;
        }

exit 0;


sub usage
{
print STDERR $usage;

exit shift;
}

sub recherche
{
if ( $path !~ m{/(abstract|body|titleStmt)/}o ) {
        $raw .= $starttag . $entity;
        return 'out';
        }

my $chaine = validateUtf8($entity);
my $utf8 = validateUtf8($raw);
my $dejaVu = undef;

$chaine = join(" ", ($chaine =~ /(\S+)/go));
print STDERR "Entité : \"$chaine\"\n" if $debug;
# Regular expression for the named entity
# my $regex2 = join("\\s*", map {s/\W/\\W+/go; $_;} ($chaine =~ /(\p{IsAlnum}+|\P{IsSpace})/go));
# $regex2 = ligature($regex2);
# $regex2 =~ s/\\W\+\\s\*amp\\s\*\\W\+/$&(?:amp\\s*\\W)?/go;
my $regex2 = join("\\s*", map {s/\W/\\W/go; $_;} ($chaine =~ /(\p{IsAlnum}+|\P{IsSpace})/go));
$regex2 = ligature($regex2);
$regex2 =~ s/\\W\\s\*amp\\s\*\\W/$&(?:amp\\s*\\W)?/go;
my @words = $utf8 =~ /(<[^>]+>|\p{IsAlnum}+)/go;
splice(@words, 0, -10);
my @all = @words;
while ( 1 ) {
        # Taking into account entity "&amp;" mistakenly
        # written "&amp;amp;" in the original file
        my $regex1 = join("\\W+", @words);
        $regex1 =~ s/\bamp\b/amp(?:\\W+amp)?/go;
        $regex1 = ligature($regex1);
        if ( $regex1 =~ /<[^>]+>/ ) {
                $regex1 =~ s|(\\W\+)?<[^>]+>(\\W\+)?|\\W*<[^>]+>\\W*|go;
                }
                if ( $regex1 =~ /\\W\*\\W\*/ ) {
                        $regex1 =~ s|\\W\*(\\W\*)+|\\W*|go;
                        }
        print STDERR "Recherche \"", join(" ", @words, ($chaine =~ /(\S+)/go)), "\"\n" if $debug;

        my @results = $original =~ /$regex1\W*$regex2/g;
        if ( $#results == 0 ) {
                $original =~ /($regex1\W*)($regex2)/;
                offset($2, $1, $`, $chaine);
                last;
                }
        elsif ( $#results > 0 ) {
                if ( not $dejaVu ) {
                        $dejaVu = 1;
                        my ($code, $intruder) = intruder($regex2, @all) if not $dejaVu;
                        if ( $code > 0 ) {
                                print STDERR "INTRUS [$code] \"$intruder\"\n";
                                @words = @all;
                                splice(@words, $code, 0, $intruder);
                                next;
                                }
                        elsif ( $code < 0 ) {
                                print STDERR "INTRUS [$code] \"$intruder\"\n";
                                @words = @all;
                                @words[ - $code] = $intruder;
                                next;
                                }
                        else    {
                                last if occurrence($regex2);
                                }
                        }
                my $message = "Résultats multiples pour \"" . join(" ", ($chaine =~ /(\S+)/go)) . "\" [texte original]";
                push(@errors, $message);
                last;
                }
        else    {
                $regex1 =~ s/\\W\*\<\[\^\>\]\+>\\W\*/\\W*/go;
                (my $regex3 = $regex2) =~ s/(\\[Ws][?*+]?)+/\\W*/go;
                @results = $simpler =~ /$regex1\W*$regex3/g;
                if ( $#results == 0 ) {
                        $simpler =~ /($regex1\W*)($regex3)/;
                        my $match1 = $1;
                        my $match2 = $2;
                        $match1 =~ s/<>/<[^>]+>/go;
                        $match1 =~ s/ /\\W*/go;
                        $match2 =~ s/\P{IsAlnum}+\z//o;
                        $match2 =~ s/<>/<[^>]+>/go;
                        $match2 =~ s/ /\\W*/go;
                        if ( $chaine =~ /^[^\p{IsAlnum}\p{IsSpace}]/ ) {
                                $match2 = '\W\s*(?:<[^>]+>)?\s*' . $match2;
                                }
                        if ( $chaine =~ /[^\p{IsAlnum}\p{IsSpace}]\z/ ) {
                                $match2 .= '\s*(?:<[^>]+>)?\s*\W';
                                }
                        if ( $original =~ /($match1)($match2)/ ) {
                                offset($2, $1, $`, $chaine);
                                last;
                                }
                        }
                elsif ( $#results > 0 ) {
                        if ( not $dejaVu ) {
                                $dejaVu = 1;
                                my ($code, $intruder) = intruder($regex2, @all);
                                if ( $code > 0 ) {
                                        print STDERR "INTRUS [$code] \"$intruder\"\n";
                                        @words = @all;
                                        splice(@words, $code, 0, $intruder);
                                        next;
                                        }
                                if ( $code < 0 ) {
                                        print STDERR "INTRUS [$code] \"$intruder\"\n";
                                        @words = @all;
                                        @words[ - $code] = $intruder;
                                        next;
                                        }
                                }
                        my $message = "Résultats multiples pour \"" . join(" ", ($chaine =~ /(\S+)/go)) . "\" [texte simple]";
                        push(@errors, $message);
                        last;
                        }
                elsif ( $#words < 0 ) {
                        if ( not $dejaVu ) {
                                $dejaVu = 1;
                                if ( $regex2 =~ /\\W/o ) {
                                        $regex2 =~ s/\\W/\\W?/go;
                                        @words = @all;
                                        next;
                                        }
                                }
                        my $message = "Pas de résultat pour \"" . join(" ", ($chaine =~ /(\S+)/go)) . "\"";
                        push(@errors, $message);
                        }
                }

        last if not @words;
        shift @words;
        }

return 'leaving';
}

sub offset
{
my ($ne, $match, $before, $expression) = @_;

# Petit correctif pour les cas où on récupère un blanc final
$ne =~ s/\s+\z//o;

my $offset   = length($before) + length($match);
my $size     = length($ne);
my $position = $offset + 1;
my $nb       = 0;
my %tmp      = ();

for ( ; $nb < $#offset ; $nb ++ ) {
        if ( $position > $offset[$nb] and $position < $offset[$nb + 1] ) {
                my $line = $nb + 1;
                $tmp{'debut'} = sprintf "ligne %d, caractère %d", $line, $position - $offset[$nb];
                for ( ; $nb < $#offset ; $nb ++ ) {
                        if ( $offset + $size > $offset[$nb] and $offset + $size < $offset[$nb + 1] ) {
                                $tmp{'fin'} = sprintf "ligne %d, caractère %d", $nb + 1, $offset + $size - $offset[$nb];
                                $nb = $#offset;
                                }
                        }
                }
        }
$tmp{'characters'} = sprintf "%d - %d", $offset + 1, $offset + $size;
$offset = length(encode_utf8($before . $match));
$size = length(encode_utf8($ne));
$tmp{'octets'} = sprintf "%d - %d", $offset + 1, $offset + $size;

if ( %tmp ) {
        $tmp{'ne'} = $expression;
        if ( $expression ne $ne ) {
                $tmp{'string'} = $ne;
                }
        my $value = $element;
        if ( $type ) {
                $value .= " type:$type";
                }
        push(@{$offset{$value}}, \%tmp);
        }
else    {
        push(@errors, "Entité nommée \"$ne\" introuvable dans le fichier original");
        }
}

sub offset2
{
my ($ne, $position) = @_;

my $size     = length($ne);
my $offset   = $position - $size + 1;
my $nb       = 0;
my %tmp      = ();

for ( ; $nb < $#offset ; $nb ++ ) {
        if ( $offset > $offset[$nb] and $offset < $offset[$nb + 1] ) {
                my $line = $nb + 1;
                $tmp{'debut'} = sprintf "ligne %d, caractère %d", $line, $offset - $offset[$nb];
                for ( ; $nb < $#offset ; $nb ++ ) {
                        if ( $position > $offset[$nb] and $position < $offset[$nb + 1] ) {
                                $tmp{'fin'} = sprintf "ligne %d, caractère %d", $nb + 1, $position - $offset[$nb];
                                $nb = $#offset;
                                }
                        }
                }
        }
$tmp{'characters'} = sprintf "%d - %d", $offset, $position;
my $before = substr($original, 0, $offset - 1);
$offset = length(encode_utf8($before));
$size = length(encode_utf8($ne));
$tmp{'octets'} = sprintf "%d - %d", $offset + 1, $offset + $size;

if ( %tmp ) {
        $tmp{'ne'} = $ne;
        my $value = $element;
        if ( $type ) {
                $value .= " type:$type";
                }
        push(@{$offset{$value}}, \%tmp);
        }
else    {
        push(@errors, "Entité nommée \"$ne\" introuvable dans le fichier original");
        }
}

sub validateUtf8
{
my $chaine = shift;

my $valide = "";
if ( is_utf8($chaine, Encode::FB_QUIET) ) {
        $valide = $chaine;
        }
else    {
        $valide = decode_utf8($chaine);
        }

return $valide
}

sub valid
{
my $json = shift;

$json =~ s/\\/\\\\/go;
$json =~ s/"/\\"/go;

return $json;
}

sub ligature
{
my $chaine = shift;

if ( $chaine =~ /f(f|i|l|fi|fl)/o ) {
        $chaine =~ s/ff/(?:ff|\x{FB00})/go;
        $chaine =~ s/fi/(?:fi|\x{FB01})/go;
        $chaine =~ s/fl/(?:fl|\x{FB02})/go;
        $chaine =~ s/ffi/(?:ffi|\x{FB03})/go;
        $chaine =~ s/ffl/(?:ffl|\x{FB04})/go;
        }
if ( $chaine =~ /[äéöü]/ ) {
        $chaine =~ s/ä/(?:ä|a\x{0308})/go;
        $chaine =~ s/é/(?:é|e\x{0301})/go;
        $chaine =~ s/ö/(?:ö|o\x{0308})/go;
        $chaine =~ s/ü/(?:ü|u\x{0308})/go;
        }
elsif ( $chaine =~ /[aou]\x{0308}/o ){
        $chaine =~ s/a\x{0308}/(?:ä|a\x{0308})/go;
        $chaine =~ s/o\x{0308}/(?:ö|o\x{0308})/go;
        $chaine =~ s/u\x{0308}/(?:ü|u\x{0308})/go;
        }
elsif ( $chaine =~ /e\x{0301}/o ){
        $chaine =~ s/e\x{0301}/(?:é|e\x{0301})/go;
        }

return $chaine;
}

sub intruder
{
my ($regex, @words) = @_;

for ( my $nb = 10 ; $nb > 0 ; $nb -- ) {

        my @tmp = @words;
        splice(@tmp, $nb, 0, '(.+?)');

        my $tmp = join("\\W+", map {$_ eq 'amp' ? 'amp(?:\W+amp)?' : $_;} @tmp);
        $tmp = ligature($tmp);

        my @results = $original =~ /$tmp\W*$regex/g;
        if ( $#results == 0 ) {
                my $intruder = $1;
                my $nbWords =()= $intruder =~ /\p{IsAlnum}+/go;
                return ($nb, $intruder) if $nbWords < 10;
                }
        elsif ( $#results < 0 ) {
                @results = $simpler =~ /$tmp\W*$regex/g;
                if ( $#results == 0 ) {
                        my $intruder = $1;
                        my $nbWords =()= $intruder =~ /\p{IsAlnum}+/go;
                        return ($nb, $intruder) if $nbWords < 10;
                        }
                }
        }

for ( my $nb = 9 ; $nb > 0 ; $nb -- ) {

        my @tmp = @words;
        $tmp[$nb] = '(.+?)';

        my $tmp = join("\\W+", map {$_ eq 'amp' ? 'amp(?:\W+amp)?' : $_;} @tmp);
        $tmp = ligature($tmp);

        my @results = $original =~ /$tmp\W*$regex/g;
        if ( $#results == 0 ) {
                my $intruder = $1;
                my $nbWords =()= $intruder =~ /\p{IsAlnum}+/go;
                return (- $nb, $intruder) if $nbWords < 10;
                }
        elsif ( $#results < 0 ) {
                @results = $simpler =~ /$tmp\W*$regex/g;
                if ( $#results == 0 ) {
                        my $intruder = $1;
                        my $nbWords =()= $intruder =~ /\p{IsAlnum}+/go;
                        return (- $nb, $intruder) if $nbWords < 10;
                        }
                }
        }

return 0;
}

sub occurrence
{
my $regex = shift;

my @match = ();
my @pos   = ();

my @raw = $raw =~ /$regex/g;
my $num = $#raw + 1;

while ( $original =~ /$regex/g ) {
        push(@match, $&);
        push(@pos, pos($original));
        }

if ( $pos[$num] ) {
        offset2($match[$num], $pos[$num]);
        return 1;
        }

return 0;
}