#!/usr/bin/perl # TEST AVEC 5D991C2718453AF12941533C23C84AF4A8B0F399.tei !!!!! # Declaration of pragmas use strict; use utf8; use open qw/:std :utf8/; # Call of basic external modules use Encode qw(decode_utf8 encode_utf8 is_utf8); use Getopt::Long; # Call of modules specific to the application use Digest::MD5; use XML::TokeParser; use XML::Twig; my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; my $substitut = " " x length($programme); my $usage = "Usage : \n" . " $programme -i input_file -e entities_file -o output_file \n" . " $substitut [ -n new_input_file ] [ -d ] \n" . " $programme -h \n\n"; my $version = "0.6.2"; my $dateModif = "14 Août 2020"; # Initialisation of global variables # needed for the options my $debug = 0; my $entities = ""; my $input = ""; my $help = 0; my $newfile = ""; my $output = ""; eval { $SIG{__WARN__} = sub {usage(1);}; GetOptions( "debug" => \$debug, "entities=s" => \$entities, "input=s" => \$input, "help" => \$help, "newfile=s" => \$newfile, "output=s" => \$output, ); }; $SIG{__WARN__} = sub {warn $_[0];}; if ( $help ) { print "\nProgramme : \n"; print " “$programme”, version $version ($dateModif)\n"; print " Permet de générer un fichier JSON avec les offsets des entités \n"; print " nommées dans le fichier en entrée en utilisant un fichier où \n"; print " ces entitées ont été balisées. \n"; print "\n"; print $usage; print "\nOptions : \n"; print " -d active le mode “débogage” \n"; print " -e indique le nom du fichier au format TEI où les entitées \n"; print " nommées sont balisées \n"; print " -h affiche cette aide. \n"; print " -i indique le nom du fichier d’entrée, au format TEI, qui doit \n"; print " être le fichier présent sur le serveur Istex \n"; print " -n crée un nouveau fichier d’entrée, identique à celui indiqué \n"; print " par l’option “-i”, mais où les entitées nommées sont balisées \n"; print " -o indique le nom du fichier de sortie, au format JSON, avec \n"; print " les offsets des entitées nommées \n"; print " \n"; exit 0; } usage(2) if not $input or not $entities or not $output; # Variables needed for processing the data my $doi = ""; my $header = ""; my $istex = ""; my $length = 0; my $path = ""; my $pii = ""; my $pmid = ""; my $raw = ""; my $start = 0; my $status = 'out'; my $tei = ""; my $text = ""; my @lines = (); my @offset = (); my %dejavu = (); my %limite = (); # Inirtializing MD5 my $md5 = Digest::MD5->new; open(INP, "<:raw", $input) or die "$!,"; while(<INP>) { $md5->add($_); my $line = decode_utf8($_); push @lines, $line; push @offset, $length; $length += length($line); $line =~ s/[\s\r]/ /go; $text .= $line; } push @offset, $length; close INP; my $digest = $md5->hexdigest; if ( $text =~ /<TEI\s/i ) { $header = $`; $tei = $& . $'; $start = length($header); } else { print STDERR "$programme: Warning: no “TEI” element in file “$input”\n"; exit 3; } # Parsing original file my $parser = XML::TokeParser->new(\$tei); while( my $token = $parser->get_token() ) { $raw .= $token->raw; if ( $token->is_start_tag ) { my $tag = $token->tag; # Updating the path $path .= "/$tag"; # We get the attributes from a hashref my %attributes = %{$token->attr}; if ( $tag eq "titleStmt" ) { $limite{'title'}{'start'} = $start + length(decode_utf8($raw)) + 1; print STDERR "Début titre : $limite{'title'}{'start'}\n" if $debug; $status = 'in'; } elsif ( $tag eq "abstract" ) { $limite{'abstract'}{'start'} = $start + length(decode_utf8($raw)) - length($token->raw); print STDERR "Début résumé : $limite{'abstract'}{'start'}\n" if $debug; $status = 'in'; } elsif ( $tag eq "body" ) { $limite{'body'}{'start'} = $start + length(decode_utf8($raw)) + 1; print STDERR "Début texte : $limite{'body'}{'start'}\n" if $debug; $status = 'in'; } elsif ( $tag eq "idno" ) { if ( defined $attributes{'type'} ) { my $id = $parser->get_text(); $istex = $id if $attributes{'type'} eq 'istex'; $doi = $id if $attributes{'type'} eq 'DOI'; $pii = $id if $attributes{'type'} eq 'PII'; $pmid = $id if $attributes{'type'} eq 'PMID'; } } elsif ( $status eq 'in' ) { my $string = $token->raw; my $position = $start + length($raw) - length($token->raw); push(@{$dejavu{"S:$tag"}{$string}}, $position); } } elsif ( $token->is_end_tag ) { my $tag = $token->tag; if ( $tag eq "titleStmt" ) { $limite{'title'}{'end'} = $start + length(decode_utf8($raw)) - length($token->raw) + 1; print STDERR "Fin titre : $limite{'title'}{'end'}\n" if $debug; $status = 'out'; } elsif ( $tag eq "abstract" ) { $limite{'abstract'}{'end'} = $start + length(decode_utf8($raw)) + 1; print STDERR "Fin résumé : $limite{'abstract'}{'end'}\n" if $debug; $status = 'out'; } elsif ( $tag eq "body" ) { $limite{'body'}{'end'} = $start + length(decode_utf8($raw)) - length($token->raw) + 1; print STDERR "Fin texte : $limite{'body'}{'end'}\n" if $debug; $status = 'out'; } elsif ( $status eq 'in' ) { my $string = $token->raw; my $position = $start + length($raw) - length($token->raw); push(@{$dejavu{"E:$tag"}{$string}}, $position); } $path =~ s|/\w+\z||o; } } # Other variables my $element = ""; my $endtag = ""; my $entity = ""; my $erase = ""; my $ok = undef; my $original = validateUtf8($text);; my $starttag = ""; my $string = ""; my $type = ""; my @errors = (); my %offset = (); open(ORG, ">:utf8", "original.txt") or die "$!,"; print ORG "$original\n"; close ORG; # Reinitializing variables $path = $raw = $text = ""; $status = 'out'; # Getting the title my $title = substr($original, $limite{'title'}{'start'} - 1, $limite{'title'}{'end'} - $limite{'title'}{'start'}); print STDERR "Titre : \"$title\"\n" if $debug; my $twig = XML::Twig->new( twig_handlers => { 'title[@type="main"]' => sub {$string = $_->sprint;}, 'title[@level="a"]' => sub {$string = $_->sprint;}, }, ); $twig->parse("<text>$title</text>"); $twig->purge; $title = $string; $title =~ s|^<title [^>]+>||o; $title =~ s|</title>||o; $title =~ s/\s+/ /go; $title =~ s/^\s+//o; $title =~ s/\s+\z//o; # Generating a simpler version of # the original text my $simpler = join(" ", map {s/<[^>]+>/<>/go; $_;} ($original =~ /(<[^>]+>|\w+)/go)); open(SMP, ">:utf8", "simpler.txt") or die "$!,"; print SMP "$simpler\n"; close SMP; # List of named entity tag my %entity = ( 'date' => 1, 'geogName' => 1, 'orgName' => 1, 'persName' => 1, 'placeName' => 1, 'ref' => {'type' => 'bibl'}, 'roleName' => 1, ); my %position = (); open(INP, "<:raw", $entities) or die "$!,"; while(<INP>) { $text .= $_; } close INP; # Parsing annotated file $parser = XML::TokeParser->new(\$text); while( my $token = $parser->get_token() ) { $string = $token->raw; $entity .= $string if $status eq 'in'; if ( $token->is_start_tag ) { my $tag = $token->tag; # Updating the path $path .= "/$tag"; if ( $path =~ m|/titleStmt/|o or $path =~ m|/abstract/|o or $path =~ m|/body/|o ) { $ok = 1; } else { $ok = 0; } # We get the attributes from a hashref my %attributes = %{$token->attr}; if ( $entity{$tag} and not ref($entity{$tag}) ) { $element = $tag; $starttag = $string; $entity = ""; $status = 'in'; if ( defined $attributes{'type'} ) { $type = $attributes{'type'}; } else { $type = ""; } } # Problème pour faire une condition qui ne se base que sur le hachage # “%entity” et qui permette de n'avoir que le cas souhaité pour ne pas # géner la condition suivante. # elsif ( $entity{$tag} and ref($entity{$tag}) eq 'HASH' ) { elsif ( $tag eq 'ref' and defined $attributes{'type'} and $attributes{'type'} eq 'bibl' ) { $element = $tag; $starttag = $string; $entity = ""; $status = 'in'; $type = 'bibl'; } elsif ( $ok ) { # print "Balive ouvrante \"$string\"\n"; my $position = $start + length($string) + 1; if ( defined $dejavu{"S:$tag"} ) { if ( defined $dejavu{"S:$tag"}{$string} ) { my @tmp = @{$dejavu{"S:$tag"}{$string}}; my $match = 0; foreach my $item (@tmp) { if ( $item == $position ) { # print "Balise \"$string\" trouvé à offset $position\n"; $match ++; } } # print "Pas de correspondance pour la balise \"$string\"\n" if not $match; } else { my @tmp = (); my $match = 0; foreach my $item (keys %{$dejavu{"S:$tag"}}) { push(@tmp, @{$dejavu{"S:$tag"}{$item}}); } foreach my $item (@tmp) { if ( $item == $position ) { # print "Balise \"$string\" trouvé à offset $position\n"; $match ++; } } # print "Pas de correspondance pour une balise \"$tag\"\n" if not $match; } } else { $erase = $path; $string = ""; } } } elsif ( $token->is_end_tag ) { my $tag = $token->tag; if ( $entity{$tag} and $tag eq $element ) { $entity = substr($entity, 0, length($entity) - length($string)); $status = recherche(); $element = ""; $endtag = $string; # $status = 'leaving'; } elsif ( $erase eq $path ) { $erase = ""; $string = ""; } $path =~ s|/\w+\z||o; } if ( $status eq 'out' ) { $raw .= $string; } elsif ( $status eq 'leaving' ) { $raw .= $entity; $status = 'out'; } } # If errors ... $input =~ s|^(?:.*/)?(.+)|$1|; if ( @errors ) { print STDERR "Fichier \"$input\" : \n"; foreach my $error (@errors) { print STDERR "\t -> $error \n"; } print STDERR " \n"; exit 4; } else { open(OUT, ">:utf8", $output) or die "$!,"; print OUT "{\n"; print OUT " \"Fichier\": \"$input\",\n"; print OUT " \"Titre\": \"" . valid($title). "\",\n"; my @tmp = (); if ( $istex ) { push(@tmp, " \"Istex\": \"$istex\""); } if ( $doi ) { push(@tmp, " \"DOI\": \"" . valid($doi). "\""); } if ( $pii ) { push(@tmp, " \"PII\": \"" . valid($pii). "\""); } if ( $pmid ) { push(@tmp, " \"PMID\": \"" . valid($pmid). "\""); } if ( @tmp ) { print OUT " \"Identifiant\": \{\n"; while ( my $item = shift @tmp ) { print OUT $item, @tmp ? "," : "", "\n"; } print OUT " \},\n"; } print OUT " \"Empreinte MD5\": \"$digest\",\n"; if ( %offset ) { print OUT " \"Entités nommées\": \[\n"; my @entities = sort keys %offset; while ( $entity = shift @entities ) { print OUT " \{\n"; print OUT " \"Type\": \"$entity\",\n"; print OUT " \"Liste\": \[\n"; my @ne = @{$offset{$entity}}; while ( my $ne = shift @ne ) { print OUT " \{\n"; print OUT " \"Expression\": \"" . valid($ne->{'ne'}). "\",\n"; if ( defined $ne->{'string'} ) { print OUT " \"Texte\": \"" . valid($ne->{'string'}). "\",\n"; } print OUT " \"Position\": \{\n"; print OUT " \"Début\": \"$ne->{'debut'}\",\n"; print OUT " \"Fin\": \"$ne->{'fin'}\"\n"; print OUT " \},\n"; print OUT " \"Offsets\": \{\n"; print OUT " \"Caractères\": \"$ne->{'characters'}\",\n"; print OUT " \"Octets\": \"$ne->{'octets'}\"\n"; print OUT " \}\n"; print OUT " \}", @ne ? "," : "", "\n"; } print OUT " \]\n"; print OUT " \}", @entities ? "," : "", "\n"; } print OUT " \]\n"; } else { print OUT " \"Entités nommées\": \[\]\n"; } print OUT "}\n"; close OUT; } # If necessary, create a new annotated file combining # the original file and the file of named entities if ( $newfile ) { my %position = ""; foreach $entity (keys %offset) { $starttag = "<$entity>"; $endtag = "</$entity>"; if ( $entity =~ /^(\w+) type:(\w+)\z/o ) { $starttag = "<$1 type=\"$2\">"; $endtag = "</$1>"; } foreach my $ne (@{$offset{$entity}}) { my ($line, $char) = ($ne->{'debut'} =~ /^ligne (\d+), caractère (\d+)\z/o); $position{$line}{$char} = $starttag; ($line, $char) = $ne->{'fin'} =~ /^ligne (\d+), caractère (\d+)\z/o; $position{$line}{$char + 1} = $endtag; } } foreach my $line (keys %position) { my @chars = split(//, $lines[$line - 1]); foreach my $char (sort {$b <=> $a} keys %{$position{$line}}) { splice(@chars, $char - 1, 0, $position{$line}{$char}); } $lines[$line - 1] = join("", @chars); } open(NEW, ">:utf8", $newfile) or die "$!,"; foreach my $line (@lines) { print NEW $line; } close NEW; } exit 0; sub usage { print STDERR $usage; exit shift; } sub recherche { if ( $path !~ m{/(abstract|body|titleStmt)/}o ) { $raw .= $starttag . $entity; return 'out'; } my $chaine = validateUtf8($entity); my $utf8 = validateUtf8($raw); my $dejaVu = undef; $chaine = join(" ", ($chaine =~ /(\S+)/go)); print STDERR "Entité : \"$chaine\"\n" if $debug; # Regular expression for the named entity # my $regex2 = join("\\s*", map {s/\W/\\W+/go; $_;} ($chaine =~ /(\p{IsAlnum}+|\P{IsSpace})/go)); # $regex2 = ligature($regex2); # $regex2 =~ s/\\W\+\\s\*amp\\s\*\\W\+/$&(?:amp\\s*\\W)?/go; my $regex2 = join("\\s*", map {s/\W/\\W/go; $_;} ($chaine =~ /(\p{IsAlnum}+|\P{IsSpace})/go)); $regex2 = ligature($regex2); $regex2 =~ s/\\W\\s\*amp\\s\*\\W/$&(?:amp\\s*\\W)?/go; my @words = $utf8 =~ /(<[^>]+>|\p{IsAlnum}+)/go; splice(@words, 0, -10); my @all = @words; while ( 1 ) { # Taking into account entity "&" mistakenly # written "&amp;" in the original file my $regex1 = join("\\W+", @words); $regex1 =~ s/\bamp\b/amp(?:\\W+amp)?/go; $regex1 = ligature($regex1); if ( $regex1 =~ /<[^>]+>/ ) { $regex1 =~ s|(\\W\+)?<[^>]+>(\\W\+)?|\\W*<[^>]+>\\W*|go; } if ( $regex1 =~ /\\W\*\\W\*/ ) { $regex1 =~ s|\\W\*(\\W\*)+|\\W*|go; } print STDERR "Recherche \"", join(" ", @words, ($chaine =~ /(\S+)/go)), "\"\n" if $debug; my @results = $original =~ /$regex1\W*$regex2/g; if ( $#results == 0 ) { $original =~ /($regex1\W*)($regex2)/; offset($2, $1, $`, $chaine); last; } elsif ( $#results > 0 ) { if ( not $dejaVu ) { $dejaVu = 1; my ($code, $intruder) = intruder($regex2, @all) if not $dejaVu; if ( $code > 0 ) { print STDERR "INTRUS [$code] \"$intruder\"\n"; @words = @all; splice(@words, $code, 0, $intruder); next; } elsif ( $code < 0 ) { print STDERR "INTRUS [$code] \"$intruder\"\n"; @words = @all; @words[ - $code] = $intruder; next; } else { last if occurrence($regex2); } } my $message = "Résultats multiples pour \"" . join(" ", ($chaine =~ /(\S+)/go)) . "\" [texte original]"; push(@errors, $message); last; } else { $regex1 =~ s/\\W\*\<\[\^\>\]\+>\\W\*/\\W*/go; (my $regex3 = $regex2) =~ s/(\\[Ws][?*+]?)+/\\W*/go; @results = $simpler =~ /$regex1\W*$regex3/g; if ( $#results == 0 ) { $simpler =~ /($regex1\W*)($regex3)/; my $match1 = $1; my $match2 = $2; $match1 =~ s/<>/<[^>]+>/go; $match1 =~ s/ /\\W*/go; $match2 =~ s/\P{IsAlnum}+\z//o; $match2 =~ s/<>/<[^>]+>/go; $match2 =~ s/ /\\W*/go; if ( $chaine =~ /^[^\p{IsAlnum}\p{IsSpace}]/ ) { $match2 = '\W\s*(?:<[^>]+>)?\s*' . $match2; } if ( $chaine =~ /[^\p{IsAlnum}\p{IsSpace}]\z/ ) { $match2 .= '\s*(?:<[^>]+>)?\s*\W'; } if ( $original =~ /($match1)($match2)/ ) { offset($2, $1, $`, $chaine); last; } } elsif ( $#results > 0 ) { if ( not $dejaVu ) { $dejaVu = 1; my ($code, $intruder) = intruder($regex2, @all); if ( $code > 0 ) { print STDERR "INTRUS [$code] \"$intruder\"\n"; @words = @all; splice(@words, $code, 0, $intruder); next; } if ( $code < 0 ) { print STDERR "INTRUS [$code] \"$intruder\"\n"; @words = @all; @words[ - $code] = $intruder; next; } } my $message = "Résultats multiples pour \"" . join(" ", ($chaine =~ /(\S+)/go)) . "\" [texte simple]"; push(@errors, $message); last; } elsif ( $#words < 0 ) { if ( not $dejaVu ) { $dejaVu = 1; if ( $regex2 =~ /\\W/o ) { $regex2 =~ s/\\W/\\W?/go; @words = @all; next; } } my $message = "Pas de résultat pour \"" . join(" ", ($chaine =~ /(\S+)/go)) . "\""; push(@errors, $message); } } last if not @words; shift @words; } return 'leaving'; } sub offset { my ($ne, $match, $before, $expression) = @_; # Petit correctif pour les cas où on récupère un blanc final $ne =~ s/\s+\z//o; my $offset = length($before) + length($match); my $size = length($ne); my $position = $offset + 1; my $nb = 0; my %tmp = (); for ( ; $nb < $#offset ; $nb ++ ) { if ( $position > $offset[$nb] and $position < $offset[$nb + 1] ) { my $line = $nb + 1; $tmp{'debut'} = sprintf "ligne %d, caractère %d", $line, $position - $offset[$nb]; for ( ; $nb < $#offset ; $nb ++ ) { if ( $offset + $size > $offset[$nb] and $offset + $size < $offset[$nb + 1] ) { $tmp{'fin'} = sprintf "ligne %d, caractère %d", $nb + 1, $offset + $size - $offset[$nb]; $nb = $#offset; } } } } $tmp{'characters'} = sprintf "%d - %d", $offset + 1, $offset + $size; $offset = length(encode_utf8($before . $match)); $size = length(encode_utf8($ne)); $tmp{'octets'} = sprintf "%d - %d", $offset + 1, $offset + $size; if ( %tmp ) { $tmp{'ne'} = $expression; if ( $expression ne $ne ) { $tmp{'string'} = $ne; } my $value = $element; if ( $type ) { $value .= " type:$type"; } push(@{$offset{$value}}, \%tmp); } else { push(@errors, "Entité nommée \"$ne\" introuvable dans le fichier original"); } } sub offset2 { my ($ne, $position) = @_; my $size = length($ne); my $offset = $position - $size + 1; my $nb = 0; my %tmp = (); for ( ; $nb < $#offset ; $nb ++ ) { if ( $offset > $offset[$nb] and $offset < $offset[$nb + 1] ) { my $line = $nb + 1; $tmp{'debut'} = sprintf "ligne %d, caractère %d", $line, $offset - $offset[$nb]; for ( ; $nb < $#offset ; $nb ++ ) { if ( $position > $offset[$nb] and $position < $offset[$nb + 1] ) { $tmp{'fin'} = sprintf "ligne %d, caractère %d", $nb + 1, $position - $offset[$nb]; $nb = $#offset; } } } } $tmp{'characters'} = sprintf "%d - %d", $offset, $position; my $before = substr($original, 0, $offset - 1); $offset = length(encode_utf8($before)); $size = length(encode_utf8($ne)); $tmp{'octets'} = sprintf "%d - %d", $offset + 1, $offset + $size; if ( %tmp ) { $tmp{'ne'} = $ne; my $value = $element; if ( $type ) { $value .= " type:$type"; } push(@{$offset{$value}}, \%tmp); } else { push(@errors, "Entité nommée \"$ne\" introuvable dans le fichier original"); } } sub validateUtf8 { my $chaine = shift; my $valide = ""; if ( is_utf8($chaine, Encode::FB_QUIET) ) { $valide = $chaine; } else { $valide = decode_utf8($chaine); } return $valide } sub valid { my $json = shift; $json =~ s/\\/\\\\/go; $json =~ s/"/\\"/go; return $json; } sub ligature { my $chaine = shift; if ( $chaine =~ /f(f|i|l|fi|fl)/o ) { $chaine =~ s/ff/(?:ff|\x{FB00})/go; $chaine =~ s/fi/(?:fi|\x{FB01})/go; $chaine =~ s/fl/(?:fl|\x{FB02})/go; $chaine =~ s/ffi/(?:ffi|\x{FB03})/go; $chaine =~ s/ffl/(?:ffl|\x{FB04})/go; } if ( $chaine =~ /[äéöü]/ ) { $chaine =~ s/ä/(?:ä|a\x{0308})/go; $chaine =~ s/é/(?:é|e\x{0301})/go; $chaine =~ s/ö/(?:ö|o\x{0308})/go; $chaine =~ s/ü/(?:ü|u\x{0308})/go; } elsif ( $chaine =~ /[aou]\x{0308}/o ){ $chaine =~ s/a\x{0308}/(?:ä|a\x{0308})/go; $chaine =~ s/o\x{0308}/(?:ö|o\x{0308})/go; $chaine =~ s/u\x{0308}/(?:ü|u\x{0308})/go; } elsif ( $chaine =~ /e\x{0301}/o ){ $chaine =~ s/e\x{0301}/(?:é|e\x{0301})/go; } return $chaine; } sub intruder { my ($regex, @words) = @_; for ( my $nb = 10 ; $nb > 0 ; $nb -- ) { my @tmp = @words; splice(@tmp, $nb, 0, '(.+?)'); my $tmp = join("\\W+", map {$_ eq 'amp' ? 'amp(?:\W+amp)?' : $_;} @tmp); $tmp = ligature($tmp); my @results = $original =~ /$tmp\W*$regex/g; if ( $#results == 0 ) { my $intruder = $1; my $nbWords =()= $intruder =~ /\p{IsAlnum}+/go; return ($nb, $intruder) if $nbWords < 10; } elsif ( $#results < 0 ) { @results = $simpler =~ /$tmp\W*$regex/g; if ( $#results == 0 ) { my $intruder = $1; my $nbWords =()= $intruder =~ /\p{IsAlnum}+/go; return ($nb, $intruder) if $nbWords < 10; } } } for ( my $nb = 9 ; $nb > 0 ; $nb -- ) { my @tmp = @words; $tmp[$nb] = '(.+?)'; my $tmp = join("\\W+", map {$_ eq 'amp' ? 'amp(?:\W+amp)?' : $_;} @tmp); $tmp = ligature($tmp); my @results = $original =~ /$tmp\W*$regex/g; if ( $#results == 0 ) { my $intruder = $1; my $nbWords =()= $intruder =~ /\p{IsAlnum}+/go; return (- $nb, $intruder) if $nbWords < 10; } elsif ( $#results < 0 ) { @results = $simpler =~ /$tmp\W*$regex/g; if ( $#results == 0 ) { my $intruder = $1; my $nbWords =()= $intruder =~ /\p{IsAlnum}+/go; return (- $nb, $intruder) if $nbWords < 10; } } } return 0; } sub occurrence { my $regex = shift; my @match = (); my @pos = (); my @raw = $raw =~ /$regex/g; my $num = $#raw + 1; while ( $original =~ /$regex/g ) { push(@match, $&); push(@pos, pos($original)); } if ( $pos[$num] ) { offset2($match[$num], $pos[$num]); return 1; } return 0; }