diff --git a/alignement/README.md b/alignement/README.md new file mode 100644 index 0000000..1ab1234 --- /dev/null +++ b/alignement/README.md @@ -0,0 +1,55 @@ +Alignement Pascal-Francis / Istex +=============== + +Programmes d’alignement des notices bibliographiques Inist avec les documents Istex + +Ce programme permet d’aligner les bases bibliographiques Pascal et Francis de l’**Inist** avec la base **Istex**, c’est-à-dire retrouver dans la base Istex les documents correspondants aux notices bibliographiques des base Pascal ou Francis. + +À noter que si ce programme est utilisé à l'Inist, il est recommandé d’éviter de passer par le proxy. Pour cela, il faut supprimer les variables globales le définissant, ce qui se fait avec la commande `unset http_proxy https_proxy no_proxy`. Vérifiez également avec la commande `env` si ces mêmes variables n'existent pas en majuscule. Auquel cas, supprimez les avec la commande`unset HTTP_PROXY HTTPS_PROXY NO_PROXY`. + +### Prérequis + +Le programme `matchStan2Istex.pl` fonctionne sous Unix/Linux ainsi qu’avec Cygwin sous Windows. Il utilise plusieurs modules qui ne sont pas tous dans la distribution standard de **Perl**. Les modules qu’on peut être amené à installer sont : + - HTML::Entities + - HTTP::CookieJar::LWP + - JSON + - LWP::userAgent + - Number::Convert::Roman + - Text::Unidecode + - URI::Encode + + +### Usage +```txt + matchStan2Istex.pl -f (fichier|-) [ -d ] [ -v nombre ] [ -r id ] + [ -n notices ] [ -c corpus[,corpus]* ]* + matchStan2Istex.pl -h fichier_HFD [ -d ] [ -v nombre ] [ -r id ] + [ -n notices ] [ -c corpus[,corpus]* ]* + matchStan2Istex.pl -i +``` + + +### Options +``` + -c indique le nom du ou des corpus de la base Istex à interroger (on peut soit répéter + cette option, soit mettre tous les noms de corpus séparés par des virgules, mais sans + espace) + -d active le mode “débogage” + -f indique le nom du fichier d’entrée (qui peut être un fichier compressé avec “gzip” ou + “bzip2”). Pour utiliser l’entrée standard, mettre un tiret “-” + comme argument + -h indique le nom du fichier HFD servant d’entrée au programme + -i affiche cette aide. + -n indique le nom du fichier contenant les notices Pascal ou Francis modifiées parce + que les notices originales provoquaient une erreur de syntaxe dans la requête à l’API + ISTEX (N.B. : l’absence du fichier indiqué n’entraîne pas l’arrêt du programme) + -r indique le numéro de la dernière notice Pascal ou Francis traitée précédemment, pour + permettre la reprise de l’alignement si celui-ci a été prématurément arrêté + -v permet de suivre la progression du traitement en affichant sur la sortie erreur l’heure + de début, l’heure de fin et l’heure chaque fois qu’un lot de notices, correspondant au + nombre de notices donné en argument, a été traité +``` + + +### Formats d’entrée et de sortie + diff --git a/alignement/matchStan2Istex.pl b/alignement/matchStan2Istex.pl new file mode 100755 index 0000000..6b43b4f --- /dev/null +++ b/alignement/matchStan2Istex.pl @@ -0,0 +1,3082 @@ +#!/usr/bin/perl + + +# Déclaration des pragmas +use strict; +use utf8; +use open qw/:std :utf8/; +## use warnings::unused; +## use warnings 'once'; + +# Appel des modules externes de base +use Encode; +use Getopt::Long; + +# Appel des modules spécifiques à l'application +use HTML::Entities qw(decode_entities %entity2char); +use HTTP::CookieJar::LWP; +use JSON; +use LWP::UserAgent; +use Number::Convert::Roman; +use Text::Unidecode; +use URI::Encode qw(uri_encode uri_decode); + +my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; +my $substitut = " " x length($programme); +my $version = "15.3.1"; +my $dateModif = "4 Octobre 2021"; + +my $usage = "Usage : \n" . + " $programme -f (fichier|-) [ -d ] [ -v nombre ] [ -r id ] \n" . + " $substitut [ -n notices ] [ -c corpus[,corpus]* ]* \n" . + " $programme -h fichier_HFD [ -d ] [ -v nombre ] [ -r id ] \n" . + " $substitut [ -n notices ] [ -c corpus[,corpus]* ]* \n" . + " $programme -i \n"; + +my $debug = undef; +my $fichier = undef; +my $hfd = undef; +my $info = undef; +my $notices = undef; +my $reprise = undef; +my $verbeux = undef; +my @corpus = (); + +eval { + $SIG{__WARN__} = sub {usage(1);}; + GetOptions( + "corpus=s" => \@corpus, + "notices=s" => \$notices, + "info" => \$info, + "debug" => \$debug, + "fichier=s" => \$fichier, + "hfd=s" => \$hfd, + "reprise=s" => \$reprise, + "verbeux=i" => \$verbeux, + ); + }; +$SIG{__WARN__} = sub {warn $_[0];}; + +if ( $info ) { + print "Programme : \n"; + print " “$programme”, version $version ($dateModif)\n"; + print " Permet de chercher les correspondances entre les notices bibliographiques des bases Pascal \n"; + print " et Francis de l’INIST et les documents de la base ISTEX. \n"; + print "\n"; + print $usage; + print "\nOptions : \n"; + print " -c indique le nom du ou des corpus de la base ISTEX à interroger (on peut soit répéter \n"; + print " cette option, soit mettre tous les noms de corpus séparés par des virgules, mais sans \n"; + print " espace) \n"; + print " -d active le mode “débogage” \n"; + print " -f indique le nom du fichier d’entrée (qui peut être un fichier compressé avec “gzip” ou \n"; + print " “bzip2”). Pour utiliser l’entrée standard, mettre un tiret “-” \n"; + print " comme argument \n"; + print " -h indique le nom du fichier HFD servant d’entrée au programme \n"; + print " -i affiche cette aide. \n"; + print " -n indique le nom du fichier contenant les notices Pascal ou Francis modifiées parce \n"; + print " que les notices originales provoquaient une erreur de syntaxe dans la requête à l’API \n"; + print " ISTEX (N.B. : l’absence du fichier indiqué n’entraîne pas l’arrêt du programme) \n"; + print " -r indique le numéro de la dernière notice Pascal ou Francis traitée précédemment, pour \n"; + print " permettre la reprise de l’alignement si celui-ci a été prématurément arrêté \n"; + print " -v permet de suivre la progression du traitement en affichant sur la sortie erreur l’heure \n"; + print " de début, l’heure de fin et l’heure chaque fois qu’un lot de notices, correspondant au \n"; + print " nombre de notices donné en argument, a été traité \n"; + print " \n"; + print "N.B. : pour ne pas passer par le proxy sur le réseau interne de l’INIST, il faut effacer les \n"; + print " variables globales du proxy par la commande “unset http_proxy https_proxy no_proxy” \n"; + print " avant de lancer le programme “$programme”. \n"; + print " Également, l’absence du fichier donné par l’option “-n” n’entraîne pas l’arrêt du \n"; + print " programme, seulement un message d’erreur. \n"; + print " \n"; + + exit 0; + } + +usage(2) if not $fichier and not $hfd; +usage(2) if $fichier and $hfd; + +if ( @corpus ) { + @corpus = split(/,/, join(",", @corpus)); + } + +if ( $verbeux ) { + if ( $verbeux < 0 ) { + print STDERR "\n$programme : l’option “-v” doit être un entier positif !\n"; + exit(3); + } + } + +# Complétion de la table des entitées HTML +while() { + next if /^\s*$/o or /^#/o; + chomp; + my ($num, $sgml) = split(/\t+/); + next if $entity2char{$sgml}; + $entity2char{$sgml} = chr($num); + } +close DATA; + +# Paramètres de l'API ISTEX +my $base = "https://api.istex.fr"; +my $url = "$base/document/?q="; +my $out = "output=author,copyrightDate,doi,host,pii,pmid,publicationDate,serie,title"; +my $size = 100; +my $echec = 0; + +# Initialisation de l'agent +my $agent = LWP::UserAgent->new( + cookie_jar => HTTP::CookieJar::LWP->new, + ); +$agent->agent("$programme/$version"); +$agent->default_header("Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); +$agent->default_header("Accept-Language" => "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3"); +$agent->default_header("Accept-Encoding" => "gzip, deflate"); +$agent->default_header("Connection" => "keep-alive"); + +# Allongement du temps d'attente +$agent->timeout(300); +$agent->env_proxy; + +# Déclaration des variables concernant la notice à chercher +my $date = ""; +my $fascicule = ""; +my $isbn = ""; +my $issn = ""; +my $journal = ""; +my $label = ""; +my $livre = ""; +my $nivbib = ""; # Niveau bibliographique (A, M ou C); +my $nom1 = ""; # Nom du premier auteur +my $num = ""; +my $pagedebut = ""; +my $pagefin = ""; +my $prenom1 = ""; # Prénom du premier auteur +my $titre = ""; +my $volume = ""; +my @autres = (); # Auteurs (autres que le premier auteur) +my @isbns = (); +my @issns = (); +my @journaux = (); +my @langues = (); +my @livres = (); +my @pages = (); +my @titres = (); +my %modifs = (); +my %notice = (); +my %pages = (); +my %pb = (); +my %rdvfp = (); +my %source = (); + +# Déclaration d'un hachage général pour éviter de répéter les +# messages plusieurs fois pour la même notice +my @alertes = (); +my %alerte = (); + +# Nombre de notices traitées +my $notice = 0; + +# Initialisation du convertisseur de chiffres romains +my $convert = Number::Convert::Roman->new; + +# Sortie sans “buffer” +select STDERR; +$| = 1; +select STDOUT; +$| = 1; + +# Récupération des notices alternatives du fichier indiqué +# par l’option “- a” +if ( $notices ) { + if ( -f $notices ) { + open(INP, "<:raw", $notices) or die "$!,"; + while() { + next if /^\s*$/o or /^#/o; + ($num) = m|(.+?)|o; + $notice{$num} = $_; + } + close INP; + if ( $verbeux ) { + print STDERR "*** Fichier de notices corrigées : $notices *** \n"; + } + } + elsif ( $verbeux ) { + print STDERR "*** Fichier de notices corrigées “$notices” absent *** \n"; + } + } + +if ( defined $fichier ) { + if ( $fichier eq '-' ) { + open(INP, "<&STDIN") or die "$!,"; + binmode(INP, ":raw"); + } + elsif ( $fichier =~ /\.gz\z/o ) { + open(INP, "gzip -cd $fichier |") or die "$!,"; + binmode(INP, ":raw"); + } + elsif ( $fichier =~ /\.bz2\z/o ) { + open(INP, "bzip2 -cd $fichier |") or die "$!,"; + binmode(INP, ":raw"); + } + else { + open(INP, "<:raw", $fichier) or die "$!,"; + } + } +elsif ( defined $hfd ) { + open(INP, "IhfdCat $hfd |") or die "$!,"; + binmode(INP, ":raw"); + } + +if ( $verbeux ) { + print STDERR "==> $programme, version $version ($dateModif) <== \n"; + if ( $reprise ) { + print STDERR "*** Reprise du traitemment après la notice \"$reprise\" *** \n"; + } + else { + if ( defined $hfd ) { + print STDERR "*** Notices traitées : $hfd *** \n"; + } + elsif ( defined $fichier ) { + if ( $fichier eq '-' ) { + print STDERR "*** Notices traitées : entrée standard *** \n"; + } + else { + print STDERR "*** Notices traitées : $fichier *** \n"; + } + } + my $moment = date(); + print STDERR " -> $moment : début\n"; + } + } + +if ( defined $reprise ) { + while() { + next if /^\s*$/o; + # Recherche du numero de notice. + ($num) = m|(.+?)|o; + $notice ++; + if ( $num eq $reprise ) { + if ( $verbeux ) { + my $moment = date(); + print STDERR " -> $moment : reprise = $notice notices\n"; + } + last; + } + } + } + +# Modification de l’URL de base si la requête est restreinte à un +# (ou plusieurs) corpus plutôt qu’à l’ensemble de la base ISTEX +if ( @corpus ) { + if ( $#corpus == 0 ) { + $url .= "corpusName:$corpus[0]+AND+" + } + else { + $url .= "corpusName:(" . join("+", @corpus) . ")+AND+" + } + } + +while() { + next if /^\s*$/o; + next if /^#/o; + + # Message pour chaque "$verbeux" notices traitées + $notice ++; + if ( $verbeux and $notice % $verbeux == 0 ) { + my $moment = date(); + print STDERR " -> $moment : $notice notices\n"; + } + + # Remise à zéro des alertes + %alerte = (); + + # Recherche du niveau bibbliographique. + ($nivbib) = m|([AMC])|o; + + # Recherche du numero de notice. + ($num) = m|(.+?)|o; + if ( not $num ) { + print STDERR "pas de numéro INIST pour la notice #$notice\n"; + next; + } + + + if ( $notice{$num} ) { + $_ = $notice{$num}; + } + + # Prétraitement de la notice avant découpe + chomp; + s/\r//o; + s|||o; + # Problème de l'entité "&dquot;" et des trémas + s/&dquot;/"/go; + s/&(\w)\.die;/&${1}uml;/go; + s/&Lstok;/Ł/go; + + # Découpe des champs. + my @champs = split(/(.+?)|o; + push(@langues, $langue); + } + + # Recherche du titre en fonction du niveau bib. + $titre = ""; + if ( $nivbib eq "A" ) { + my @titres = grep(/^A[06]8 /, @champs); + ($titre) = $titres[0] =~ m|(.+?)|o; + } + elsif ( $nivbib eq "M" ) { + my @titres = grep(/^A[06]9 /, @champs); + ($titre) = $titres[0] =~ m|(.+?)|o; + } + elsif ( $nivbib eq "C" ) { + my @titres = grep(/^A[17]0 /, @champs); + ($titre) = $titres[0] =~ m|(.+?)|o; + } + else { + print STDERR "Attention : pas de titre pour notice n° $num, ligne $.\n"; + } + # On pense à mettre le titre INIST en UTF-8, mais une seul fois + $titre = decode_entities($titre); + + # Recherche du titre de journal. + my @tmp = (); + my %tmp = (); + $journal = ""; + @journaux = grep(/^A64 /, @champs); + if ( $#journaux == -1 ) { + @journaux = grep(/^A03 /, @champs); + } + foreach my $item (@journaux) { + ($journal) = $item =~ m|(.+?)|o; + push(@tmp, $journal); + } + @journaux = grep(/./, @tmp); + foreach my $item (@journaux) { + if ( $item =~ /^([A-Z]{3,})\. (\p{isUpper}(\p{isAlpha}|&\w+;)+.*)/o ) { + push(@journaux, $1); + push(@journaux, $2); + } + $item =~ s/ : .*//o; + $item =~ s/^\s*\(([^)]+)\) +/$1 /o; + $item =~ s/^(.+) +\(([^)]+)\)\s*\z/$2 $1/o; + if ( $tmp{$item} ) { + $item = ""; + } + else { + $tmp{$item} ++; + } + } + @journaux = grep(/./, @journaux); + if ( $#journaux >= 0 ) { + $journal = $journaux[0]; + } + + # Recherche du titre du livre. + %tmp = (); + $livre = ""; + @livres = grep(/^A09 /, @champs); + foreach my $item (@livres) { + ($livre) = $item =~ m|(.+?)|o; + if ( $tmp{$livre} ) { + $item = ""; + } + else { + $item = $livre; + $tmp{$livre} ++; + } + } + @livres = grep(/./, @livres); + if ( $#livres >= 0 ) { + $livre = $livres[0]; + } + + # Recherche de l'ISSN. + %tmp = (); + $issn = ""; + @issns = grep(/^A01 /, @champs); + foreach my $item (@issns) { + ($issn) = $item =~ m|(.+?)|o; + if ( $issn =~ /^(\d\d\d\d)[^-](\d\d\d[0-9Xx])\z/o ) { + $issn = "$1-$2"; + } + elsif ( $issn =~ /^(\d\d\d\d)/(\d\d\d[0-9Xx])\z/o ) { + $issn = "$1-$2"; + } + if ( $tmp{$issn} ) { + $item = ""; + } + else { + $item = $issn; + $tmp{$issn} ++; + } + } + @issns = grep(/./, @issns); + if ( $#issns >= 0 ) { + $issn = $issns[0]; + } + + # Recherche de l'ISBN. + %tmp = (); + $isbn = ""; + @isbns = grep(/^A26 /, @champs); + foreach my $item (@isbns) { + ($isbn) = $item =~ m|(.+?)|o; + if ( $tmp{$isbn} ) { + $item = ""; + } + else { + $item = $isbn; + $tmp{$isbn} ++; + } + } + @isbns = grep(/./, @isbns); + if ( $#isbns >= 0 ) { + $isbn = $isbns[0]; + } + + # Recherche de la date de publication + if ( m|(.+?)|o ) { + my $tmp = $1; + if ( $tmp =~ /^[12]\d\d\d/ ) { + $date = substr($tmp, 0, 4); + } + else { + $date = undef; + } + } + + # Recherche du numéro de volume + $volume = ""; + if ( m|(.+?)|o ) { + $volume = $1; + $volume =~ s/^\s+//o; + $volume =~ s/\s+\z//o; + $volume =~ s|/|/|go; + if ( $volume =~ m|^(\d+)[-/](\d+)\z|o ) { + my $nb1 = $1; + my $nb2 = $2; + if ( $nb1 > $nb2 and length($nb1) > length($nb2) ) { + my $diff = length($nb1) - length($nb2); + $nb2 = substr($nb1, 0, $diff) . $nb2; + $volume = "$nb1-$nb2"; + } + } + } + + # Recherche du numéro de fascicule + $fascicule = ""; + if ( m|(.+?).*|o ) { + $fascicule = $1; + $fascicule =~ s/^\s+//o; + $fascicule =~ s/\s+\z//o; + $fascicule =~ s/&(horbar|mdash|minus);/-/go; + $fascicule =~ s/ *, */-/go; + $fascicule =~ s/-\s*\z//o; + $fascicule =~ s|/|/|go; + } + + # Recherche des numéros de page + @pages = (); + ($pagedebut, $pagefin) = ("", ""); + if ( m|(.+?)|o ) { + my $champ = $1; + my $pages = ""; + if ( $champ =~ m|(.+?)|o ) { + $pages = $1; + } + elsif ( $champ =~ m|(.+?)|o ) { + $pages = $1; + } + if ( $pages !~ /^\s*\z/o ) { + $pages =~ s/•/./go; + $pages =~ s/&(horbar|mdash|minus);/-/go; + $pages =~ s/ / /go; + $pages =~ s/^ +//o; + $pages =~ s|/|/|go; + if ( $pages =~ /^(&\w+gr; )?vol\. *\S+?, */io or + $pages =~ /^(&\w+gr; )?vol \S+?, */io or + $pages =~ /^(&\w+gr; )?vol\d+, */io or + $pages =~ /^&\w+gr;,? */o ) { + $pages = $'; + } + if ( $pages =~ / *[.+?(].*)?.*\z/io or + $pages =~ / +\d+ p\.].*/io ) { + $pages = $`; + } + if ( $pages =~ / *\(.+?\).*\z/io ) { + $pages = $`; + } + if ( $pages =~ /^([0ivx][ivx]*(-[ivx]+)?,? *([ivx]+(-[ivx]+)?,? *)*) (.+)\z/io ) { + $pages = $5; + } + if ( $pages =~ /^ *(.+?)(, *[ivx]+(-[ivx]+)?((, *| +)[ivx]+(-[ivx]+)?)*,? *)\z/io ) { + my $tmp = $1; + if ( $pages !~ /^ *[ivx]+(-[ivx]+)?((, *| +)[ivx]+(-[ivx]+)?)*,? *\z/io ) { + $pages = $tmp; + } + } + if ( $pages =~ /^s\.p\.\s*\z/io ) { + $pagedebut = undef; + } + elsif ( $pages =~ /^(\d+)-(\d+) +p\./io ) { + $pagedebut = $1; + $pagefin = $2; + } + elsif ( $pages =~ /^\S+ +p\./io ) { + $pagedebut = undef; + } + elsif ( $pages =~ /^p\. *(\S+)\z/io ) { + $pagedebut = $1; + } + elsif ( $pages =~ /^(\S+-\S+)-(\S+-\S+)\z/o ) { + $pagedebut = $1; + $pagefin = $2; + } + elsif ( $pages =~ /^(\S+?)(-\S+)?,( *(\S+-)?(\S+),)* *(\S+-)?(\S+) *\z/o ) { + $pages[0] = { 'debut' => $1, 'fin' => $7 }; + my @items = split(/, */, $pages); + while(my $item = shift @items) { + if ( $item =~ /^(\S+?)-(\S+) *\z/o ) { + push(@pages, {'debut' => $1, 'fin' => $2,}); + } + else { + if ( @items ) { + push(@pages, {'debut' => $item, 'fin' => "",}); + } + else { + push(@pages, {'debut' => "", 'fin' => $item}); + } + } + } + } + elsif ( $pages =~ /^(\S+?)-(\S+) *\z/o ) { + $pagedebut = $1; + $pagefin = $2; + } + elsif ( $pages =~ /^(\S+) *\z/o ) { + $pagedebut = $1; + } + } + } + + # Recherche du premier auteur + $nom1 = ""; + $prenom1 = ""; + my @auteurs = (); + if ( $nivbib eq "A" ) { + @auteurs = grep(!//, grep(/^A11 /, @champs)); + if ( $#auteurs < 0 ) { + @auteurs = grep(/^A11 /, @champs); + } + } + elsif ( $nivbib eq "M" ) { + @auteurs = grep(!//, grep(/^A12 /, @champs)); + if ( $#auteurs < 0 ) { + @auteurs = grep(/^A12 /, @champs); + } + } + elsif ( $nivbib eq "C" ) { + @auteurs = grep(!//, grep(/^A13 /, @champs)); + if ( $#auteurs < 0 ) { + @auteurs = grep(/^A13 /, @champs); + } + } + if ( $auteurs[0] ) { + my ($auteur) = $auteurs[0] =~ m|(.+?)|o; + if ( $auteur =~ /^(.+?) *\((.+?)\) *\z/ ) { + $nom1 = $1; + $prenom1 = $2; + } + else { + $nom1 = $auteur; + } + shift @auteurs; + } + + # Autres auteurs + @autres = (); + foreach my $suivant (@auteurs) { + my ($auteur) = $suivant =~ m|(.+?)|o; + if ( $auteur =~ /^(.+?) *\((.+?)\) *\z/ ) { + push(@autres, "$1, $2"); + } + else { + push(@autres, $auteur); + } + } + my $autres = join("|", @autres); + + # Recherche ISTEX + # Cas n° 1 : article de revue + if ( $nivbib eq "A" and ( $journal or $issn ) ) { + if ( @pages ) { + my $max = 0.0; + my %alt = (); + $label = "\nURI"; + foreach my $item (@pages) { + $pagedebut = $item->{'debut'}; + $pagefin = $item->{'fin'}; + my ($score, $id, $ark, $doi, $pii, $pmid, @pb) = traite(); + $label = "ETC"; + alerte(); + if ( $id ) { + $score *= 5.0; + my $nb = int($score); + my $note = "*" x $nb; + if ( $score - $nb > 0.5 ) { + $note .= "+"; + } + $note = "." if not $note; + $note .= "\x{A0}" x (5 - length($note)); + $score = sprintf("%.3f", $score); + if ( $score < 3.490 and $rdvfp{$id} ) { + $score .= $rdvfp{$id}; + } + if ( $score > $max or + ( $score =~ /^\d\.\d+\!\z/o and not + $max =~ /^\d\.\d+\!\z/o ) ) { + $max = $score; + } + if ( $alt{$id} ) { + if ( $alt{$id}{'score'} < $score or + ( $score =~ /^\d\.\d+\!\z/o and not + $alt{$id}{'score'} =~ /^\d\.\d+\!\z/o ) ) { + $alt{$id}{'score'} = $score; + $alt{$id}{'note'} = $note; + $alt{$id}{'debut'} = $pagedebut; + $alt{$id}{'fin'} = $pagefin; + $alt{$id}{'ark'} = $ark; + $alt{$id}{'doi'} = $doi; + $alt{$id}{'pii'} = $pii; + $alt{$id}{'pmid'} = $pmid; + } + } + else { + $alt{$id}{'score'} = $score; + $alt{$id}{'note'} = $note; + $alt{$id}{'debut'} = $pagedebut; + $alt{$id}{'fin'} = $pagefin; + $alt{$id}{'ark'} = $ark; + $alt{$id}{'doi'} = $doi; + $alt{$id}{'pii'} = $pii; + $alt{$id}{'pmid'} = $pmid; + } + } + else { + $alt{-1}{'score'} = 0.000; + $alt{-1}{'note'} = 0.000; + $alt{-1}{'debut'} = $pages[0]->{'debut'}; + $alt{-1}{'fin'} = $pages[0]->{'fin'}; + if ( $pb[0] == -1 ) { + $alt{-1}{'blanc'} = "\n"; + } + else { + $alt{-1}{'blanc'} = ""; + } + $alt{-1}{'ark'} = undef; + $alt{-1}{'doi'} = undef; + $alt{-1}{'pii'} = undef; + $alt{-1}{'pmid'} = undef; + } + } + foreach my $id (keys %alt) { + if ( $id eq "-1" and $max == 0.0 ) { + print "$alt{-1}{'blanc'}"; + print "0\t0.000\t$nivbib\t$num\t$titre\t$journal\t$livre\t$issn\t$isbn\t$date"; + print "\t$volume\t$fascicule\t$pagedebut\t$pagefin\t$nom1\t$prenom1\t$autres\t\t\t\t\t\n"; + delete $alt{-1}; + last; + } + elsif ( $alt{$id}{'score'} eq $max ) { + print "$alt{$id}{'note'}\t$alt{$id}{'score'}\t$nivbib\t$num\t$titre\t$journal"; + print "\t$livre\t$issn\t$isbn\t$date\t$volume\t$fascicule\t$pages[0]->{'debut'}"; + print "\t$pages[0]->{'fin'}\t$nom1\t$prenom1\t$autres\t$id\t$alt{$id}{'ark'}"; + print "\t$alt{$id}{'doi'}\t$alt{$id}{'pii'}\t$alt{$id}{'pmid'}\n"; + delete $alt{$id}; + last; + } + } + foreach my $id (keys %alt) { + if ( $alt{$id}{'score'} > 3.490 or $alt{$id}{'score'} =~ /^\d\.\d+\!\z/o ) { + print " --> \t$alt{$id}{'debut'}\t$alt{$id}{'fin'}\t$alt{$id}{'score'}\t$id"; + print "\t$alt{$id}{'ark'}\t$alt{$id}{'doi'}\t$alt{$id}{'pii'}\t$alt{$id}{'pmid'}\n"; + } + } + } + else { + $label = "\nURI"; + my ($score, $id, $ark, $doi, $pii, $pmid, @pb) = traite(); + alerte(); + if ( $id ) { + $score *= 5.0; + my $nb = int($score); + my $note = "*" x $nb; + if ( $score - $nb > 0.5 ) { + $note .= "+"; + } + $note = "." if not $note; + $note .= "\x{A0}" x (5 - length($note)); + $score = sprintf("%.3f", $score); + if ( $score < 3.490 and $rdvfp{$id} ) { + $score .= $rdvfp{$id}; + } + print "$note\t$score\t$nivbib\t$num\t$titre\t$journal\t$livre\t$issn\t$isbn\t$date"; + print "\t$volume\t$fascicule\t$pagedebut\t$pagefin\t$nom1\t$prenom1\t$autres\t$id"; + print "\t$ark\t$doi\t$pii\t$pmid\n"; + if ( @pb ) { + foreach my $pb (sort {$a <=> $b} @pb) { + print " ~~> \t$pb\n"; + } + } + } + else { + print "\n" if $pb[0] == -1; + print "0\t0.000\t$nivbib\t$num\t$titre\t$journal\t$livre\t$issn\t$isbn\t$date"; + print "\t$volume\t$fascicule\t$pagedebut\t$pagefin\t$nom1\t$prenom1\t$autres\t\t\t\t\t\n"; + } + } + } + else { + print "\n_\t-\t$nivbib\t$num\t$titre\t$journal\t$livre\t$issn\t$isbn\t$date"; + print "\t$volume\t$fascicule\t$pagedebut\t$pagefin\t$nom1\t$prenom1\t$autres\t\t\t\t\t\n"; + } + } + +if ( $verbeux ) { + my $moment = date(); + print STDERR " -> $moment : fin = $notice notices\n"; + } + + +exit 0; + + +sub usage +{ +print STDERR "\n$usage\n"; + +exit shift; +} + +sub date +{ +my @time = localtime(); + +my $jour = (qw(Dimanche Lundi Mardi Mercredi Jeudi Vendredi Samedi))[$time[6]]; +my $mois = (qw(Janvier Février Mars Avril Mai juin Juillet Août Septembre Octobre Novembre Décembre))[$time[4]]; +my $annee = $time[5] + 1900; + +my $moment = "$jour $time[3] $mois $annee "; +$moment .= sprintf("%02d:%02d:%02D", $time[2], $time[1], $time[0]); + +return $moment; +} + +sub alerte +{ +my $message = shift; + +if ( not $message ) { + print @alertes; + @alertes = (); + %alerte = (); + } +elsif ( not $alerte{$message} ) { + push(@alertes, "\t => $message !\n"); + $alerte{$message} ++; + } +} + +sub max +{ +my ($num1, $num2) = @_; + +if ( $num1 > $num2 ) { + return $num1; + } + +return $num2; +} + +sub get_hits +{ +my $query = shift; +$query =~ s/ /+/go; + +my $uri = "$url$query&$out&size=$size"; +$uri .= "&sid=tdm-alignement-pf"; + +my ($code, $json) = mon_get("$uri"); + +my $perl = undef; +my $suivant = undef; +my $total = undef; +my @hits = (); + +if ( defined $json ) { + eval { + $perl = decode_json $json; + }; + if ( $@ ) { + alerte("ERREUR CONVERSION JSON -> PERL"); + return(undef); + } + my %top = %{$perl}; + if ( $top{'_error'} ) { + alerte("ERREUR \"$top{'_error'}\""); + return(undef); + } + $total = $top{'total'}; + if ( $total > 0 ) { + if ( $total > 10000 ) { + alerte("RÉPONSE SUPÉRIEURE À 10 000"); + } + if ( defined $top{'nextPageURI'} ) { + $suivant = $top{'nextPageURI'}; + } + push(@hits, @{$top{'hits'}}); + + while($suivant) { + ($code, $json) = mon_get("$suivant"); + eval { + $perl = decode_json $json; + }; + if ( $@ ) { + alerte("ERREUR CONVERSION JSON -> PERL"); + return(undef); + } + %top = %{$perl}; + if ( defined $top{'nextPageURI'} ) { + $suivant = $top{'nextPageURI'}; + } + else { + $suivant = undef; + } + push(@hits, @{$top{'hits'}}); + } + } + + %top = ("total" => $total, "hits" => \@hits); + return \%top; + } +else { + return undef; + } +} + +sub get_total +{ +my $query = shift; +$query =~ s/ /+/go; + +my $uri = "$url$query&$out&size=0"; +$uri .= "&sid=tdm-alignement-pf"; + +my ($code, $json) = mon_get("$uri"); + +my $perl = undef; +my $total = undef; + +if ( defined $json ) { + eval { + $perl = decode_json $json; + }; + if ( $@ ) { + alerte("ERREUR CONVERSION JSON -> PERL"); + if ( $json =~ /\"total\": (d+),/o ) { + return($1); + } + else { + return(undef); + } + } + my %top = %{$perl}; + if ( $top{'_error'} ) { + alerte("ERREUR \"$top{'_error'}\""); + return(undef); + } + return($top{'total'}); + } +} + +sub mon_get +{ +my $cible = shift; + +my $requete = HTTP::Request->new(GET => "$cible"); + +my $reponse = $agent->request($requete); +my $code = $reponse->code; + +# Vérification de la réponse +if ($reponse->is_success) { + + + return ($code, $reponse->decoded_content); + } +else { + my $echec = 0; + my $message = $reponse->status_line; + if ( $message =~ /\b(read timeout|Proxy Error)\b/o and $echec < 10 ) { + $echec ++; + alerte("INTERRUPTION #$echec"); + sleep 2; + return mon_get($cible); + } + else { + alerte("ERREUR $code \"$message\""); + return($code, undef); + } + } +} + +sub traite +{ +my $id = undef; +my $ark = undef; +my $doi = undef; +my $pii = undef; +my $pmid = undef; +my $score = 0; + +my $query = ""; +my $altq = 0; +my @attendus = (); +my @mac = (); +my @tmp = (); +my %ok = (); + +%alerte = (); +%pages = (); +%pb = (); +%rdvfp = (); + +foreach my $item (@journaux) { + push(@tmp, "host.title:\"".propre($item)."\""); + } +foreach my $item (@issns) { + push(@tmp, "host.issn:\"$item\""); + push(@tmp, "host.eissn:\"$item\""); + push(@tmp, "serie.issn:\"$item\""); + push(@tmp, "serie.eissn:\"$item\""); + } +if ( @tmp ) { + if ( $#tmp == 0 ) { + $query .= $tmp[0]; + } + elsif ( $#tmp > 0 ) { + $query .= "(" . join(" OR ", @tmp) . ")"; + } + if ( defined $source{$query} ) { + if ( $source{$query} == 0 ) { + return(0, undef, undef, undef, undef, undef, -1); + } + } + else { + my $tmp = undef; + while( not defined $tmp ) { + $tmp = get_total($query); + if ( defined $tmp ) { + $source{$query} = $tmp; + if ( $tmp == 0 ) { + return(0, undef, undef, undef, undef, undef, -1); + } + } + } + } + } +else { + alerte("PAS DE REVUE OU ISSN"); + return(0, undef, undef, undef, undef, undef, -1); + } + +my %query = (); +$query{'journal'} = $query; + +if ( $date ) { + $query{'date'} = "publicationDate:$date OR copyrightDate:$date"; + $query{'date'} .= " OR host.publicationDate:$date OR host.copyrightDate:$date"; + $query{'date'} .= " OR serie.publicationDate:$date OR serie.copyrightDate:$date"; + } +if ( $volume ) { + if ( $volume =~ /^\d+\z/o ) { + $query{'volume'} = "host.volume:$volume"; + } + elsif ( $volume =~ m|^(\d+)[-/](\d+)\z|o ) { + my $inf = $1; + my $sup = $2; + $query{'volume'} = "host.volume.raw:\"$volume\""; + if ( $inf < $sup ) { + $query{'volume'} .= " OR host.volume:(".join(" ", ($inf .. $sup)).")"; + } + } + elsif ( $volume =~ /^(\d+)[A-Za-z]+\z/o ) { + $query{'volume'} = "host.volume.raw:\"$volume\" OR host.volume:$1"; + } + elsif ( $volume =~ /^[A-Za-z]+(\d+)\z/o ) { + $query{'volume'} = "host.volume.raw:\"$volume\" OR host.volume:$1"; + } + else { + my $tmp = propre($volume); + $query{'volume'} = "host.volume.raw:\"$tmp\""; + alerte("VOLUME \"$tmp\""); + } + } +if ( $fascicule ) { + if ( $fascicule =~ /^\d+\z/o ) { + $query{'fascicule'} = "host.issue:$fascicule"; + } + elsif ( $fascicule =~ m|^(\d+)[-/,](\d+)\z|o ) { + my $inf = $1; + my $sup = $2; + $query{'fascicule'} = "host.issue.raw:\"$fascicule\""; + if ( $inf < $sup ) { + $query{'fascicule'} .= " OR host.issue:(".join(" ", ($inf .. $sup)).")"; + } + } + elsif ( $fascicule =~ /^(\d+)([A-Za-z]+)\z/o ) { + $query{'fascicule'} = "host.issue.raw:(\"$1$2\" OR \"$2$1\") OR host.issue:$1"; + } + elsif ( $fascicule =~ /^([A-Za-z]+)(\d+)\z/o ) { + $query{'fascicule'} = "host.issue.raw:(\"$1$2\" OR \"$2$1\") OR host.issue:$2"; + } + elsif ( $fascicule =~ /^\d*[A-Za-z]+\d*[A-Za-z]*\z/o or + $fascicule =~ /^[A-Za-z]+-[A-Za-z]+\z/o ) { + $query{'fascicule'} = "host.issue.raw:\"$fascicule\""; + } + else { + my $tmp = propre($fascicule); + $query{'fascicule'} = "host.issue.raw:\"$tmp\""; + alerte("FASCICULE \"$tmp\""); + } + } + +if ( $nom1 ) { + my $auteurs = ""; + my $tmp = decode_entities($nom1); + my %tmp = ($tmp => 1); + my @tmp1 = (); + my @mots = $tmp =~ /(\p{IsAlpha}+)/go; + foreach my $mot (@mots) { + next if $mot =~ /^\p{IsAlpha}\z/o; + if ( $mot =~ /[^\0-\177]/ ) { + my $umot = unidecode($mot); + push(@tmp1, "\"".propre($mot)."\""); + push(@tmp1, "\"".propre($umot)."\""); + } + else { + push(@tmp1, "\"".propre($mot)."\""); + } + } + $auteurs = join(" OR ", @tmp1); + @tmp1 = (); + foreach my $item (@autres) { + my $nom = $item; + if ( $item =~ /^(.+?), / ) { + $nom = $1; + } + push @tmp1, decode_entities($nom); + } + my @tmp2 = grep {not $tmp{$_} ++;} @tmp1; + @tmp1 = ($auteurs); + foreach my $nom (@tmp2[0 .. 4]) { + next if not $nom; + next if $nom =~ /^\p{IsAlpha}\z/o; + if ( $nom =~ /[^\0-\177]/ ) { + $tmp = unidecode($nom); + push(@tmp1, "\"".propre($nom)."\" OR \"".propre($tmp)."\""); + } + else { + push(@tmp1, "\"".propre($nom)."\""); + } + } + $auteurs = join(" OR ", grep(/\S+/, @tmp1)); + if ( $auteurs ) { + $query{'auteur'} = "author.name:($auteurs)"; + } + } + +if ( $pagedebut ) { + @tmp = (); + my %tmp = (); + $pages{'first'}{$pagedebut} ++; + if ( $pagedebut =~ /^(.+)-\z/o ) { + $pagedebut = $1; + $pages{'first'}{$pagedebut} ++; + } + if ( $pagefin ) { + $pages{'last'}{$pagefin} ++; + if ( $pagedebut =~ /^\d+\z/o ) { + if ( $pagefin =~ /^\d+\z/o ) { + push(@{$tmp{'first'}}, "[$pagedebut TO $pagefin]"); + push(@{$tmp{'last'}}, "[$pagedebut TO $pagefin]"); + } + else { + push(@{$tmp{'first'}}, $pagedebut); + push(@{$tmp{'last.raw'}}, "\"$pagefin\""); + my $tmp = $pagefin; + if ( $fascicule and $pagefin =~ /^$fascicule[-.](.+)\z/ ) { + $tmp = $1; + $pages{'last'}{$tmp} ++; + } + elsif ( $pagefin =~ /^\w+[-.](.+)\z/ ) { + $tmp = $1; + $pages{'last'}{$tmp} ++; + } + if ( $tmp =~ /^\d+\z/o ) { + push(@{$tmp{'last'}}, $tmp); + $pages{'last'}{$pagefin} ++; + } + elsif ( $tmp =~ /^(\d+)([A-Za-z]+)\z/o ) { + push(@{$tmp{'last'}}, $1); + push(@{$tmp{'last.raw'}}, "\"$tmp\""); + push(@{$tmp{'last.raw'}}, "\"$2$1\""); + $pages{'last'}{"$1"} ++; + $pages{'last'}{"$2$1"} ++; + } + elsif ( $tmp =~ /^([A-Za-z]+)(\d+)\z/o ) { + push(@{$tmp{'last'}}, $2); + push(@{$tmp{'last.raw'}}, "\"$tmp\""); + push(@{$tmp{'last.raw'}}, "\"$2$1\""); + $pages{'last'}{"$2"} ++; + $pages{'last'}{"$2$1"} ++; + } + elsif ( $tmp ne $pagefin ) { + push(@{$tmp{'last.raw'}}, "\"$tmp\""); + } + } + } + else { + my $tmp = $pagedebut; + if ( $fascicule and $pagedebut =~ /^$fascicule[-.](.+)\z/ ) { + $tmp = $1; + # push(@{$tmp{'first.raw'}}, "\"$tmp\""); + } + elsif ( $pagedebut =~ /^\w+[-.](.+)\z/ ) { + $tmp = $1; + # push(@{$tmp{'first.raw'}}, "\"$tmp\""); + } + if ( $tmp =~ /^\d+\z/o ) { + push(@{$tmp{'first'}}, $tmp); + $pages{'first'}{$tmp} ++; + } + elsif ( $tmp =~ /^(\d+)([A-Za-z]+)\z/o ) { + push(@{$tmp{'first'}}, $1); + push(@{$tmp{'first.raw'}}, "\"$tmp\""); + push(@{$tmp{'first.raw'}}, "\"$2$1\""); + $pages{'first'}{"$1"} ++; + $pages{'first'}{"$2$1"} ++; + } + elsif ( $tmp =~ /^([A-Za-z]+)(\d+)\z/o ) { + push(@{$tmp{'first'}}, $2); + push(@{$tmp{'first.raw'}}, "\"$tmp\""); + push(@{$tmp{'first.raw'}}, "\"$2$1\""); + $pages{'first'}{"$2"} ++; + $pages{'first'}{"$2$1"} ++; + } + else { + push(@{$tmp{'first.raw'}}, "\"$tmp\""); + $pages{'first'}{$tmp} ++; + } + $tmp = $pagefin; + if ( $fascicule and $pagefin =~ /^$fascicule[-.](.+)\z/ ) { + $tmp = $1; + push(@{$tmp{'last.raw'}}, "\"$tmp\""); + } + elsif ( $pagefin =~ /^\w+[-.](.+)\z/ ) { + $tmp = $1; + push(@{$tmp{'last.raw'}}, "\"$tmp\""); + } + $pages{'last'}{$tmp} ++; + if ( $tmp =~ /^\d+\z/o ) { + push(@{$tmp{'last'}}, $tmp); + } + elsif ( $tmp =~ /^(\d+)([A-Za-z]+)\z/o ) { + push(@{$tmp{'last'}}, $1); + push(@{$tmp{'last.raw'}}, "\"$tmp\""); + push(@{$tmp{'last.raw'}}, "\"$2$1\""); + $pages{'last'}{"$1"} ++; + $pages{'last'}{"$2$1"} ++; + } + elsif ( $tmp =~ /^([A-Za-z]+)(\d+)\z/o ) { + push(@{$tmp{'last'}}, $2); + push(@{$tmp{'last.raw'}}, "\"$tmp\""); + push(@{$tmp{'last.raw'}}, "\"$2$1\""); + $pages{'last'}{"$2"} ++; + $pages{'last'}{"$2$1"} ++; + } + else { + push(@{$tmp{'last.raw'}}, "\"$tmp\""); + } + } + } + else { + if ( $pagedebut =~ /^\d+\z/o ) { + push(@{$tmp{'first'}}, "\"$pagedebut\""); + } + else { + my $tmp = $pagedebut; + if ( $fascicule and $pagedebut =~ /^$fascicule[-.](.+)\z/ ) { + $tmp = $1; + push(@{$tmp{'first.raw'}}, "\"$tmp\""); + } + elsif ( $pagedebut =~ /^\w+[-.](.+)\z/ ) { + $tmp = $1; + push(@{$tmp{'first.raw'}}, "\"$tmp\""); + } + $pages{'first'}{$tmp} ++; + if ( $tmp =~ /^\d+\z/o ) { + push(@{$tmp{'first'}}, $tmp); + } + elsif ( $tmp =~ /^(\d+)([A-Za-z]+)\z/o ) { + push(@{$tmp{'first'}}, $1); + push(@{$tmp{'first.raw'}}, "\"$tmp\""); + push(@{$tmp{'first.raw'}}, "\"$2$1\""); + $pages{'first'}{"$1"} ++; + $pages{'first'}{"$2$1"} ++; + } + elsif ( $tmp =~ /^([A-Za-z]+)(\d+)\z/o ) { + push(@{$tmp{'first'}}, $2); + push(@{$tmp{'first.raw'}}, "\"$tmp\""); + push(@{$tmp{'first.raw'}}, "\"$2$1\""); + $pages{'first'}{"$2"} ++; + $pages{'first'}{"$2$1"} ++; + } + else { + push(@{$tmp{'first.raw'}}, "\"$tmp\""); + } + } + } + foreach my $tmp (sort keys %tmp) { + my @items = @{$tmp{$tmp}}; + next if $#items < 0; + if ( $#items == 0 ) { + push(@tmp, "host.pages.$tmp:$items[0]"); + } + else { + push(@tmp, "host.pages.$tmp:(".join(" OR ", @items).")"); + } + } + if ( @tmp ) { + $query{'page'} = join(" OR ", @tmp); + } + } + +if ( $titre ) { + @tmp = grep(!/^(and|not|or)\z/io, ($titre =~ /(\w+)/go)); + my @mots = sort {length($b) <=> length($a) or $a cmp $b} @tmp; + splice (@mots, 3); + $query{'titre'} = "title:(" . join(" ", @mots) . ")"; + } + +# Composition de la requête +if ( $query{'date'} or $query{'volume'} or $query{'fascicule'} ) { + $query .= " AND (".join(" OR ", grep {defined $_;} $query{'date'}, + $query{'volume'}, $query{'fascicule'}).")"; + } +$query{'base'} = $query; +if ( $query{'auteur'} or $query{'page'} ) { + $query .= " AND (".join(" OR ", grep {defined $_;} $query{'auteur'}, + $query{'page'}).")"; + } +else { + $query .= " AND " . $query{'titre'}; + $query{'base'} = undef; + } + +# Envoi de la requête +print "$label : \"$url$query&$out\"\n"; + +my $perl = get_hits($query); +if ( defined $perl ) { + %modifs = (); + my %top = %{$perl}; + printf "\t => %d\n", $top{'total'}; + if ( $top{'total'} >= 1 ) { + @attendus = (); + if ( $nom1 ) { + push @attendus, decode_entities($nom1) ; + } + foreach my $item (@autres) { + my $nom = $item; + if ( $item =~ /^(.+?), / ) { + $nom = $1; + } + push @attendus, decode_entities($nom); + } + @mac = (); + my $mac = 0; + foreach my $item (@attendus) { + if ( $item =~ /\bma?c ?(\w+)/oi ) { + my $canon = "$`Mc$1$'"; + $mac ++ if $canon ne $item; + push(@mac, $canon); + } + else { + push(@mac, $item); + } + } + if ( $mac == 0 ) { + @mac = (); + } + my @hits = @{$top{'hits'}}; + foreach my $hit (@hits) { + my ($valeur, $idCourant, $arkCourant, $refId, $ok) = evalue($hit, \@attendus, \@mac); + if ( $valeur > $score ) { + $score = $valeur; + $id = $idCourant; + $ark = $arkCourant; + $doi = defined $refId->{'doi'} ? $refId->{'doi'} : undef; + $pii = defined $refId->{'pii'} ? $refId->{'pii'} : undef; + $pmid = defined $refId->{'pmid'} ? $refId->{'pmid'} : undef; + print STDERR "\t=> score = $score\n\n" if $debug; + } + elsif ( $debug ) { + print STDERR "\n"; + } + if ( $ok =~ /D[VIFvif]+/o ) { + $ok{$ok} ++; + } + last if $score == 1.000; + } + } + } +else { + alerte("ERREUR"); + } + +if ( $score == 0 and $query{'base'} and $query{'titre'} ) { + alerte(); + $query = $query{'base'}; + $query .= " AND " . $query{'titre'}; + # Envoi de la requête + print "RAC : \"$url$query&$out\"\n"; + + my $perl = get_hits($query); + if ( defined $perl ) { + my %top = %{$perl}; + printf "\t => %d\n", $top{'total'}; + if ( $top{'total'} >= 1 ) { + my @hits = @{$top{'hits'}}; + foreach my $hit (@hits) { + my ($valeur, $idCourant, $arkCourant, $refId, $ok) = evalue($hit, \@attendus, \@mac); + if ( $valeur > $score ) { + $score = $valeur; + $id = $idCourant; + $ark = $arkCourant; + $doi = defined $refId->{'doi'} ? $refId->{'doi'} : undef; + $pii = defined $refId->{'pii'} ? $refId->{'pii'} : undef; + $pmid = defined $refId->{'pmid'} ? $refId->{'pmid'} : undef; + print STDERR "\t=> score = $score\n\n" if $debug; + } + elsif ( $debug ) { + print STDERR "\n"; + } + if ( $ok =~ /D[VIFvif]+/o ) { + $ok{$ok} ++; + } + last if $score == 1.000; + } + } + } + else { + alerte("ERREUR"); + } + } + +my @liste = groupe($pb{$id}) if not @pages; + +return ($score, $id, $ark, $doi, $pii, $pmid, @liste); +} + +sub evalue +{ +my $hit = shift; +my $ref = shift; +my @attendus = @{$ref}; +$ref = shift; +my @mac1 = @{$ref}; + +my $proche = ""; +my $score = 0; +my $succes = 0; +my $total = 0; + +my %hit = %{$hit}; +my $id = $hit{'id'}; +my $ark = $hit{'arkIstex'}; +my %id = (); + +$id{'doi'} = defined $hit{'doi'} ? $hit{'doi'}->[0] : 'N/A'; +$id{'pii'} = defined $hit{'pii'} ? $hit{'pii'}->[0] : undef; +$id{'pmid'} = defined $hit{'pmid'} ? $hit{'pmid'}->[0] : undef; + +my %host = (); +if ( $hit{'host'} ) { + %host = %{$hit{'host'}}; + } +my %serie = (); +if ( $hit{'serie'} ) { + %serie = %{$hit{'serie'}}; + } +# Test de l'ISSN +my %bib = (); +my @nums = (); +if ( $host{'issn'} ) { + @nums = @{$host{'issn'}}; + } +if ( @nums and @issns ) { + $total ++; + my $trouve = 0; + foreach my $issn1 (@nums) { + foreach my $issn2 (@issns) { + if ( $issn1 eq $issn2 or + uc($issn1) eq uc($issn2) ) { + $trouve = $issn1; + } + } + } + if ( $trouve ) { + $succes ++ ; + $bib{'issn'} = $trouve; + print STDERR "ISSN : +1\n" if $debug; + } + elsif ( $debug ) { + print STDERR "ISSN : 0\n" if $debug; + } + } +elsif ( $debug ) { + print STDERR "ISSN : N.A.\n" if $debug; + } +my $revue = $host{'title'}; +my $serie = defined $serie{'title'} ? $serie{'title'} : undef; +# Test du nom de la revue +if ( @journaux and ( $revue or $serie ) ) { + $total ++; + my $trouve = 0; + if ( $revue ) { + $trouve = revue($revue, @journaux); + if ( $trouve ) { + $succes ++ ; + $bib{'revue'} ++; + } + } + if ( $serie and not $trouve ) { + $trouve = revue($serie, @journaux); + if ( $trouve ) { + $succes ++ ; + $bib{'serie'} ++; + } + } + if ( $trouve ) { + print STDERR "Revue : +1\n" if $debug; + } + else { + print STDERR "Revue : 0\n" if $debug; + } + } +elsif ( $debug ) { + print STDERR "Revue : N.A.\n" if $debug; + } +# Récupération de la date +$bib{'publicationDate'} = $hit{'publicationDate'} ? $hit{'publicationDate'} : undef; +$bib{'copyrightDate'} = $hit{'copyrightDate'} ? $hit{'copyrightDate'} : undef; +$bib{'host.publicationDate'} = $host{'publicationDate'} ? $host{'publicationDate'} : undef; +$bib{'host.copyrightDate'} = $host{'copyrightDate'} ? $host{'copyrightDate'} : undef; +$bib{'serie.publicationDate'} = $serie{'publicationDate'} ? $serie{'publicationDate'} : undef; +$bib{'serie.copyrightDate'} = $serie{'copyrightDate'} ? $serie{'copyrightDate'} : undef; +if ( defined $bib{'publicationDate'} and + $bib{'publicationDate'} eq $date ) { + $bib{'date'} ++; + } +if ( defined $bib{'copyrightDate'} and + $bib{'copyrightDate'} eq $date ) { + $bib{'date'} ++; + } +if ( defined $bib{'host.publicationDate'} and + $bib{'host.publicationDate'} eq $date ) { + $bib{'date'} ++; + } +if ( defined $bib{'host.copyrightDate'} and + $bib{'host.copyrightDate'} eq $date ) { + $bib{'date'} ++; + } +if ( defined $bib{'serie.publicationDate'} and + $bib{'serie.publicationDate'} eq $date ) { + $bib{'date'} ++; + } +if ( defined $bib{'serie.copyrightDate'} and + $bib{'serie.copyrightDate'} eq $date ) { + $bib{'date'} ++; + } +if ( $bib{'date'} ) { + $proche .= "D"; + } +# Test du volume +my $vol = $host{'volume'}; +my $masque = ""; +if ( $vol and $volume ) { + $total ++; + $proche .= "v"; + if ( $vol eq $volume or biniou($vol, $volume, "VOLUME") ) { + $succes ++; + $bib{'volume'} ++; + $proche =~ s/v/V/o; + print "Volume : +1\n" if $debug; + } + elsif ( unidecode($vol) eq unidecode($volume) ) { + $succes ++; + $bib{'volume'} ++; + $proche =~ s/v/V/o; + print "Volume : +1\n" if $debug; + } + elsif ( $debug ) { + print "Volume : 0\n" if $debug; + } + } +elsif ( $vol and $fascicule ) { + $total ++; + $proche .= "i"; + if ( $vol eq $fascicule or biniou($vol, $fascicule, "VOLUME/FASCICULE") ) { + $succes ++; + $bib{'volume'} ++; + $proche =~ s/i/I/o; + $masque = $fascicule; + $fascicule = ""; + alerte("VOLUME = ISSUE"); + print "Volume : +1\n" if $debug; + } + elsif ( unidecode($vol) eq unidecode($fascicule) ) { + $succes ++; + $bib{'volume'} ++; + $proche =~ s/i/I/o; + $masque = $fascicule; + $fascicule = ""; + alerte("VOLUME = ISSUE"); + print "Volume : +1\n" if $debug; + } + elsif ( $debug ) { + print "Volume : 0\n" if $debug; + } + } +elsif ( $debug ) { + print STDERR "Volume : N.A.\n"; + } +# Test du numéro de fascicule +my $issue = $host{'issue'}; +if ( $issue and $fascicule ) { + $total ++; + $proche .= "f"; + my $trouve = 0; + if ( $issue eq $fascicule or biniou($issue, $fascicule, "FASCICULE") ) { + $trouve ++; + } + elsif ( unidecode($issue) eq unidecode($fascicule) ) { + $trouve ++; + } + if ( not $trouve ) { + if ( $fascicule =~ /^\p{IsAlpha}+(\d+)\z/ ) { + my $tmp = $1; + $trouve ++ if $issue eq $tmp; + } + elsif ( $fascicule =~ /^(\d+)\p{IsAlpha}+\z/ ) { + my $tmp = $1; + $trouve ++ if $issue eq $tmp; + } + } + if ( $trouve ) { + $succes ++; + $proche =~ s/f/F/o; + $bib{'fascicule'} ++; + print "Numéro : +1\n" if $debug; + } + elsif ( $debug ) { + print "Numéro : 0\n" if $debug; + } + } +elsif ( $debug ) { + print STDERR "Numéro : N.A.\n"; + } +if ( $masque and not $fascicule ) { + $fascicule = $masque; + $masque = ""; + } + +my $okdebut = undef; +my $okfin = undef; +my $first = $host{'pages'}{'first'}; +# Test du numéro de première page +if ( $first and $pagedebut ) { + $total ++; + my @tmp = (); + my $safe = $first; + $safe =~ s/(\W)/\\$1/go; + if ( defined $pages{'first'} ) { + @tmp = sort keys %{$pages{'first'}}; + } + if ( $first eq $pagedebut ) { + $succes ++; + $bib{'pagedebut'} ++; + $okdebut = $pagedebut; + print STDERR "Page début : +1\n" if $debug; + } + elsif ( grep(/^$safe\z/, @tmp) > 0 ) { + $succes ++; + $bib{'pagedebut'} ++; + $okdebut = (grep(/^$first\z/, @tmp))[0]; + print STDERR "Page début : +1\n" if $debug; + } + elsif ( $debug ) { + print STDERR "Page début : 0\n"; + } + } +elsif ( $debug ) { + print STDERR "Page début : N.A.\n"; + } +my $last = $host{'pages'}{'last'}; +# Test du numéro de dernière page +if ( $last and $pagefin ) { + $total ++; + my @tmp = (); + my $safe = $last; + $safe =~ s/(\W)/\\$1/go; + if ( defined $pages{'last'} ) { + @tmp = sort keys %{$pages{'last'}}; + } + if ( $last eq $pagefin ) { + $succes ++; + $bib{'pagefin'} ++; + $okfin = $pagefin; + print STDERR "Page fin : +1\n" if $debug; + } + elsif ( grep(/^$safe\z/, @tmp) > 0 ) { + $succes ++; + $bib{'pagefin'} ++; + $okfin = (grep(/^$last\z/, @tmp))[0]; + print STDERR "Page fin : +1\n" if $debug; + } + elsif ( $debug ) { + print STDERR "Page fin : 0\n"; + } + } +elsif ( $debug ) { + print STDERR "Page fin : N.A.\n"; + } + +# Indication que les données bibliographiques +# sont a priori correctes +if ( ( $bib{'revue'} or $bib{'issn'} ) and + ( $bib{'volume'} or $bib{'fascicule'} ) and + $bib{'date'} and $okdebut and $okfin ) { + $rdvfp{$id} = "!"; + } + +# Test du pb des notices Pascal (et Francis ?) +# groupant plusieurs articles +$pb{$id} = undef; +if ( $first and $pagedebut and $last and $pagefin ) { + if ( ( $first > $pagedebut and $first <= $pagefin ) or + ( $last < $pagefin and $last >= $pagedebut ) ) { + my %tmp = (); + if ( ( $bib{'revue'} or $bib{'issn'} ) and + ( $bib{'volume'} or $bib{'fascicule'} or + $bib{'publicationDate'} or $bib{'copyrightDate'} or + $bib{'host.publicationDate'} or $bib{'host.copyrightDate'} or + $bib{'serie.publicationDate'} or $bib{'serie.copyrightDate'} ) ) { + $tmp{'revue'} = $revue if $revue; + $tmp{'serie'} = $serie if $serie; + $tmp{'issn'} = $bib{'issn'} if $bib{'issn'}; + $tmp{'volume'} = $vol if $vol; + $tmp{'fascicule'} = $issue if $issue; + $tmp{'publicationDate'} = $bib{'publicationDate'}; + $tmp{'copyrightDate'} = $bib{'copyrightDate'}; + $tmp{'host.publicationDate'} = $bib{'host.publicationDate'}; + $tmp{'host.copyrightDate'} = $bib{'host.copyrightDate'}; + $tmp{'serie.publicationDate'} = $bib{'serie.publicationDate'}; + $tmp{'serie.copyrightDate'} = $bib{'serie.copyrightDate'}; + $tmp{'pagedebut'} = $pagedebut; + $tmp{'pagefin'} = $pagefin; + $pb{$id} = \%tmp; + } + } + } +# Le test devrait être plus circonstancié !!! +# Seulement si on a assez d'éléments pour être sûr !!! +my $bib = 0; +if ( $total >= 4 and $total == $succes ) { + if ( ($bib{'revue'} or $bib{'issn'}) and + ($bib{'volume'} or $bib{'fascicule'}) and + ($bib{'pagedebut'} or $bib{'pagefin'}) ) { + $bib = 1; + } + } +my $poids = 2; +# Commenté pour l'instant en attendant de voir +# si c'est aussi futé que je le croyais ! +# if ( $total <= 4 ) { +# $poids = 1; +# } +# if ( not $bib ) { +# $total = max(4, $total + 1); +# } +if ( $debug ) { + print STDERR "\n\t=> total = $total\n"; + print STDERR "\t=> bib = $bib\n"; + print STDERR "\t=> poids = $poids\n\n"; + } +# Test du titre +my $title = $hit{'title'}; +if ( $title and $titre ) { + $total ++; + my $trouve = 0; + my $pot = 0; + if ( $titre =~ /^Pro and con\s*:\s*/o and + $title =~ /^(Pro|Con)\s*:/o ) { + my ($tmp1) = $titre =~ /^Pro and con\s*:\s*(\S.+)/o; + my ($tmp2) = $title =~ /^(?:Pro|Con)\s*:\s*(\S.+)/o; + if ( $pot = titre($tmp2, $tmp1, $bib) ) { + $trouve ++; + } + } + elsif ( $pot = titre($title, $titre, $bib) ) { + $trouve ++; + } + if ( not $trouve and $titre =~ / [IVXLCDM]+ *:/o ) { + if ( $pot = romain($title, $titre) ) { + $trouve ++; + } + } + if ( $trouve ) { + $succes += $pot; + print STDERR "Titre : +$pot\n" if $debug; + } + elsif ( $debug ) { + print STDERR "Titre : 0\n"; + } + } +elsif ( $debug ) { + print STDERR "Titre : N.A.\n"; + } +# +my @auteurs = (); +if ( $hit{'author'} ) { + @auteurs = @{$hit{'author'}}; + } +# Test des noms d'auteur +my @mac2 = (); +my $mac2 = 0; +if ( ( @auteurs and not @attendus ) or + ( @attendus and not @auteurs ) ) { + $total += 1; + } +elsif ( @auteurs and @attendus ) { + my $ld = 0; + my $trouve = 0; + my @names = (); + foreach my $auteur (@auteurs) { + my %auteur = %{$auteur}; + next unless $auteur{'name'}; # Est-ce possible et si oui, que fait-on ? + my $name = $auteur{'name'}; + if ( $name =~ /\bma?c ?(\w+)/oi ) { + my $canon = "$`Mc$1$'"; + $mac2 ++ if $canon ne $name; + push(@mac2, $canon); + } + else { + push(@mac2, $name); + } + push(@names, $name); + } + if ( $mac2 == 0 ) { + @mac2 = (); + } + ($trouve, $ld) = ld_au(\@attendus, \@names); + if ( $ld and (@mac1 or @mac2) ) { + my $trouve2 = undef; + my $ld2 = undef; + if ( @mac1 and @mac2 ) { + ($trouve2, $ld2) = ld_au(\@mac1, \@mac2); + } + elsif ( @mac1 ) { + ($trouve2, $ld2) = ld_au(\@mac1, \@names); + } + else { + ($trouve2, $ld2) = ld_au(\@attendus, \@mac2); + } + if ( $trouve2 >= $trouve and $ld2 <= $ld ) { + $trouve = $trouve2; + $ld = $ld2; + alerte("CORRECTION MAC"); + } + } + if ( $trouve == ($#attendus + 1) and $ld and $bib ) { + alerte("RATTRAPAGE"); + $ld = 0; + } + my $ild = 1 - ($ld / ($#auteurs + $#attendus + 2)); + $total += $poids; + if ( $#auteurs == $#attendus ) { + $poids = $poids * $trouve * $ild / ($#auteurs + 1); +# $succes += $poids * $trouve * $ild / ($#auteurs + 1); + $succes += $poids; + } + elsif ( $#auteurs > $#attendus ) { + if ( $#attendus > 34 and $#auteurs > 59 and $#auteurs - $#attendus > 2 ) { + $poids = $poids * $trouve * $ild / ($#attendus + 1); + } + else { + $poids = $poids * $trouve * $ild / ($#auteurs + 1); + } + $succes += $poids; + } + else { + $poids = $poids * $trouve * $ild / ($#attendus + 1); + $succes += $poids; + } + print STDERR "Auteurs : +$poids\n\n" if $debug; + } +elsif ( $debug ) { + print STDERR "Auteurs : N.A.\n\n"; + } +print STDERR "\t=> total = $total\n" if $debug; +if ( $total ) { + $score = $succes/$total; + print STDERR "\t=> succes = $succes\n" if $debug; + print STDERR "\t=> score = $score\n\n" if $debug; + return($score, $id, $ark, \%id, $proche); + } +else { + alerte("TOTAL NUL"); + return(0, $id, $ark, \%id, $proche); + } +} + +sub propre +{ +my $chaine = shift; +my $tmp1 = ""; +my $tmp2 = ""; + +# Conversion vers UTF-8 et échappement des caractères réservés +$tmp1 = decode_entities($chaine); +$tmp1 =~ s#([+&|!(){}^"~*?:\/])#\\$1#go; +$tmp1 =~ s#([][])#\\$1#go; + +# URLencodage +$tmp2 = uri_encode($tmp1); +$tmp2 =~ s/%20/ /go; +$tmp2 =~ s/&/%26/go; + +return $tmp2; +} + +sub titre +{ +my ($t1, $t2, $sb) = @_; + +if ( $t1 eq $t2 or + lc($t1) eq lc($t2) or + lc(unidecode($t1)) eq lc(unidecode($t2))) { + return 1; + } + +$t2 =~ s|||go; + +my $tmp1 = lc(join(" ", ($t1 =~ /(\w+)/go))); +my $tmp2 = lc(join(" ", ($t2 =~ /(\w+)/go))); + +if ( $tmp1 eq $tmp2 ) { + return 1; + } +elsif ( unidecode($tmp1) eq unidecode($tmp2) ) { + return 1; + } +else { + $tmp1 = unidecode($tmp1); + $tmp2 = unidecode($tmp2); + my $ld = levenshtein_damereau($tmp1, $tmp2); + if ( $ld == 1 and length($tmp2) > 20 ) { + return 1; + } + elsif ( $ld == 2 and length($tmp2) > 40 ) { + return 1; + } + } + +if ( $sb ) { + if ( $tmp1 =~ /\b$tmp2\b/ or $tmp2 =~ /\b$tmp1\b/ ) { + return 0.5; + } + } + +if ( $t1 =~ /: /o or $t2 =~ /: /o ) { + my ($tmp1) = $t1 =~ /^(.+?)\s*: /o; + my ($tmp2) = $t2 =~ /^(.+?)\s*: /o; + if ( $tmp1 eq $tmp2 ) { + alerte("TITRE RÉDUIT 1"); + return 1; + } + else { + $tmp1 = lc(join(" ", ($tmp1 =~ /(\w+)/go))); + $tmp2 = lc(join(" ", ($tmp2 =~ /(\w+)/go))); + if ( $tmp1 eq $tmp2 ) { + alerte("TITRE RÉDUIT 2"); + return 1; + } + elsif ( unidecode($tmp1) eq unidecode($tmp2) ) { + alerte("TITRE RÉDUIT 3"); + return 1; + } + } + } + +## Test partie de titre +($tmp1 = $t1) =~ s/([^\w ])/\\$1/go; +($tmp2 = $t2) =~ s/([^\w ])/\\$1/go; +if ( length($t1) > length($t2) ) { + if ( $tmp1 =~ /^$tmp2\. .+/i ) { + alerte("TITRE RÉDUIT 4"); + return 0.5; + } + } +elsif ( length($t1) < length($t2) ) { + if ( $tmp2 =~ /^$tmp1\. .+/i ) { + alerte("TITRE RÉDUIT 4"); + return 0.5; + } + } + +return 0; +} + +sub ld_au # Levenshtein-Damereau adapté aux listes d'auteurs +{ +my ($ref1, $ref2) = @_; + +my @liste1 = @{$ref1}; +my @liste2 = @{$ref2}; + +my $len1 = $#liste1 + 1; +my $len2 = $#liste2 + 1; + +return $len2 if $len1 == 0; +return $len1 if $len2 == 0; + +my %mat = (); + +for ( my $i = 0 ; $i <= $len1 ; $i ++ ) { + for ( my $j = 0 ; $j <= $len2 ; $j ++ ) { + $mat{$i}{$j} = 0; + $mat{0}{$j} = $j; + } + $mat{$i}{0} = $i; + } + +my $cost = 0; +my %trouve = (); + +for ( my $i = 1 ; $i <= $len1 ; $i ++ ) { + my $trouve = 0; + for ( my $j = 1 ; $j <= $len2 ; $j ++ ) { + $cost = 1 - compare($liste1[$i-1], $liste2[$j-1]); + $trouve ++ if $cost == 0; + $mat{$i}{$j} = min ([$mat{$i-1}{$j} + 1, + $mat{$i}{$j-1} + 1, + $mat{$i-1}{$j-1} + $cost]); + if ( $i > 1 and $j > 1 and + compare($liste1[$i - 1], $liste2[$j - 2]) and + compare($liste1[$i - 2], $liste2[$j - 1]) ) { + $mat{$i}{$j} = min ([$mat{$i}{$j}, + $mat{$i-2}{$j-2} + $cost]); + } + } + $trouve{$liste1[$i-1]} ++ if $trouve; + } + +my $trouve = 0; +foreach my $item (keys %trouve) { + $trouve += $trouve{$item}; + } + +# print "\t -> $trouve ; $mat{$len1}{$len2}\n"; +if ( $len1 > 35 and $len2 > 60 and $len2 - $len1 > 2 ) { + return ($trouve, $mat{$len1}{$len1}); + } +else { + return ($trouve, $mat{$len1}{$len2}); + } +} + +sub levenshtein_damereau +{ +my ($s1, $s2) = @_; + +my $len1 = length $s1; +my $len2 = length $s2; + +return $len2 if $len1 == 0; +return $len1 if $len2 == 0; + +my %mat = (); + +for ( my $i = 0 ; $i <= $len1 ; $i ++ ) { + for ( my $j = 0 ; $j <= $len2 ; $j ++ ) { + $mat{$i}{$j} = 0; + $mat{0}{$j} = $j; + } + $mat{$i}{0} = $i; + } + +my @ar1 = split(//, $s1); +my @ar2 = split(//, $s2); + +my $cost = 0; + +for ( my $i = 1 ; $i <= $len1 ; $i ++ ) { + for ( my $j = 1 ; $j <= $len2 ; $j ++ ) { + $cost = $ar1[$i-1] eq $ar2[$j-1] ? 0 : 1; + $mat{$i}{$j} = min ([$mat{$i-1}{$j} + 1, + $mat{$i}{$j-1} + 1, + $mat{$i-1}{$j-1} + $cost]); + if ( $i > 1 and $j > 1 and + $ar1[$i - 1] eq $ar2[$j - 2] and + $ar1[$i - 2] eq $ar2[$j - 1] ) { + $mat{$i}{$j} = min ([$mat{$i}{$j}, $mat{$i-2}{$j-2} + $cost]); + } + } + } + +return $mat{$len1}{$len2}; +} + +sub min +{ +my @liste = @{$_[0]}; +my $min = shift @liste; + +foreach my $i (@liste) { + $min = $i if $i < $min; + } + +return $min; +} + +sub romain +{ +my ($t2, $t1) = @_; + +my $p2 = join(" ", ($t2 =~ /(\p{IsAlnum}+)/go)); + +while( $t1 =~ / ([IVXLCDM]+) *: */go ) { + my $avant = $`; + my $apres = $'; + my $numR = $1; + (my $tav = $avant) =~ s/([^\p{IsAlnum} ])/\\$1/go; + (my $tap = $apres) =~ s/([^\p{IsAlnum} ])/\\$1/go; + my $val = $convert->arabic($numR); + if ( $t2 =~ /^$tav +(\p{IsAlnum}+ )?($val|$numR) *[:.] *$tap\z/i ) { + return 1; + } + else { + my $pav = join(" ", ($avant =~ /(\p{IsAlnum}+)/go)); + my $pap = join(" ", ($apres =~ /(\p{IsAlnum}+)/go)); + if ( $p2 =~ /^$pav +(\p{IsAlnum}+ )?($val|$numR) *$pap\z/i ) { + return 1; + } + elsif ( $p2 =~ /^$pav +(\p{IsAlnum}+ )?\p{IsAlnum}+ *$pap\z/i ) { + return 0.5; + } + } + } + +return 0; +} + +sub compare +{ +my ($attendu, $name) = @_; + +(my $safe = $attendu) =~ s/([^\p{IsAlnum} ])/\\$1/go; + +if ( $name =~ /\b$safe\b/i ) { +# print "\t - $attendu <=> $name : 1\n"; + return 1; + } + +$name =~ s/[\x{60}\x{B4}\x{B8}]//go; + +my $tmp1 = lc(unidecode($name)); +my $tmp2 = lc(unidecode($attendu)); +($safe = $tmp2) =~ s/([^\p{IsAlnum} ])/\\$1/go; +if ( $tmp1 =~ /\b$safe\b/ ) { +# print "\t - $attendu <=> $name : 1\n"; + return 1; + } + +my @tmp1 = split(/\s+/o, $tmp1); +my $premier = shift @tmp1; +$tmp1 = join(" ", @tmp1, $premier); +if ( $tmp1 =~ /\b$safe\b/ ) { + return 1; + } +unshift @tmp1, $premier; +my $dernier = pop @tmp1; +$tmp1 = join(" ", $dernier, @tmp1); +if ( $tmp1 =~ /\b$safe\b/ ) { + return 1; + } + +# print "\t - $attendu <=> $name : 0\n"; +return 0; +} + +sub biniou +{ +my ($val1, $val2, $champ) = @_; + +if ( $val1 =~ /^\d+\z/o ) { + if ( $val2 =~ /^(\d+)[-\x{2010}-\x{2015}\x{2212}](\d+)\z/o ) { + my $inf = $1; + my $sup = $2; + if ( $inf < $sup and $val1 >= $inf and $val1 <= $sup ) { + return 1; + } + } + } +elsif ( $val1 =~ m|^(\d+)[-\x{2010}-\x{2015}\x{2212}/](\d+)\z|o ) { + my $inf = $1; + my $sup = $2; + if ( $val2 =~ /^\d+\z/o ) { + if ( $inf < $sup and $val2 >= $inf and $val2 <= $sup ) { + return 1; + } + } + elsif ( $val2 =~ m|^(\d+)[-\x{2010}-\x{2015}\x{2212}/](\d+)\z|o ) { + my $inf2 = $1; + my $sup2 = $2; + if ( $inf == $inf2 and $sup == $sup2 ) { + return 1; + } + } + } +elsif ( $val1 =~ /^([A-Za-z]+)(\d+)\z/o ) { + my $vl = $1; + my $vn = $2; + if ( $val2 =~ /^([A-Za-z])(\d+)[-\x{2010}-\x{2015}\x{2212}]([A-Za-z])(\d+)\z/o ) { + my $infl = $1; + my $infn = $2; + my $supl = $3; + my $supn = $4; + if ( $infl eq $supl ) { + if ( $vl eq $infl and $vn >= $infn and $vn <= $supl ) { + return 1; + } + } + elsif ( $infl lt $supl ) { + if ( $vl eq $infl and $vn >= $infn ) { + return 1; + } + if ( $vl eq $supl and $vn <= $supn ) { + return 1; + } + if ( $vl gt $infl and $vl lt $supl ) { + return 1 + } + } + else { + alerte("FORMAT $champ ISTEX BIZARRE \"$val2\"") + } + } + } +elsif ( $val1 =~ /^([A-Za-z])(\d+)[-\x{2010}-\x{2015}\x{2212}]([A-Za-z])(\d+)\z/o ) { + my $infl = $1; + my $infn = $2; + my $supl = $3; + my $supn = $4; + if ( $val2 =~ /^([A-Za-z]+)(\d+)\z/o ) { + my $vl = $1; + my $vn = $2; + if ( $infl eq $supl ) { + if ( $vl eq $infl and $vn >= $infn and $vn <= $supl ) { + return 1; + } + } + elsif ( $infl lt $supl ) { + if ( $vl eq $infl and $vn >= $infn ) { + return 1; + } + if ( $vl eq $supl and $vn <= $supn ) { + return 1; + } + if ( $vl gt $infl and $vl lt $supl ) { + return 1 + } + } + else { + alerte("FORMAT $champ INIST BIZARRE \"$val1\"") + } + } + } +else { + alerte("FORMAT $champ INIST INATTENDU \"$val1\""); + if ( $val2 !~ /^\d+\z/o and + $val2 !~ /^([A-Za-z]+)(\d+)\z/o and + $val2 !~ /^(\d+)[-\x{2010}-\x{2015}\x{2212}](\d+)\z/o and + $val2 !~ /^([A-Za-z])(\d+)[-\x{2010}-\x{2015}\x{2212}]([A-Za-z])(\d+)\z/o ) { + alerte("FORMAT $champ ISTEX INATTENDU \"$val2\""); + } + } + +return 0; +} + +sub groupe +{ +my $ref = shift; + +my $query = ""; +my @liste = (); + +return @liste if not defined $ref or ref($ref) ne 'HASH'; + +alerte(); + +my %hash = %{$ref}; + +if ( defined $hash{'revue'} ) { + my $revue = propre($hash{'revue'}); + $query = "host.title:\"$revue\""; + } + +if ( defined $hash{'serie'} ) { + my $revue = propre($hash{'serie'}); + if ( $revue !~ /#/o ) { + $query .= " AND " if $query; + $query = "serie.title:\"$revue\""; + } + } + +if ( defined $hash{'issn'} ) { + my $issn = propre($hash{'issn'}); + $query .= " AND " if $query; + $query .= "host.issn:\"$issn\""; + } + +if ( defined $hash{'publicationDate'} ) { + my $pdate = propre($hash{'publicationDate'}); + $query .= " AND " if $query; + $query .= "host.publicationDate:\"$pdate\""; + } +elsif ( defined $hash{'copyrightDate'} ) { + my $cdate = propre($hash{'copyrightDate'}); + $query .= " AND " if $query; + $query .= "host.copyrightDate:\"$cdate\""; + } + +if ( defined $hash{'volume'} ) { + my $volume = propre($hash{'volume'}); + $query .= " AND " if $query; + if ( $volume =~ /^\d+\z/o ) { + $query .= "host.volume:\"$volume\""; + } + else { + $query .= "host.volume.raw:\"$volume\""; + } + } + +if ( defined $hash{'fascicule'} ) { + my $fascicule = propre($hash{'fascicule'}); + $query .= " AND " if $query; + if ( $fascicule =~ /^\d+\z/o ) { + $query .= "host.issue:$fascicule"; + } + else { + $query .= "host.issue.raw:\"$fascicule\""; + } + } + +if ( defined $hash{'pagedebut'} and + defined $hash{'pagefin'} ) { + my $pagedebut = propre($hash{'pagedebut'}); + my $pagefin = propre($hash{'pagefin'}); + $query .= " AND " if $query; + if ( $pagedebut =~ /^\d+\z/o and $pagefin =~ /^\d+\z/o ) { + $query .= "host.pages.first:[$pagedebut TO $pagefin]"; + $query .= " AND host.pages.last:[$pagedebut TO $pagefin]"; + } + else { + $query .= "host.pages.first.raw:[$pagedebut TO $pagefin]"; + $query .= " AND host.pages.last.raw:[$pagedebut TO $pagefin]"; + } + } + +## Envoi de la requête +print "ALT : \"$url$query&$out\"\n"; + +my $perl = get_hits($query); +if ( defined $perl ) { + my %top = %{$perl}; + if ( $top{'total'} >= 1 ) { + foreach my $hit (@{$top{'hits'}}) { + my %hit = %{$hit}; + my $id = $hit{'id'}; + my $ark = $hit{'arkIstex'}; + my $titre = $hit{'title'}; + my $doi = 'N/A'; + if ( defined $hit{'doi'} ) { + $doi = $hit{'doi'}->[0]; + } + my @auteurs = (); + if ( $hit{'author'} ) { + foreach my $auteur (@{$hit{'author'}}) { + if ( defined $auteur->{'name'} ) { + push(@auteurs, $auteur->{'name'}); + } + } + } + if ( $hit{'host'} ) { + my %host = %{$hit{'host'}}; + if ( defined $host{'pages'} ) { + my ($debut, $fin) = (undef, undef); + if ( defined $host{'pages'}->{'first'} ) { + $debut = $host{'pages'}->{'first'}; + } + if ( defined $host{'pages'}->{'last'} ) { + $fin = $host{'pages'}->{'last'}; + } + if ( $debut and $fin ) { + my $tmp = "$debut\t$fin\t"; + $tmp .= join("|", @auteurs); + $tmp .= "\t$titre\t$id\t$ark\t$doi"; + push(@liste, $tmp); + } + } + } + } + } + } + +return @liste; +} + +sub revue +{ +my ($title, @liste) = @_; + +my $match = 0; + +my $rv = lc(join(" ", ($title =~ /(\w+)/go))); +foreach my $item (@liste) { + my $jn = decode_entities($item); + if ( $jn eq $title or lc($jn) eq lc($title) ) { + $match ++; + } + else { + my $tmp = lc(join(" ", ($jn =~ /(\w+)/go))); + if ( $tmp eq $rv ) { + $match ++; + } + elsif ( $tmp =~ / [a-z]\z/o and $rv =~ /^$tmp / ) { + $match ++; + print "\t -> $num\t$tmp\t$rv\n" if not $modifs{"$tmp\t$rv"}; + $modifs{"$tmp\t$rv"} ++; + } + elsif ( $rv =~ / [a-z]\z/o and $tmp =~ /^$rv / ) { + $match ++; + print "\t -> $num\t$tmp\t$rv\n" if not $modifs{"$tmp\t$rv"}; + $modifs{"$tmp\t$rv"} ++; + } + else { + my $tmp1 = join(" ", grep(!/^(and|et|und|e|y)\z/, split(/ +/, $tmp))); + my $tmp2 = join(" ", grep(!/^(and|et|und|e|y)\z/, split(/ +/, $rv))); + if ( $tmp1 eq $tmp2 ) { + $match ++; + } + else { + $tmp1 =~ s/^(the|die|das|les?|la?|du|[ei]l) //o; + $tmp2 =~ s/^(the|die|das|les?|la?|du|[ei]l) //o; + $match ++ if $tmp1 eq $tmp2; + } + } + } + if ( not $match ) { + if ( $jn =~ /\s*: /o ) { + my ($tmp) = ($jn =~ /^(.+)\s*:/o); + if ( $tmp eq $title or lc($tmp) eq lc($title) ) { + $match ++; + } + } + if ( $title =~ /\s*: /o ) { + my ($tmp) = ($title =~ /^(.+)\s*:/o); + if ( $jn eq $tmp or lc($jn) eq lc($tmp) ) { + $match ++; + } + } + } + } + +return $match; +} + + +__DATA__ + +## +## NE PAS MODIFIER ! +## +## DO NOT EDIT! +## + +33 excl +34 dquot +35 num +36 dollar +37 percnt +40 lpar +41 rpar +42 ast +43 plus +44 comma +45 hyphen +46 period +47 sol +58 colon +59 semi +61 equals +63 quest +64 commat +91 lsqb +92 bsol +93 rsqb +95 lowbar +96 grave +123 lcub +124 verbar +256 Amacr; +257 amacr; +258 Abreve; +259 abreve; +260 Aogon; +261 aogon; +262 Cacute; +263 cacute; +264 Ccirc; +265 ccirc; +266 Cdot; +267 cdot; +268 Ccaron; +269 ccaron; +270 Dcaron; +271 dcaron; +272 Dstrok; +273 dstrok; +274 Emacr; +275 emacr; +278 Edot; +279 edot; +280 Eogon; +281 eogon; +282 Ecaron; +283 ecaron; +284 Gcirc; +285 gcirc; +286 Gbreve; +287 gbreve; +288 Gdot; +289 gdot; +290 Gcedil; +291 gcedil; +292 Hcirc; +293 hcirc; +294 Hstrok; +295 hstrok; +296 Itilde; +297 itilde; +298 Imacr; +299 imacr; +302 Iogon; +303 iogon; +304 Idot; +305 inodot; +306 IJlig; +307 ijlig; +308 Jcirc; +309 jcirc; +310 Kcedil; +311 kcedil; +312 kgreen; +313 Lacute; +314 lacute; +315 Lcedil; +316 lcedil; +317 Lcaron; +318 lcaron; +319 Lmidot; +320 lmidot; +321 Lstrok; +322 lstrok; +323 Nacute; +324 nacute; +325 Ncedil; +326 ncedil; +327 Ncaron; +328 ncaron; +329 napos; +330 ENG; +331 eng; +332 Omacr; +333 omacr; +336 Odblac; +337 odblac; +340 Racute; +341 racute; +342 Rcedil; +343 rcedil; +344 Rcaron; +345 rcaron; +346 Sacute; +347 sacute; +348 Scirc; +349 scirc; +350 Scedil; +351 scedil; +354 Tcedil; +355 tcedil; +356 Tcaron; +357 tcaron; +358 Tstrok; +359 tstrok; +360 Utilde; +361 utilde; +362 Umacr; +363 umacr; +364 Ubreve; +365 ubreve; +366 Uring; +367 uring; +368 Udblac; +369 udblac; +370 Uogon; +371 uogon; +372 Wcirc; +373 wcirc; +374 Ycirc; +375 ycirc; +377 Zacute; +378 zacute; +379 Zdot; +380 zdot; +381 Zcaron; +382 zcaron; +501 gacute; +711 caron; +728 breve; +729 dot; +730 ring; +731 ogon; +733 dblac; +902 Aacgr; +904 Eacgr; +905 EEacgr; +906 Iacgr; +908 Oacgr; +910 Uacgr; +911 OHacgr; +912 idiagr; +913 Agr; +914 Bgr; +915 Ggr; +916 Dgr; +917 Egr; +918 Zgr; +919 EEgr; +920 THgr; +921 Igr; +922 Kgr; +923 Lgr; +924 Mgr; +925 Ngr; +926 Xgr; +927 Ogr; +928 Pgr; +929 Rgr; +931 Sgr; +932 Tgr; +933 Ugr; +934 PHgr; +935 KHgr; +936 PSgr; +937 OHgr; +938 Idigr; +939 Udigr; +940 aacgr; +941 eacgr; +942 eeacgr; +943 iacgr; +944 udiagr; +945 agr; +946 bgr; +947 ggr; +948 dgr; +949 egr; +950 zgr; +951 eegr; +952 thgr; +953 igr; +954 kgr; +955 lgr; +956 mgr; +957 ngr; +958 xgr; +959 ogr; +960 pgr; +961 rgr; +962 sfgr; +963 sgr; +964 tgr; +965 ugr; +966 phgr; +967 khgr; +968 psgr; +969 ohgr; +970 idigr; +971 udigr; +972 oacgr; +973 uacgr; +974 ohacgr; +977 thetav; +981 phiv; +988 gammad; +1008 kappav; +1009 rhov; +1025 IOcy; +1026 DJcy; +1027 GJcy; +1028 Jukcy; +1029 DScy; +1030 Iukcy; +1031 YIcy; +1032 Jsercy; +1033 LJcy; +1034 NJcy; +1035 TSHcy; +1036 KJcy; +1038 Ubrcy; +1039 DZcy; +1040 Acy; +1041 Bcy; +1042 Vcy; +1043 Gcy; +1044 Dcy; +1045 IEcy; +1046 ZHcy; +1047 Zcy; +1048 Icy; +1049 Jcy; +1050 Kcy; +1051 Lcy; +1052 Mcy; +1053 Ncy; +1054 Ocy; +1055 Pcy; +1056 Rcy; +1057 Scy; +1058 Tcy; +1059 Ucy; +1060 Fcy; +1061 KHcy; +1062 TScy; +1063 CHcy; +1064 SHcy; +1065 SHCHcy; +1066 HARDcy; +1067 Ycy; +1068 SOFTcy; +1069 Ecy; +1070 YUcy; +1071 YAcy; +1072 acy; +1073 bcy; +1074 vcy; +1075 gcy; +1076 dcy; +1077 iecy; +1078 zhcy; +1079 zcy; +1080 icy; +1081 jcy; +1082 kcy; +1083 lcy; +1084 mcy; +1085 ncy; +1086 ocy; +1087 pcy; +1088 rcy; +1089 scy; +1090 tcy; +1091 ucy; +1092 fcy; +1093 khcy; +1094 tscy; +1095 chcy; +1096 shcy; +1097 shchcy; +1098 hardcy; +1099 ycy; +1100 softcy; +1101 ecy; +1102 yucy; +1103 yacy; +1105 iocy; +1106 djcy; +1107 gjcy; +1108 jukcy; +1109 dscy; +1110 iukcy; +1111 yicy; +1112 jsercy; +1113 ljcy; +1114 njcy; +1115 tshcy; +1116 kjcy; +1118 ubrcy; +1119 dzcy; +8196 emsp13; +8197 emsp14; +8199 numsp; +8200 puncsp; +8202 hairsp; +8208 dash; +8211 ndash; +8212 mdash; +8213 horbar; +8214 Verbar; +8229 nldr; +8244 tprime; +8245 bprime; +8257 caret; +8259 hybull; +8411 tdot; +8412 DotDot; +8453 incare; +8459 hamilt; +8463 planck; +8466 lagran; +8467 ell; +8470 numero; +8471 copysr; +8478 rx; +8486 ohm; +8491 angst; +8492 bernou; +8499 phmmat; +8500 order; +8502 beth; +8503 gimel; +8504 daleth; +8531 frac13; +8532 frac23; +8533 frac15; +8534 frac25; +8535 frac35; +8536 frac45; +8537 frac16; +8538 frac56; +8539 frac18; +8540 frac38; +8541 frac58; +8542 frac78; +8597 varr; +8598 nwarr; +8599 nearr; +8600 drarr; +8601 dlarr; +8602 nlarr; +8603 nrarr; +8605 rarrw; +8606 Larr; +8608 Rarr; +8610 larrtl; +8611 rarrtl; +8614 map; +8617 larrhk; +8618 rarrhk; +8619 larrlp; +8620 rarrlp; +8621 harrw; +8622 nharr; +8624 lsh; +8625 rsh; +8630 cularr; +8631 curarr; +8634 olarr; +8635 orarr; +8636 lharu; +8637 lhard; +8638 uharr; +8639 uharl; +8640 rharu; +8641 rhard; +8642 dharr; +8643 dharl; +8644 rlarr2; +8646 lrarr2; +8647 larr2; +8648 uarr2; +8649 rarr2; +8650 darr2; +8651 lrhar2; +8652 rlhar2; +8653 nlArr; +8654 nhArr; +8655 nrArr; +8661 vArr; +8666 lAarr; +8667 rAarr; +8705 comp; +8708 nexist; +8714 epsis; +8717 bepsi; +8720 coprod; +8722 minus; +8723 mnplus; +8724 plusdo; +8726 setmn; +8728 compfn; +8735 ang90; +8737 angmsd; +8738 angsph; +8739 mid; +8740 nmid; +8741 par; +8742 npar; +8750 conint; +8757 becaus; +8765 bsim; +8768 wreath; +8769 nsim; +8771 sime; +8772 nsime; +8775 ncong; +8777 nap; +8778 ape; +8780 bcong; +8782 bump; +8783 bumpe; +8784 esdot; +8785 eDot; +8786 efDot; +8787 erDot; +8788 colone; +8789 ecolon; +8790 ecir; +8791 cire; +8793 wedgeq; +8796 trie; +8802 nequiv; +8806 lE; +8807 gE; +8808 lne; +8809 gne; +8810 Lt; +8811 Gt; +8812 twixt; +8814 nlt; +8815 ngt; +8816 nle; +8817 nge; +8818 lsim; +8819 gsim; +8822 lg; +8823 gl; +8826 pr; +8827 sc; +8828 pre; +8829 sce; +8830 prsim; +8831 scsim; +8832 npr; +8833 nsc; +8837 nsup; +8840 nsube; +8841 nsupe; +8842 subne; +8843 supne; +8846 uplus; +8847 sqsub; +8848 sqsup; +8849 sqsube; +8850 sqsupe; +8851 sqcap; +8852 sqcup; +8854 ominus; +8856 osol; +8857 odot; +8858 ocir; +8859 oast; +8861 odash; +8862 plusb; +8863 minusb; +8864 timesb; +8865 sdotb; +8866 vdash; +8867 dashv; +8868 top; +8871 models; +8872 vDash; +8873 Vdash; +8874 Vvdash; +8876 nvdash; +8877 nvDash; +8878 nVdash; +8879 nVDash; +8882 vltri; +8883 vrtri; +8884 ltrie; +8885 rtrie; +8888 mumap; +8890 intcal; +8891 veebar; +8892 barwed; +8900 diam; +8902 sstarf; +8903 divonx; +8904 bowtie; +8905 ltimes; +8906 rtimes; +8907 lthree; +8908 rthree; +8909 bsime; +8910 cuvee; +8911 cuwed; +8912 Sub; +8913 Sup; +8914 Cap; +8915 Cup; +8916 fork; +8918 ldot; +8919 gsdot; +8920 Ll; +8921 Gg; +8922 leg; +8923 gel; +8924 els; +8925 egs; +8926 cuepr; +8927 cuesc; +8928 npre; +8929 nsce; +8934 lnsim; +8935 gnsim; +8936 prnsim; +8937 scnsim; +8938 nltri; +8939 nrtri; +8940 nltrie; +8941 nrtrie; +8942 vellip; +8966 Barwed; +8972 drcrop; +8973 dlcrop; +8974 urcrop; +8975 ulcrop; +8981 telrec; +8982 target; +8988 ulcorn; +8989 urcorn; +8990 dlcorn; +8991 drcorn; +8994 frown; +8995 smile; +9251 blank; +9416 oS; +9472 boxh; +9474 boxv; +9484 boxdr; +9488 boxdl; +9492 boxur; +9496 boxul; +9500 boxvr; +9508 boxvl; +9516 boxhd; +9524 boxhu; +9532 boxvh; +9552 boxH; +9553 boxV; +9554 boxdR; +9555 boxDr; +9556 boxDR; +9557 boxdL; +9558 boxDl; +9559 boxDL; +9560 boxuR; +9561 boxUr; +9562 boxUR; +9563 boxuL; +9564 boxUl; +9565 boxUL; +9566 boxvR; +9567 boxVr; +9568 boxVR; +9569 boxvL; +9570 boxVl; +9571 boxVL; +9572 boxHd; +9573 boxhD; +9574 boxHD; +9575 boxHu; +9576 boxhU; +9577 boxHU; +9578 boxvH; +9579 boxVh; +9580 boxVH; +9600 uhblk; +9604 lhblk; +9608 block; +9617 blk14; +9618 blk12; +9619 blk34; +9633 squ; +9642 squf; +9645 rect; +9646 marker; +9651 xutri; +9652 utrif; +9653 utri; +9656 rtrif; +9657 rtri; +9661 xdtri; +9662 dtrif; +9663 dtri; +9666 ltrif; +9667 ltri; +9675 cir; +9733 starf; +9734 star; +9742 phone; +9792 female; +9794 male; +9834 sung; +9837 flat; +9838 natur; +9839 sharp; +10003 check; +10007 cross; +10016 malt; +10022 lozf; +10038 sext; +64256 fflig; +64257 filig; +64258 fllig; +64259 ffilig; + +## +## The End! +## diff --git a/correction/recupErreurs.pl b/correction/recupErreurs.pl new file mode 100755 index 0000000..e798454 --- /dev/null +++ b/correction/recupErreurs.pl @@ -0,0 +1,368 @@ +#!/usr/bin/perl + + +# Déclaration des pragmas +use strict; +use utf8; +use open qw/:std :utf8/; + +# Appel des modules externes de base +use Encode; +use Getopt::Long; + +# Appel des modules spécifiques à l'application +# ??? + +my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; +my $version = "3.1.5"; +my $dateModif = "26 Février 2021"; + +my $usage = "Usage : \n" . + " $programme -h hfd -a alignement -r rejet [ -l log ] [ -m matchStan ] \n" . + " $programme -f fichier -a alignement -r rejet [ -l log ] [ -m matchStan ] \n" . + " $programme -i \n"; + +my $alignement = undef; +my $fichier = undef; +my $hfd = undef; +my $info = undef; +my $log = undef; +my $rejet = undef; +my $matchStan = "matchStan2Istex_v12c.pl"; + +eval { + $SIG{__WARN__} = sub {usage(1);}; + GetOptions( + "alignement=s" => \$alignement, + "fichier=s" => \$fichier, + "hfd=s" => \$hfd, + "info" => \$info, + "log=s" => \$log, + "rejet=s" => \$rejet, + "match=s" => \$matchStan, + ); + }; +$SIG{__WARN__} = sub {warn $_[0];}; + +if ( $info ) { + print "Programme : \n"; + print " “$programme”, version $version ($dateModif)\n"; + print " Permet de chercher les erreurs dans le fichier de résultats obtenu \n"; + print " avec le programme “$matchStan” et de les corriger. \n"; + print "\n"; + print $usage; + print "\nOptions : \n"; + print " -a indique le nom du fichier de résultats de l’alignement dont on doit \n"; + print " corriger les erreurs \n"; + print " -f indique le nom du fichier d’entrée contenant les notices Pascal ou Francis \n"; + print " (qui peut être un fichier compressé avec “gzip” ou “bzip2”) \n"; + print " -h indique le nom du fichier HFD servant d’entrée au programme \n"; + print " -i affiche cette aide. \n"; + print " -l indique le nom du fichier qui recevra, à la fois, le résultat avant et \n"; + print " après lorsqu’une notice traitée donnera un résultat différent \n"; + print " -m indique le nom du programme d’alignement si ce n'est pas celui par défaut \n"; + print " -r indique le nom du fichier contenant les notices Pascal ou Francis ayant \n"; + print " été modifiées parce que les notices originales ont provoqué une erreur \n"; + print " \n"; + print "N.B. : pour ne pas passer par le proxy sur le réseau interne de l’INIST, il faut \n"; + print " effacer les variables globales du proxy par la commande “unset http_proxy \n"; + print " https_proxy no_proxy” avant de lancer le programme “$programme”. \n"; + print " \n"; + + exit 0; + } + +usage(2) if not $alignement or not $rejet; +usage(2) if not $fichier and not $hfd; +usage(2) if $fichier and $hfd; + +if ( $hfd and not -d $hfd ) { + print STDERR "Fichier HFD \"$hfd\" absent !\n"; + exit(3); + } + +# Variables +my $blancs = 0; +my $temp = "rEtmp$$.sgml"; +my @lignes = (); +my %correct = (); +my %notice = (); + +# Gestion des interruptions +$SIG{'HUP'} = 'nettoie'; +$SIG{'INT'} = 'nettoie'; +$SIG{'QUIT'} = 'nettoie'; +$SIG{'TERM'} = 'nettoie'; + +if ( $rejet ) { + if ( -f $rejet ) { + open(INP, "<:raw", $rejet) or die "$!,"; + while() { + next if /^\s*$/o; + next if /^#/o; + my ($inist) = m|(.+?)|o; + $correct{$inist} = $_; + } + close INP; + } + } + +if ( defined $fichier ) { + if ( $fichier eq '-' ) { + open(REC, "<&STDIN") or die "$!,"; + binmode(REC, ":raw"); + } + elsif ( $fichier =~ /\.gz\z/o ) { + open(REC, "gzip -cd $fichier |") or die "$!,"; + binmode(REC, ":raw"); + } + elsif ( $fichier =~ /\.bz2\z/o ) { + open(REC, "bzip2 -cd $fichier |") or die "$!,"; + binmode(REC, ":raw"); + } + else { + open(REC, "<:raw", $fichier) or die "$!,"; + } + } +elsif ( defined $hfd ) { + open(REC, "IhfdCat $hfd |") or die "$!,"; + binmode(REC, ":raw"); + } + +my ($rec, $inist, $refaire) = suivant(); + +open(LOG, ">:utf8", $log) or die "$!,"; + +if ( $alignement =~ /\.gz\z/o ) { + open(INP, "gzip -cd $alignement |") or die "$!,"; + binmode(INP, ":utf8"); + } +elsif ( $alignement =~ /\.bz2\z/o ) { + open(INP, "bzip2 -cd $alignement |") or die "$!,"; + binmode(INP, ":utf8"); + } +else { + open(INP, "<:utf8", $alignement) or die "$!,"; + } +while() { + if ( /^\s*$/o ) { + $blancs ++; + if ( @lignes ) { + passe(@lignes); + @lignes = (); + print "\n"; + } + else { + print $_ if $blancs == 1; + } + } + else { + push(@lignes, $_); + $blancs = 0; + } + } +close INP; + +if ( @lignes ) { + passe(@lignes); + } + + +exit 0; + + +sub usage +{ +print STDERR $usage; + +exit shift; +} + +sub suivant +{ +my $ligne = ; + +if ( defined $ligne ) { + my ($id) = $ligne =~ m|(.+?)|o; + if ( $correct{$id} ) { + $ligne = $correct{$id}; + } + my $status = erreur($ligne); + return ($ligne, $id, $status); + } + +return (undef, undef, undef); +} + +sub erreur +{ +my $ligne = shift; + +if ( m|(.+?)|o ) { + my $champ = $1; + my $pages = ""; + if ( $champ =~ m|(.+?)|o ) { + $pages = $1; + } + elsif ( $champ =~ m|(.+?)|o ) { + $pages = $1; + } + if ( $pages ) { + $pages =~ s|•|.|go; + $pages =~ s|−|-|go; + $pages =~ s|/|/|go; + $pages =~ s/^ +//o; + if ( $pages =~ /^(&\w+gr; )?vol\. *\S+?, */io or + $pages =~ /^(&\w+gr; )?vol \S+?, */io or + $pages =~ /^(&\w+gr; )?vol\d+, */io or + $pages =~ /^&\w+gr;,? */o ) { + $pages = $'; + } + if ( $pages =~ / *[.+?(].*)?.*\z/io or + $pages =~ / +\d+ p\.].*/io ) { + $pages = $`; + } + if ( $pages =~ / *\(.+?\).*\z/io ) { + $pages = $`; + } + if ( $pages =~ /^([0ivx][ivx]*(-[ivx]+)?,? *([ivx]+(-[ivx]+)?,? *)*) (.+)\z/io ) { + $pages = $5; + } + if ( $pages =~ /^s\.p\.\s*\z/io ) { + return 0; + } + elsif ( $pages =~ /^(\d+)-(\d+) +p\./io ) { + return 0; + } + elsif ( $pages =~ /^\S+ +p\./io ) { + return 0; + } + elsif ( $pages =~ /^p\. *(\S+)\z/io ) { + return 0; + } + elsif ( $pages =~ /^(\S+-\S+)-(\S+-\S+)\z/o ) { + return 0; + } + elsif ( $pages =~ /^(\S+?)(-\S+)?,( *(\S+-)?(\S+),)* *(\S+-)?(\S+) *\z/o ) { + return 1; + } + } + } + +return 0; +} + +sub passe +{ +my @liste = @_; + +foreach my $ligne (@liste) { + if ( $ligne =~ /^[_0\.\*\+]+[^\t]*\t(\d+\.\d+\W?|-)\t[AMC]\t(\d+(-\d+)+)\t/o ) { + my $valeur = $1; + my $id = $2; + while ( $id ne $inist ) { + ($rec, $inist, $refaire) = suivant(); + } + if ( $valeur =~ /^\d+\.\d+\z/o ) { + $refaire ++ if $valeur <= 4.000 and $valeur >= 3.490; + } + $refaire ++ if grep(/^\t => ERREUR /o, @liste) > 0; + if ( $refaire ) { + open(TMP, ">:raw", $temp) or die "$!,"; + print TMP $rec; + close TMP; + my $nb = 0; + my $ok = 0; + while( not $ok ) { + open(MSI, "$matchStan -f $temp |") or die "$!,"; + binmode(MSI, ":utf8"); + my @sortie = grep(/\S+/, ); + close MSI; + if ( grep(/^\t => ERREUR /o, @sortie) == 0 ) { + print @sortie; + compare($id, \@liste, \@sortie); + print STDERR "Notice \"$id\" modifiée \n"; + return; + } + elsif ( grep(/^\t => ERREUR 400 /o, @sortie) > 0 ) { + print @liste; + compare($id, \@liste, \@sortie); + print STDERR "Notice \"$id\" erreur 400 !\n"; + return; + } + else { + $nb ++; + if ( $nb >= 10 ) { + print STDERR "Problème notice \"$id\" !\n"; + print @liste; + return; + } + } + } + die "Impossible de supprimer \"$temp\" : $!," if not unlink $temp; + } + else { + print @liste; + } + return; + } + } + +print @liste; +} + +sub compare +{ +my ($id, $ref1, $ref2) = @_; + +my @liste1 = @{$ref1}; +my @liste2 = @{$ref2}; + +my ($score1, $score2) = (undef, undef); + +print LOG "===> $id <=== \n"; + +foreach my $ligne (@liste1) { + if ( $ligne =~ /^[_0\.\*\+]+[^\t]*\t(\d+\.\d+\W?|-)\t[AMC]\t(\d+(-\d+)+)\t/o ) { + $score1 = $1 + } + } + +foreach my $ligne (@liste2) { + if ( $ligne =~ /^[_0\.\*\+]+[^\t]*\t(\d+\.\d+\W?|-)\t[AMC]\t(\d+(-\d+)+)\t/o ) { + $score2 = $1 + } + } + +if ($#liste1 != $#liste2 or $score1 ne $score2 ) { + print LOG @liste1; + print LOG "-------------------- \n"; + print LOG @liste2; + } + +print LOG "\n"; +} + +sub nettoye +{ +my $signal = shift; + +if ( fileno(TMP) ) { + close TMP; + } +if ( -f "$temp" ) { + die "Impossible de supprimer \"$temp\" : $!," if not unlink $temp; + } + +if ( $signal =~ /^\d+\z/ ) { + exit $signal; + } +if ( $signal ) { + print STDERR "Signal SIG$signal détecté\n"; + exit 9; + } +else { + exit 0; + } +} + diff --git a/dedoublonnage/weedTei.pl b/dedoublonnage/weedTei.pl new file mode 100755 index 0000000..48f31f0 --- /dev/null +++ b/dedoublonnage/weedTei.pl @@ -0,0 +1,1414 @@ +#!/usr/bin/perl + + +# Déclaration des pragmas +use strict; +use utf8; +use open qw/:std :utf8/; + +# Appel des modules externes de base +use Encode; +use Getopt::Long; + +# Appel des modules spécifiques à l'application +use HTML::Entities qw(decode_entities %entity2char); +## use HTTP::CookieJar::LWP; +use JSON; +## use LWP::UserAgent; +use Text::Unidecode; +## use URI::Encode qw(uri_encode uri_decode); + +my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; +my $Version = "1.5.1"; +my $dateModif = "10 Mars 2021"; + +my $usage = "Usage : \n" . + " $programme -f fichier[,fichier]* -r répertoire [ -l log ] [ -x ] \n" . + " $programme -h \n"; + +my $aide = undef; +my $log = undef; +my $rep = undef; +my $xclam = undef; +my @fichiers = (); +my %dejaVu = (); + +eval { + $SIG{__WARN__} = sub {usage(1);}; + GetOptions( + "fichier=s" => \@fichiers, + "help" => \$aide, + "log=s" => \$log, + "repertoire=s" => \$rep, + "xclam" => \$xclam, + ); + }; +$SIG{__WARN__} = sub {warn $_[0];}; + +if ( $aide ) { + print " \n"; + print "Programme : \n"; + print " “$programme”, version $Version ($dateModif)\n"; + print " Permet de créer les fichiers d’enrichissement avec les codes de classement \n"; + print " et les mots-clés Pascal ou Francis au format TEI StandOff \n"; + print "\n"; + print $usage; + print "\nOptions : \n"; + print " -f indique le nom du ou des fichiers d’entrée (qui peuvent être des fichiers \n"; + print " compressés avec “gzip” ou “bzip2”). L’option est répétitive et il est possible \n"; + print " d’indiquer plusieurs noms de fichier en les séparant par des virgules (mais \n"; + print " sans espace entre eux) \n"; + print " -h affiche cette aide \n"; + print " -l indique le nom du fichier “log” contenant la liste des appariements \n"; + print " supprimés \n"; + print " -r indique le nom du répertoire où seront créés les fichiers de sortie \n"; + print " portant le même nom que les fichiers d’entrée \n"; + print " -x accepte comme valides les appariements lorsque la valeur du score est \n"; + print " suivie d’un point d’exclamation (“!”) \n"; + print " \n"; + + exit 0; + } + +@fichiers = grep {not $dejaVu{$_} ++;} split(/,/, join(",", @fichiers)); +usage(2) if $#fichiers < 0; + +usage(2) if not $rep; + +# Récupération des noms de fichiers sur l'entrée standard +for ( my $nb = 0 ; $nb <= $#fichiers ; $nb ++ ) { + my $fichier = $fichiers[$nb]; + if ( $fichier eq '-' ) { + splice(@fichiers, $nb, 1); + while() { + chomp; + s/\r//o; + next if /^\s*$/o; + next if /^\s*-\s*$/o; + s/^\s+//o; + s/\s+$//o; + push(@fichiers, $_); + } + } + } + +# Variables +my $info = undef; +my $inist = undef; +my $nb = undef; +my $score = undef; +my %groupe = (); +my %info = (); +my %istex = (); +my %match = (); +my %rang = (); +my %rejetD = (); +my %rejetI = (); + +# Complétion de la table des entitées HTML +while() { + next if /^\s*$/o; + next if /^#/o; + chomp; + my ($num, $sgml) = split(/\t+/); + next if $entity2char{$sgml}; + $entity2char{$sgml} = chr($num); + } +close DATA; + +# Ouverture du fichier "log" +if ( $log ) { + open(LOG, ">:utf8", $log) or die "$!,"; + } +else { + open(LOG, ">:utf8", "/dev/null") or die "$!,"; + } + +foreach my $fichier (@fichiers) { + if ( $fichier =~ /\.gz\z/o ) { + open(INP, "gzip -cd $fichier |") or die "$!,"; + binmode(INP, ":utf8"); + } + elsif ( $fichier =~ /\.bz2\z/o ) { + open(INP, "bzip2 -cd $fichier |") or die "$!,"; + binmode(INP, ":utf8"); + } + else { + open(INP, "<:utf8", $fichier) or die "$!,"; + } + + while() { + if (/^([_.0\*\+]+\x{00A0}*)\t(\d\.\d+)(!?)\t/o) { + $score = $2; + my $statut = $3; + $statut = "" if not $xclam; + if ( $score < 3.490 and $statut ne '!' ) { + $inist = undef; + next; + } + chomp; + my @champs = split(/\t/); + $inist = $champs[3]; + my $id = $champs[17]; + $match{$id}{$inist} = $score; + $rang{$id}{$inist} = keys %{$match{$id}}; + $istex{$inist}{$id} = $score; + } + elsif (/^([_.0\*\+]+\x{00A0}*)\t(\d\.\d+)\W\t/o) { + $inist = undef; + } + elsif ( /^ ~~> \t/o ) { + next if not $inist; + chomp; + my @champs = split(/\t/); + my $id = $champs[5]; + next if defined $match{$id}{$inist}; + push(@{$groupe{$inist}}, "$id:$score"); + } + } + close INP; + + foreach $inist (sort keys %groupe) { + my @tmp = @{$groupe{$inist}}; + if ( $#tmp == 0 ) { + my ($id, $valeur) = split(/:/, $tmp[0]); + if ( $match{$id}{$inist} ) { + $rejetI{$inist}; + next; + } + } + my $erreur = 0; + foreach my $item (@tmp) { + my ($id, $valeur) = split(/:/, $item); + if ( defined $match{$id} ) { + my @inist = keys %{$match{$id}}; + $erreur ++ if @inist; + } + } + if ( $erreur ) { + $rejetI{$inist} ++; + } + else { + foreach my $item (@tmp) { + my ($id, $valeur) = split(/:/, $item); + $match{$id}{$inist} = $valeur; + } + } + } + } + +foreach my $id (keys %match) { + my @tmp = sort keys %{$match{$id}}; + if ( $#tmp == 0 ) { + delete $match{$id}; + next; + } + $nb ++; + foreach my $num (@tmp) { + $info{$num} ++; + } + } + +# print LOG "Nombre de doublons : $nb \n"; +# foreach my $id (sort keys %match) { +# print LOG " -> $id \n"; +# foreach my $num (sort keys %{$match{$id}}) { +# print LOG "\t$num => $match{$id}{$num} \n"; +# } +# } + +foreach my $fichier (@fichiers) { + if ( $fichier =~ /\.gz\z/o ) { + open(INP, "gzip -cd $fichier |") or die "$!,"; + binmode(INP, ":utf8"); + } + elsif ( $fichier =~ /\.bz2\z/o ) { + open(INP, "bzip2 -cd $fichier |") or die "$!,"; + binmode(INP, ":utf8"); + } + else { + open(INP, "<:utf8", $fichier) or die "$!,"; + } + + while() { + if (/^([_.0\*\+]+\x{00A0}*)\t/o) { + chomp; + my @champs = split(/\t/); + $inist = $champs[3]; + $info = join("\t", @champs[4 .. 16]); +# my $id = $champs[17]; + $info{$inist} = $info if $info{$inist}; + } + } + close INP; + } + +foreach my $id (sort keys %match) { + my %tmp = (); + next if not defined $match{$id}; + foreach my $num (sort keys %{$match{$id}}) { + push(@{$tmp{$match{$id}{$num}}}, $num); + } + my @tmp = sort {$b <=> $a} keys %tmp; + next if $#tmp < 0; + my $max = $tmp[0]; + if ( $#{$tmp{$max}} == 0 ) { + my $correct = $tmp{$max}->[0]; + foreach my $num (sort keys %{$match{$id}}) { + next if $num eq $correct; + $rejetD{$num} ++; + } + } + else { + @tmp = sort {$rang{$id}{$a} <=> $rang{$id}{$b}} @{$tmp{$max}}; + my $total = 0; + for (my $nb = 1 ; $nb <= $#tmp ; $nb ++) { + if ( $info{$tmp[0]} eq $info{$tmp[$nb]} ) { + $total ++; + next; + } + if ( compare($tmp[0], $tmp[$nb]) ) { + $total ++; + next; + } + last; + } + if ( $total == $#tmp ) { + foreach my $num (sort keys %{$match{$id}}) { + next if $num eq $tmp[0]; + $rejetD{$num} ++; + } + } + elsif ( $total > 1 and $total == $#tmp - 1 ) { + print LOG " => $id [$max] \n"; + foreach my $num (@{$tmp{$max}}) { + $rejetD{$num} ++; + print LOG "\t$num\t$info{$num}\n"; + } + print LOG "\n"; + } + else { + print LOG " -> $id [$max] \n"; + foreach my $num (@{$tmp{$max}}) { + $rejetD{$num} ++; + print LOG "\t$num\t$info{$num}\n"; + } + print LOG "\n"; + } + } + } + +foreach my $fichier (@fichiers) { + my $compression = undef; + if ( $fichier =~ /\.gz\z/o ) { + $compression = "gzip"; + open(INP, "gzip -cd $fichier |") or die "$!,"; + binmode(INP, ":utf8"); + } + elsif ( $fichier =~ /\.bz2\z/o ) { + $compression = "bzip2"; + open(INP, "bzip2 -cd $fichier |") or die "$!,"; + binmode(INP, ":utf8"); + } + else { + open(INP, "<:utf8", $fichier) or die "$!,"; + } + + if ( $compression ) { + open(OUT, "| $compression -c > $rep/$fichier") or die "$!,"; + binmode(OUT, ":utf8"); + } + else { + open(OUT, ">:utf8", "$rep/$fichier") or die "$!,"; + } + + while() { + if (/^([_.0\*\+]+\x{00A0}*)\t(\d\.\d+)(!?)\t/o) { + my $score = $2; + my $statut = $3; + my @champs = split(/\t/); + $inist = $champs[3]; + if ( $rejetD{$inist} ) { + $champs[1] = $score . '-'; + $_ = join("\t", @champs); + } + } + elsif (/^([_.0\*\+]+\x{00A0}*)\t(\d\.\d+)(\W?)\t/o) { + $inist = undef; + } + elsif ( /^ ~~> \t/o ) { + if ( $inist and $rejetI{$inist} ) { + s/^ ~~> \t/ ::> \t/o; + } + } + print OUT; + } + close INP; + close OUT; + } + +exit 0; + + +sub usage +{ +print STDERR "\n$usage\n"; + +exit shift; +} + +sub compare +{ +my ($n1, $n2) = @_; + +my $test = 0; +my %trouve = (); + +my @i1 = split(/\t/, $info{$n1}); +my @i2 = split(/\t/, $info{$n2}); +# 0 : titre ; 1 : journal ; 2 : livre ; 3 : $issn ; 4 : isbn ; 5 : date +# 6 : volume ; 7 : fascicule ; 8 : pagedebut ; 9 : pagefin ; 10 : nom1 +# 11 : prenom1 ; 12 : autres + +# ISSN +if ( $i1[3] and $i2[3] ) { + $test ++; + if ( $i1[3] eq $i2[3] or uc($i1[3]) eq uc($i2[3]) ) { + $trouve{'ISSN'} ++; + } + } + +# Revue +if ( $i1[1] and $i2[1] ) { + $test ++; + $trouve{'revue'} ++ if revue($i1[1], $i2[1]); + } + +# Date +if ( $i1[5] and $i2[5] ) { + $test ++; + if ( $i1[5] == $i2[5] or $i1[5] eq $i2[5] ) { + $trouve{'date'} ++; + } + } + +# Volume +if ( $i1[6] and $i2[6] ) { + $test ++; + if ( $i1[6] eq $i2[6] or biniou($i1[6], $i2[6], "VOLUME", $n1, $n2) ) { + $trouve{'volume'} ++; + } + elsif ( unidecode($i1[6]) eq unidecode($i2[6]) ) { + $trouve{'volume'} ++; + } + } + +# Fascicule +if ( $i1[7] and $i2[7] ) { + $test ++; + if ( $i1[7] eq $i2[7] or biniou($i1[7], $i2[7], "FASCICULE", $n1, $n2) ) { + $trouve{'fascicule'} ++; + } + elsif ( unidecode($i1[7]) eq unidecode($i2[7]) ) { + $trouve{'fascicule'} ++; + } + } + +# Page de début +if ( $i1[8] and $i2[8] ) { + $test ++; + $trouve{'pagedebut'} ++ if $i1[8] == $i2[8] or $i1[8] eq $i2[8]; + } + +# Page de fin +if ( $i1[9] and $i2[9] ) { + $test ++; + $trouve{'pagefin'} ++ if $i1[9] == $i2[9] or $i1[9] eq $i2[9]; + } + +# Titre du document +if ( $i1[0] and $i2[0] ) { + $test ++; + if ( ($trouve{'ISSN'} or $trouve{'revue'}) and + ($trouve{'date'} or $trouve{'volume'}) and + ($trouve{'pagedebut'} or $trouve{'pagefin'}) ) { + $trouve{'titre'} += titre($i1[0], $i2[0], 1); + } + else { + $trouve{'titre'} += titre($i1[0], $i2[0], 0); + } + } + +# Premier auteur +if ( $i1[10] and $i2[10] ) { + $test ++; + if ( $i1[10] eq $i2[10] or + uc($i1[10]) eq uc($i2[10]) ) { + $trouve{'auteur'} ++; + } + } + +my $trouve = scalar keys %trouve; + +if ( $test > 4 and $trouve == $test ) { + return 1; + } + +if ( ($trouve{'ISSN'} or $trouve{'revue'}) and + ($trouve{'date'} or $trouve{'volume'}) and + ($trouve{'pagedebut'} or $trouve{'pagefin'}) and + ($trouve{'auteur'} or $trouve{'titre'}) ) { + return 1; + } + +return 0; +} + +sub revue +{ +my ($title, @liste) = @_; + +my $match = 0; + +my $rv = lc(join(" ", ($title =~ /(\w+)/go))); +foreach my $item (@liste) { + my $jn = decode_entities($item); + if ( $jn eq $title or lc($jn) eq lc($title) ) { + $match ++; + } + else { + my $tmp = lc(join(" ", ($jn =~ /(\w+)/go))); + if ( $tmp eq $rv ) { + $match ++; + } + elsif ( $tmp =~ / [a-z]\z/o and $rv =~ /^$tmp / ) { + $match ++; + } + elsif ( $rv =~ / [a-z]\z/o and $tmp =~ /^$rv / ) { + $match ++; + } + else { + my $tmp1 = join(" ", grep(!/^(and|et|und|e|y)\z/, split(/ +/, $tmp))); + my $tmp2 = join(" ", grep(!/^(and|et|und|e|y)\z/, split(/ +/, $rv))); + if ( $tmp1 eq $tmp2 ) { + $match ++; + } + else { + $tmp1 =~ s/^(the|die|das|les?|la?|du|[ei]l) //o; + $tmp2 =~ s/^(the|die|das|les?|la?|du|[ei]l) //o; + $match ++ if $tmp1 eq $tmp2; + } + } + } + if ( not $match ) { + if ( $jn =~ /\s*: /o ) { + my ($tmp) = ($jn =~ /^(.+)\s*:/o); + if ( $tmp eq $title or lc($tmp) eq lc($title) ) { + $match ++; + } + } + if ( $title =~ /\s*: /o ) { + my ($tmp) = ($title =~ /^(.+)\s*:/o); + if ( $jn eq $tmp or lc($jn) eq lc($tmp) ) { + $match ++; + } + } + } + } + +return $match; +} + +sub biniou +{ +my ($val1, $val2, $champ, $num1, $num2) = @_; + +if ( $val1 =~ /^\d+\z/o ) { + if ( $val2 =~ /^(\d+)[-\x{2010}-\x{2015}\x{2212}](\d+)\z/o ) { + my $inf = $1; + my $sup = $2; + if ( $inf < $sup and $val1 >= $inf and $val1 <= $sup ) { + return 1; + } + } + } +elsif ( $val1 =~ m|^(\d+)[-\x{2010}-\x{2015}\x{2212}/](\d+)\z|o ) { + my $inf = $1; + my $sup = $2; + if ( $val2 =~ /^\d+\z/o ) { + if ( $inf < $sup and $val2 >= $inf and $val2 <= $sup ) { + return 1; + } + } + elsif ( $val2 =~ m|^(\d+)[-\x{2010}-\x{2015}\x{2212}/](\d+)\z|o ) { + my $inf2 = $1; + my $sup2 = $2; + if ( $inf == $inf2 and $sup == $sup2 ) { + return 1; + } + } + } +elsif ( $val1 =~ /^([A-Za-z]+)(\d+)\z/o ) { + my $vl = $1; + my $vn = $2; + if ( $val2 =~ /^([A-Za-z])(\d+)[-\x{2010}-\x{2015}\x{2212}]([A-Za-z])(\d+)\z/o ) { + my $infl = $1; + my $infn = $2; + my $supl = $3; + my $supn = $4; + if ( $infl eq $supl ) { + if ( $vl eq $infl and $vn >= $infn and $vn <= $supl ) { + return 1; + } + } + elsif ( $infl lt $supl ) { + if ( $vl eq $infl and $vn >= $infn ) { + return 1; + } + if ( $vl eq $supl and $vn <= $supn ) { + return 1; + } + if ( $vl gt $infl and $vl lt $supl ) { + return 1 + } + } + else { + alerte("FORMAT $champ \"$num2\" BIZARRE \"$val2\"") + } + } + } +elsif ( $val1 =~ /^([A-Za-z])(\d+)[-\x{2010}-\x{2015}\x{2212}]([A-Za-z])(\d+)\z/o ) { + my $infl = $1; + my $infn = $2; + my $supl = $3; + my $supn = $4; + if ( $val2 =~ /^([A-Za-z]+)(\d+)\z/o ) { + my $vl = $1; + my $vn = $2; + if ( $infl eq $supl ) { + if ( $vl eq $infl and $vn >= $infn and $vn <= $supl ) { + return 1; + } + } + elsif ( $infl lt $supl ) { + if ( $vl eq $infl and $vn >= $infn ) { + return 1; + } + if ( $vl eq $supl and $vn <= $supn ) { + return 1; + } + if ( $vl gt $infl and $vl lt $supl ) { + return 1 + } + } + else { + alerte("FORMAT $champ \"$num1\" BIZARRE \"$val1\"") + } + } + } +else { + alerte("FORMAT $champ \"$num1\" INATTENDU \"$val1\""); + if ( $val2 !~ /^\d+\z/o and + $val2 !~ /^([A-Za-z]+)(\d+)\z/o and + $val2 !~ /^(\d+)[-\x{2010}-\x{2015}\x{2212}](\d+)\z/o and + $val2 !~ /^([A-Za-z])(\d+)[-\x{2010}-\x{2015}\x{2212}]([A-Za-z])(\d+)\z/o ) { + alerte("FORMAT $champ \"$num2\" INATTENDU \"$val2\""); + } + } + +return 0; +} + +sub titre +{ +my ($t1, $t2, $sb) = @_; + +if ( $t1 eq $t2 or + lc($t1) eq lc($t2) or + lc(unidecode($t1)) eq lc(unidecode($t2))) { + return 1; + } + +$t2 =~ s|||go; + +my $tmp1 = lc(join(" ", ($t1 =~ /(\w+)/go))); +my $tmp2 = lc(join(" ", ($t2 =~ /(\w+)/go))); + +if ( $tmp1 eq $tmp2 ) { + return 1; + } +elsif ( unidecode($tmp1) eq unidecode($tmp2) ) { + return 1; + } +else { + $tmp1 = unidecode($tmp1); + $tmp2 = unidecode($tmp2); + my $ld = levenshtein_damereau($tmp1, $tmp2); + if ( $ld == 1 and length($tmp2) > 20 ) { + return 1; + } + elsif ( $ld == 2 and length($tmp2) > 40 ) { + return 1; + } + } + +if ( $sb ) { + if ( $tmp1 =~ /\b$tmp2\b/ or $tmp2 =~ /\b$tmp1\b/ ) { + return 0.5; + } + } + +if ( $t1 =~ /: /o or $t2 =~ /: /o ) { + my ($tmp1) = $t1 =~ /^(.+?)\s*: /o; + my ($tmp2) = $t2 =~ /^(.+?)\s*: /o; + if ( $tmp1 eq $tmp2 ) { + return 1; + } + else { + $tmp1 = lc(join(" ", ($tmp1 =~ /(\w+)/go))); + $tmp2 = lc(join(" ", ($tmp2 =~ /(\w+)/go))); + if ( $tmp1 eq $tmp2 ) { + return 1; + } + elsif ( unidecode($tmp1) eq unidecode($tmp2) ) { + return 1; + } + } + } + +## Test partie de titre +($tmp1 = $t1) =~ s/([^\w ])/\\$1/go; +($tmp2 = $t2) =~ s/([^\w ])/\\$1/go; +if ( length($t1) > length($t2) ) { + if ( $tmp1 =~ /^$tmp2\. .+/i ) { + return 0.5; + } + } +elsif ( length($t1) < length($t2) ) { + if ( $tmp2 =~ /^$tmp1\. .+/i ) { + return 0.5; + } + } + +return 0; +} + +sub levenshtein_damereau +{ +my ($s1, $s2) = @_; + +my $len1 = length $s1; +my $len2 = length $s2; + +return $len2 if $len1 == 0; +return $len1 if $len2 == 0; + +my %mat = (); + +for ( my $i = 0 ; $i <= $len1 ; $i ++ ) { + for ( my $j = 0 ; $j <= $len2 ; $j ++ ) { + $mat{$i}{$j} = 0; + $mat{0}{$j} = $j; + } + $mat{$i}{0} = $i; + } + +my @ar1 = split(//, $s1); +my @ar2 = split(//, $s2); + +my $cost = 0; + +for ( my $i = 1 ; $i <= $len1 ; $i ++ ) { + for ( my $j = 1 ; $j <= $len2 ; $j ++ ) { + $cost = $ar1[$i-1] eq $ar2[$j-1] ? 0 : 1; + $mat{$i}{$j} = min ([$mat{$i-1}{$j} + 1, + $mat{$i}{$j-1} + 1, + $mat{$i-1}{$j-1} + $cost]); + if ( $i > 1 and $j > 1 and + $ar1[$i - 1] eq $ar2[$j - 2] and + $ar1[$i - 2] eq $ar2[$j - 1] ) { + $mat{$i}{$j} = min ([$mat{$i}{$j}, $mat{$i-2}{$j-2} + $cost]); + } + } + } + +return $mat{$len1}{$len2}; +} + +sub min +{ +my @liste = @{$_[0]}; +my $min = shift @liste; + +foreach my $i (@liste) { + $min = $i if $i < $min; + } + +return $min; +} + +sub alerte +{ +my $message = shift; + +print LOG "ALERTE : $message\n"; +} + +__DATA__ + +## +## Liste d’entités caractères +## + +## +## NE PAS MODIFIER ! +## +## DO NOT EDIT! +## + +33 excl +34 dquot +35 num +36 dollar +37 percnt +40 lpar +41 rpar +42 ast +43 plus +44 comma +45 hyphen +46 period +47 sol +58 colon +59 semi +61 equals +63 quest +64 commat +91 lsqb +92 bsol +93 rsqb +95 lowbar +96 grave +123 lcub +124 verbar +256 Amacr; +257 amacr; +258 Abreve; +259 abreve; +260 Aogon; +261 aogon; +262 Cacute; +263 cacute; +264 Ccirc; +265 ccirc; +266 Cdot; +267 cdot; +268 Ccaron; +269 ccaron; +270 Dcaron; +271 dcaron; +272 Dstrok; +273 dstrok; +274 Emacr; +275 emacr; +278 Edot; +279 edot; +280 Eogon; +281 eogon; +282 Ecaron; +283 ecaron; +284 Gcirc; +285 gcirc; +286 Gbreve; +287 gbreve; +288 Gdot; +289 gdot; +290 Gcedil; +291 gcedil; +292 Hcirc; +293 hcirc; +294 Hstrok; +295 hstrok; +296 Itilde; +297 itilde; +298 Imacr; +299 imacr; +302 Iogon; +303 iogon; +304 Idot; +305 inodot; +306 IJlig; +307 ijlig; +308 Jcirc; +309 jcirc; +310 Kcedil; +311 kcedil; +312 kgreen; +313 Lacute; +314 lacute; +315 Lcedil; +316 lcedil; +317 Lcaron; +318 lcaron; +319 Lmidot; +320 lmidot; +321 Lstrok; +322 lstrok; +323 Nacute; +324 nacute; +325 Ncedil; +326 ncedil; +327 Ncaron; +328 ncaron; +329 napos; +330 ENG; +331 eng; +332 Omacr; +333 omacr; +336 Odblac; +337 odblac; +340 Racute; +341 racute; +342 Rcedil; +343 rcedil; +344 Rcaron; +345 rcaron; +346 Sacute; +347 sacute; +348 Scirc; +349 scirc; +350 Scedil; +351 scedil; +354 Tcedil; +355 tcedil; +356 Tcaron; +357 tcaron; +358 Tstrok; +359 tstrok; +360 Utilde; +361 utilde; +362 Umacr; +363 umacr; +364 Ubreve; +365 ubreve; +366 Uring; +367 uring; +368 Udblac; +369 udblac; +370 Uogon; +371 uogon; +372 Wcirc; +373 wcirc; +374 Ycirc; +375 ycirc; +377 Zacute; +378 zacute; +379 Zdot; +380 zdot; +381 Zcaron; +382 zcaron; +501 gacute; +711 caron; +728 breve; +729 dot; +730 ring; +731 ogon; +733 dblac; +902 Aacgr; +904 Eacgr; +905 EEacgr; +906 Iacgr; +908 Oacgr; +910 Uacgr; +911 OHacgr; +912 idiagr; +913 Agr; +914 Bgr; +915 Ggr; +916 Dgr; +917 Egr; +918 Zgr; +919 EEgr; +920 THgr; +921 Igr; +922 Kgr; +923 Lgr; +924 Mgr; +925 Ngr; +926 Xgr; +927 Ogr; +928 Pgr; +929 Rgr; +931 Sgr; +932 Tgr; +933 Ugr; +934 PHgr; +935 KHgr; +936 PSgr; +937 OHgr; +938 Idigr; +939 Udigr; +940 aacgr; +941 eacgr; +942 eeacgr; +943 iacgr; +944 udiagr; +945 agr; +946 bgr; +947 ggr; +948 dgr; +949 egr; +950 zgr; +951 eegr; +952 thgr; +953 igr; +954 kgr; +955 lgr; +956 mgr; +957 ngr; +958 xgr; +959 ogr; +960 pgr; +961 rgr; +962 sfgr; +963 sgr; +964 tgr; +965 ugr; +966 phgr; +967 khgr; +968 psgr; +969 ohgr; +970 idigr; +971 udigr; +972 oacgr; +973 uacgr; +974 ohacgr; +977 thetav; +981 phiv; +988 gammad; +1008 kappav; +1009 rhov; +1025 IOcy; +1026 DJcy; +1027 GJcy; +1028 Jukcy; +1029 DScy; +1030 Iukcy; +1031 YIcy; +1032 Jsercy; +1033 LJcy; +1034 NJcy; +1035 TSHcy; +1036 KJcy; +1038 Ubrcy; +1039 DZcy; +1040 Acy; +1041 Bcy; +1042 Vcy; +1043 Gcy; +1044 Dcy; +1045 IEcy; +1046 ZHcy; +1047 Zcy; +1048 Icy; +1049 Jcy; +1050 Kcy; +1051 Lcy; +1052 Mcy; +1053 Ncy; +1054 Ocy; +1055 Pcy; +1056 Rcy; +1057 Scy; +1058 Tcy; +1059 Ucy; +1060 Fcy; +1061 KHcy; +1062 TScy; +1063 CHcy; +1064 SHcy; +1065 SHCHcy; +1066 HARDcy; +1067 Ycy; +1068 SOFTcy; +1069 Ecy; +1070 YUcy; +1071 YAcy; +1072 acy; +1073 bcy; +1074 vcy; +1075 gcy; +1076 dcy; +1077 iecy; +1078 zhcy; +1079 zcy; +1080 icy; +1081 jcy; +1082 kcy; +1083 lcy; +1084 mcy; +1085 ncy; +1086 ocy; +1087 pcy; +1088 rcy; +1089 scy; +1090 tcy; +1091 ucy; +1092 fcy; +1093 khcy; +1094 tscy; +1095 chcy; +1096 shcy; +1097 shchcy; +1098 hardcy; +1099 ycy; +1100 softcy; +1101 ecy; +1102 yucy; +1103 yacy; +1105 iocy; +1106 djcy; +1107 gjcy; +1108 jukcy; +1109 dscy; +1110 iukcy; +1111 yicy; +1112 jsercy; +1113 ljcy; +1114 njcy; +1115 tshcy; +1116 kjcy; +1118 ubrcy; +1119 dzcy; +8196 emsp13; +8197 emsp14; +8199 numsp; +8200 puncsp; +8202 hairsp; +8208 dash; +8211 ndash; +8212 mdash; +8213 horbar; +8214 Verbar; +8229 nldr; +8244 tprime; +8245 bprime; +8257 caret; +8259 hybull; +8411 tdot; +8412 DotDot; +8453 incare; +8459 hamilt; +8463 planck; +8466 lagran; +8467 ell; +8470 numero; +8471 copysr; +8478 rx; +8486 ohm; +8491 angst; +8492 bernou; +8499 phmmat; +8500 order; +8502 beth; +8503 gimel; +8504 daleth; +8531 frac13; +8532 frac23; +8533 frac15; +8534 frac25; +8535 frac35; +8536 frac45; +8537 frac16; +8538 frac56; +8539 frac18; +8540 frac38; +8541 frac58; +8542 frac78; +8597 varr; +8598 nwarr; +8599 nearr; +8600 drarr; +8601 dlarr; +8602 nlarr; +8603 nrarr; +8605 rarrw; +8606 Larr; +8608 Rarr; +8610 larrtl; +8611 rarrtl; +8614 map; +8617 larrhk; +8618 rarrhk; +8619 larrlp; +8620 rarrlp; +8621 harrw; +8622 nharr; +8624 lsh; +8625 rsh; +8630 cularr; +8631 curarr; +8634 olarr; +8635 orarr; +8636 lharu; +8637 lhard; +8638 uharr; +8639 uharl; +8640 rharu; +8641 rhard; +8642 dharr; +8643 dharl; +8644 rlarr2; +8646 lrarr2; +8647 larr2; +8648 uarr2; +8649 rarr2; +8650 darr2; +8651 lrhar2; +8652 rlhar2; +8653 nlArr; +8654 nhArr; +8655 nrArr; +8661 vArr; +8666 lAarr; +8667 rAarr; +8705 comp; +8708 nexist; +8714 epsis; +8717 bepsi; +8720 coprod; +8722 minus; +8723 mnplus; +8724 plusdo; +8726 setmn; +8728 compfn; +8735 ang90; +8737 angmsd; +8738 angsph; +8739 mid; +8740 nmid; +8741 par; +8742 npar; +8750 conint; +8757 becaus; +8765 bsim; +8768 wreath; +8769 nsim; +8771 sime; +8772 nsime; +8775 ncong; +8777 nap; +8778 ape; +8780 bcong; +8782 bump; +8783 bumpe; +8784 esdot; +8785 eDot; +8786 efDot; +8787 erDot; +8788 colone; +8789 ecolon; +8790 ecir; +8791 cire; +8793 wedgeq; +8796 trie; +8802 nequiv; +8806 lE; +8807 gE; +8808 lne; +8809 gne; +8810 Lt; +8811 Gt; +8812 twixt; +8814 nlt; +8815 ngt; +8816 nle; +8817 nge; +8818 lsim; +8819 gsim; +8822 lg; +8823 gl; +8826 pr; +8827 sc; +8828 pre; +8829 sce; +8830 prsim; +8831 scsim; +8832 npr; +8833 nsc; +8837 nsup; +8840 nsube; +8841 nsupe; +8842 subne; +8843 supne; +8846 uplus; +8847 sqsub; +8848 sqsup; +8849 sqsube; +8850 sqsupe; +8851 sqcap; +8852 sqcup; +8854 ominus; +8856 osol; +8857 odot; +8858 ocir; +8859 oast; +8861 odash; +8862 plusb; +8863 minusb; +8864 timesb; +8865 sdotb; +8866 vdash; +8867 dashv; +8868 top; +8871 models; +8872 vDash; +8873 Vdash; +8874 Vvdash; +8876 nvdash; +8877 nvDash; +8878 nVdash; +8879 nVDash; +8882 vltri; +8883 vrtri; +8884 ltrie; +8885 rtrie; +8888 mumap; +8890 intcal; +8891 veebar; +8892 barwed; +8900 diam; +8902 sstarf; +8903 divonx; +8904 bowtie; +8905 ltimes; +8906 rtimes; +8907 lthree; +8908 rthree; +8909 bsime; +8910 cuvee; +8911 cuwed; +8912 Sub; +8913 Sup; +8914 Cap; +8915 Cup; +8916 fork; +8918 ldot; +8919 gsdot; +8920 Ll; +8921 Gg; +8922 leg; +8923 gel; +8924 els; +8925 egs; +8926 cuepr; +8927 cuesc; +8928 npre; +8929 nsce; +8934 lnsim; +8935 gnsim; +8936 prnsim; +8937 scnsim; +8938 nltri; +8939 nrtri; +8940 nltrie; +8941 nrtrie; +8942 vellip; +8966 Barwed; +8972 drcrop; +8973 dlcrop; +8974 urcrop; +8975 ulcrop; +8981 telrec; +8982 target; +8988 ulcorn; +8989 urcorn; +8990 dlcorn; +8991 drcorn; +8994 frown; +8995 smile; +9251 blank; +9416 oS; +9472 boxh; +9474 boxv; +9484 boxdr; +9488 boxdl; +9492 boxur; +9496 boxul; +9500 boxvr; +9508 boxvl; +9516 boxhd; +9524 boxhu; +9532 boxvh; +9552 boxH; +9553 boxV; +9554 boxdR; +9555 boxDr; +9556 boxDR; +9557 boxdL; +9558 boxDl; +9559 boxDL; +9560 boxuR; +9561 boxUr; +9562 boxUR; +9563 boxuL; +9564 boxUl; +9565 boxUL; +9566 boxvR; +9567 boxVr; +9568 boxVR; +9569 boxvL; +9570 boxVl; +9571 boxVL; +9572 boxHd; +9573 boxhD; +9574 boxHD; +9575 boxHu; +9576 boxhU; +9577 boxHU; +9578 boxvH; +9579 boxVh; +9580 boxVH; +9600 uhblk; +9604 lhblk; +9608 block; +9617 blk14; +9618 blk12; +9619 blk34; +9633 squ; +9642 squf; +9645 rect; +9646 marker; +9651 xutri; +9652 utrif; +9653 utri; +9656 rtrif; +9657 rtri; +9661 xdtri; +9662 dtrif; +9663 dtri; +9666 ltrif; +9667 ltri; +9675 cir; +9733 starf; +9734 star; +9742 phone; +9792 female; +9794 male; +9834 sung; +9837 flat; +9838 natur; +9839 sharp; +10003 check; +10007 cross; +10016 malt; +10022 lozf; +10038 sext; +64256 fflig; +64257 filig; +64258 fllig; +64259 ffilig; + +## +## The End! +## diff --git a/generation_tei/alignment2tei.pl b/generation_tei/alignment2tei.pl new file mode 100755 index 0000000..87b285c --- /dev/null +++ b/generation_tei/alignment2tei.pl @@ -0,0 +1,2027 @@ +#!/usr/bin/perl + + +# Déclaration des pragmas +use strict; +use utf8; +use open qw/:std :utf8/; + +# Appel des modules externes de base +use Encode; +use Getopt::Long; + +# Appel des modules spécifiques à l'application +use HTML::Entities qw(decode_entities %entity2char); +## use HTTP::CookieJar::LWP; +use JSON; +## use LWP::UserAgent; +use Text::Unidecode; +## use URI::Encode qw(uri_encode uri_decode); + +my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; +my $substitut = " " x (length($programme) - 2); +my $Version = "1.9.2"; +my $dateModif = "7 Mars 2021"; + +my $usage = "Usage : \n" . + " $programme -f (fichier|-) -a fichier_align -d date -v version [ -l log ] \n" . + " $substitut [ -c répertoire_cc ] [ -r (0|1) ] [ -x ] \n" . + " $programme -h fichier_HFD -a fichier_align -d date -v version [ -l log ] \n" . + " $substitut [ -c répertoire_cc ] [ -r (0|1) ] [ -x ] \n" . + " $programme -i \n"; +$substitut .= " "; + +my $align = undef; +my $cc_dir = "CC"; +my $date = undef; +my $fichier = undef; +my $hfd = undef; +my $info = undef; +my $log = undef; +my $ready = 1; +my $version = undef; +my $xclam = 0; + +eval { + $SIG{__WARN__} = sub {usage(1);}; + GetOptions( + "align=s" => \$align, + "cc_dir=s" => \$cc_dir, + "date=s" => \$date, + "fichier=s" => \$fichier, + "hfd=s" => \$hfd, + "info" => \$info, + "log=s" => \$log, + "ready=i" => \$ready, + "version=s" => \$version, + "xclam" => \$xclam, + ); + }; +$SIG{__WARN__} = sub {warn $_[0];}; + +if ( $info ) { + print " \n"; + print "Programme : \n"; + print " “$programme”, version $Version ($dateModif)\n"; + print " Permet de créer les fichiers d’enrichissement avec les codes de classement \n"; + print " et les mots-clés Pascal ou Francis au format TEI StandOff \n"; + print "\n"; + print $usage; + print "\nOptions : \n"; + print " -a indique le nom du fichier résultat de l’alignement (qui peut être un \n"; + print " fichier compressé avec “gzip” ou “bzip2”) \n"; + print " -c indique le nom du répertoire contenant les tables de correspondance \n"; + print " entre codes et verbalisation \n"; + print " -d indique la date à laquelle a été fait l’alignement, en utilisant le format \n"; + print " “aaaa-mm-jj” (par ex. “2020-09-28”) \n"; + print " -f indique le nom du fichier d’entrée (qui peut être un fichier compressé \n"; + print " avec “gzip” ou “bzip2”). Pour utiliser l’entrée standard, mettre un \n"; + print " tiret “-” comme argument \n"; + print " -h indique le nom du fichier HFD servant d’entrée au programme \n"; + print " -i affiche cette aide \n"; + print " -l indique le nom du fichier “log” contenant la liste des notices INIST \n"; + print " appariées ainsi que les identifiants des documents ISTEX correspondants \n"; + print " -r crée l'organisation hiérarchique en 4 répertoires d’ISTEX si la valeur \n"; + print " est 1 (valeur par défaut). Autrement, les fichiers sont créés dans le \n"; + print " répertoire courant \n"; + print " -v indique le numéro de version du programme “matchStan2Istex.pl” utilisé \n"; + print " pour réaliser l’alignement \n"; + print " -x accepte comme valides les appariements lorsque la valeur du score est \n"; + print " suivie d’un point d’exclamation (“!”) \n"; + print " \n"; + + exit 0; + } + +usage(2) if not $align or not $date or not $version; +usage(2) if not $fichier and not $hfd; +usage(2) if $fichier and $hfd; + +if ( $date !~ /^(\d\d\d\d)-(\d\d)-(\d\d)\z/ ) { + print STDERR "\n"; + print STDERR "$programme : erreur dans le format de la date ! \n"; + print STDERR "$substitut Utilisez le format “aaaa-mm-jj”. \n"; + exit 3; + } + +if ( $version !~ /^\d+\.\d+\.\d+\z/ ) { + print STDERR "\n"; + print STDERR "$programme : erreur dans le format de la version ! \n\n"; + print STDERR "Vérifiez la bonne version avec la commande : “matchStan2Istex.pl -i” \n"; + exit 4; + } + +if ( $cc_dir =~ m|^(.*)/\z| ) { + $cc_dir = $1; + } + +# Variables +my $annee = ""; +my $inist = ""; +my $num = 0; +my @matchs = (); +my @parties = (); +my %base = (); +my %canon = (); +my %classe = (); +my %equiv = (); +my %francis = (); +my %lodex = (); +my %match = (); +my %verb = (); + +# Tables nécessaires aux verbalisation +my $verbPascalFr = "$cc_dir/verbPascalFr.txt"; +my $verbPascalEn = "$cc_dir/verbPascalEn.txt"; +my $verbFrancisFr = "$cc_dir/verbFrancisFr.txt"; +my $verbFrancisEn = "$cc_dir/verbFrancisEn.txt"; +my $equivCCPascal = "$cc_dir/equivCCPascal.txt"; +my $equivCCFrancis = "$cc_dir/equivCCFrancis.txt"; +my $liensLodex = "$cc_dir/liensLodex.txt"; + +# Lecture des tables +open(VPF, "<:utf8", $verbPascalFr) or die "$!,"; +while() { + chomp; + s/\r//go; + my ($code, $intitule) = split(/\t/); + $verb{$code}{'FR'} = $intitule; + } +close VPF; + +open(VPE, "<:utf8", $verbPascalEn) or die "$!,"; +while() { + chomp; + s/\r//go; + my ($code, $intitule) = split(/\t/); + $verb{$code}{'EN'} = $intitule; + } +close VPE; + +open(VFF, "<:utf8", $verbFrancisFr) or die "$!,"; +while() { + chomp; + s/\r//go; + my ($code, $intitule) = split(/\t/); + $verb{$code}{'FR'} = $intitule; + if ( $code =~ /^[567]\d\d\z/o ) { + $francis{$code} ++; + } + } +close VFF; + +open(VFE, "<:utf8", $verbFrancisEn) or die "$!,"; +while() { + chomp; + s/\r//go; + my ($code, $intitule) = split(/\t/); + $verb{$code}{'EN'} = $intitule; + } +close VFE; + +open(EQP, "<:utf8", $equivCCPascal) or die "$!,"; +while() { + chomp; + s/\r//go; + my ($ac, $canon) = split(/\t/); + my ($an, $code) = split(/-/, $ac); + $equiv{'Pascal'}{$an}{$code} = $canon; + $canon{'Pascal'}{$code}{$canon} ++; + } +close EQP; + +open(EQF, "<:utf8", $equivCCFrancis) or die "$!,"; +while() { + chomp; + s/\r//go; + my ($ca, $canon) = split(/\t/); + if ( $canon =~ /^-(.+)/ ) { + $canon = $1; + } +# if ( $canon =~ /^(.+?)-(.+)/o ) { +# $canon = $1.$2; +# } + if ( $canon =~ /^(.+?) (.+)/o ) { + $canon = "$1-$2"; + } + my ($code, $an) = split(/#/, $ca); +# if ( $code =~ /^(.+?)-(.+)/o ) { +# $code = $1.$2; +# } + if ( $code =~ /^(.+?) (.+)/o ) { + $code = "$1-$2"; + } + $equiv{'Francis'}{$an}{$code} = $canon; + } +close EQF; + +open(LDX, "<:utf8", $liensLodex) or die "$!,"; +while() { + chomp; + s/\r//go; + my ($lien, $code, $intitule) = split(/\t/); + $lodex{$code} = $lien; + } +close LDX; + +# Lecture du modèle de fichier +while() { + next if /^\s*$/o; + next if /^#/o; + if ( /^%%\s+partie\s+(\d+)/o ) { + $num = $1; + } + elsif ( /^%%\s+FIN/o ) { + last; + } + else { + $parties[$num] .= $_; + } + } + +# Modification de la date et du numéro de version +$parties[3] =~ s/%DATE%/$date/; +$parties[3] =~ s/%VERSION%/$version/; + +# Complétion de la table des entitées HTML +#foreach my $item (qw/amp gt lt/) { +# delete $entity2char{$item}; +# } +while() { + next if /^\s*$/o; + next if /^#/o; + chomp; + my ($num, $sgml) = split(/\t+/); + next if $entity2char{$sgml}; + $entity2char{$sgml} = chr($num); + } +close DATA; + +my %trans = ( + "205" => 1, + "210" => 1, + "215" => 1, + "220" => 1, + "221" => 1, + "222" => 1, + "223" => 1, + "224" => 1, + "225" => 1, + "226" => 1, + "227" => 1, + "230" => 1, + "235" => 1, + "240" => 1, + "250" => 1, + "260" => 1, + "280" => 1, + "295" => 1, + ); + +my %voc = ( + "2" => "Sciences de la Terre", + "9" => "Traductions", + "X" => "Sciences exactes, Sciences de la vie, Technologies", + "A" => "Art et archéologie", + "B" => "Science administrative", + "C" => "Sciences de l’éducation", + "D" => "Gestion des entreprises", + "E" => "Économie de l’énergie", + "G" => "Bibliographie géographique internationale", + "H" => "Préhistoire et protohistoire", + "I" => "Histoire et sciences de la littérature", + "J" => "Informatique et sciences juridiques", + "L" => "Sciences du langage", + "N" => "Éthnologie", + "P" => "Philosophie", + "R" => "Histoire et sciences des religions", + "S" => "Sociologie", + "T" => "Histoire des sciences et techniques", + "U" => "Amérique latine", + "V" => "Sciences humaines de la santé", + "W" => "Économie générale", + "1" => "Sciences de l’ingénieur", + "3" => "Physique", + ); +my %nfe = ( + "agr" => "Agrovoc descriptor", + "ar" => "Architect", + "cog" => "Geographical coordinate", + "da" => "Anion", + "dc" => "Cation", + "de" => "Enzyme", + "dg" => "Geography", + "dp" => "Psychometrics", + "ds" => "Systematics", + "dw" => "Virus", + "fa" => "Author", + "fc" => "Organic ligand", + "fd" => "Chronology", + "fe" => "Enzyme", + "ff" => "Pesticide", + "fm" => "Methodology", + "fr" => "Medicament", + "fs" => "Site", + "fx" => "Toxic", + "na" => "Anion", + "nb" => "Cited work", + "nc" => "Cation", + "nd" => "Date", + "ne" => "Technique", + "nf" => "Person (human being, divinity)", + "ng" => "Geography", + "ni" => "Matter-Concept", + "nj" => "Organization-Institution", + "nk" => "Chemical compound", + "nl" => "Language", + "nm" => "Disease", + "nn" => "Ethnic group", + "no" => "Celestial body", + "np" => "Psychometric test", + "ns" => "Systematics", + "nt" => "Soil", + "nv" => "Petrography", + "nw" => "Virus", + "nx" => "Stratigraphy", + "ny" => "Paleontology", + "nz" => "Mineralogy", + "sa" => "Works (buildings)", + "si" => "Information system", + "tec" => "Commercial name. Technical name", + "vf" => "Pesticides", + "vk" => "Chemical terms", + "vr" => "Medicaments", + "vx" => "Toxics", + "CD" => "Candidate keyword", + "INC" => "Uncontrolled term", + ); +my %nff = ( + "agr" => "Descripteur Agrovoc", + "ar" => "Architecte", + "cog" => "Coordonnée géographique", + "da" => "Anion", + "dc" => "Cation", + "de" => "Enzyme", + "dg" => "Géographie", + "dp" => "Psychométrie", + "ds" => "Systématique", + "dw" => "Virus", + "fa" => "Auteur", + "fc" => "Coordinat organique", + "fd" => "Chronologie", + "fe" => "Enzyme", + "ff" => "Pesticide", + "fm" => "Méthodologie", + "fr" => "Médicament", + "fs" => "Site", + "fx" => "Toxique", + "na" => "Anion", + "nb" => "Oeuvre citée", + "nc" => "Cation", + "nd" => "Date", + "ne" => "Technique", + "nf" => "Personne (être humain, divinité)", + "ng" => "Géographie", + "ni" => "Matière-Concept", + "nj" => "Organisme-Institution", + "nk" => "Composé chimique", + "nl" => "Langue", + "nm" => "Maladie", + "nn" => "Ethnie", + "no" => "Objet céleste", + "np" => "Test de psychométrie", + "ns" => "Systématique", + "nt" => "Sol", + "nv" => "Pétrographie", + "nw" => "Virus", + "nx" => "Stratigraphie", + "ny" => "Paléontologie", + "nz" => "Minéralogie", + "sa" => "Ouvrages (bâtiments)", + "si" => "Système d'information", + "tec" => "Nom commercial. Nom technique", + "vf" => "Pesticides", + "vk" => "Termes chimiques", + "vr" => "Médicaments", + "vx" => "Toxiques", + "CD" => "Candidat descripteur", + "INC" => "Terme non contrôlé", + ); + +if ( $align =~ /\.gz\z/o ) { + open(INP, "gzip -cd $align |") or die "$!,"; + binmode(INP, ":utf8"); + } +elsif ( $align =~ /\.bz2\z/o ) { + open(INP, "bzip2 -cd $align |") or die "$!,"; + binmode(INP, ":utf8"); + } +else { + open(INP, "<:utf8", $align) or die "$!,"; + } + +if ( $log ) { + open(LOG, ">:utf8", $log) or die "$!,"; + } +else { + open(LOG, ">:utf8", "/dev/null") or die "$!,"; + } + +while() { + if (/^([_.0\*\+]+\x{00A0}*)\t(\d\.\d+)(!?)\t/o) { + my $score = $1; + my $grade = $2; + my $statut = $3; + $statut = undef if not $xclam; + if ( $grade < 3.490 and not $statut ) { + $inist = undef; + next; + } + chomp; + my @champs = split(/\t/); + $inist = $champs[3]; + my $id = $champs[17]; + my $ark = $champs[18]; + my $doi = $champs[19]; + my $pii = $champs[20]; + my $pmid = $champs[21]; + print LOG "$inist\t$grade$statut\t$id\t$ark\t$doi\t$pii\t$pmid\n"; + push(@{$match{$inist}}, "$id\t$ark\t$doi\t$pii\t$pmid"); + } + elsif (/^([_.0\*\+]+\x{00A0}*)\t(\d\.\d+)\W\t/o) { + $inist = undef; + } + elsif ( /^ ~~> \t/o ) { + next if not $inist; + chomp; + my @champs = split(/\t/); + my $id = $champs[5]; + my $ark = $champs[6]; + my $doi = $champs[7]; + my $pii = $champs[8]; + my $pmid = $champs[9]; + print LOG "$inist\t \" \" \t$id\t$ark\t$doi\t$pii\t$pmid\n"; + push(@{$match{$inist}}, "$id\t$ark\t$doi\t$pii\t$pmid"); + } + } +close INP; + +if ( $fichier ) { + if ( $fichier =~ /\.gz\z/o ) { + open(INP, "gzip -cd $fichier |") or die "$!,"; + binmode(INP, ":raw"); + } + elsif ( $fichier =~ /\.bz2\z/o ) { + open(INP, "bzip2 -cd $fichier |") or die "$!,"; + binmode(INP, ":raw"); + } + else { + open(INP, "<:raw", $fichier) or die "$!,"; + } + } +elsif ( $hfd ) { + open(INP, "IhfdCat $hfd |") or die "$!,"; + binmode(INP, ":raw"); + } + +while() { +# ($inist) = m|((?:\d\d\d-)?\d\d-\d+)|o; + ($inist) = m|(.+?)|o; + next if not defined $match{$inist}; + + $annee = undef; + if ( $inist =~ /^(\d\d)-\d+\z/o ) { + $annee = $1; + } + elsif ( $inist =~ /^\d\d\d[-.](\d\d)[-.]\d+\z/o ) { + $annee = $1; + } + else { + print STDERR "N° INIST incorrect \"$inist\"\n"; + } + if ( $annee > 50 ) { + $annee = "19$annee"; + } + else { + $annee = "20$annee"; + } + + my @codes = (); + foreach my $item (m|(.+?)|go) { + if ( $item =~ m|(([567]\d\d)(.*?))(.+?)|o ) { + my $pos = $1; + my $base = $2; + my $code = $3; + if ( $5 ) { + $code = "$4-$5"; + } + elsif ( $codes[$#codes] =~ /^$code/ ) { + next; + } + $codes[$pos] = "$base:$code"; + $base{$code} = "$4"; + $classe{$code} = "$4-$6"; + } + elsif ( $item =~ m|(.+?)|o ) { + my $pos = $1; + my $base = $2; + my $code = $3; + if ( $code =~ /^([567]\d\d)(\w+)\z/o ) { + $code = "$1-$2" if $francis{$1}; + } + elsif ( $code =~ /^(2\d\d)\w*\z/o ) { + next if $trans{$1}; + } + else { + if ( $code =~ /^001B40F30A[12]\z/o ) { + $code .= "0"; + } + elsif ( $code =~ /^002B31\w+\z/o ) { + $code = "002B31"; + } + } + $codes[$pos] = "$base:$code"; + } + elsif ( $item =~ m|(.+?)|o ) { +# my $pos = $1; +# my $base = $2; + my $code = $3; + if ( $code =~ /^[IVXL]+\z/o ) { + if ( $codes[$#codes] =~ /^\w:(([567]\d\d)-?\w+)\z/o ) { + $classe{$1} = "$2-$code"; + } + } + } + } + my %specifiques = (); + my %generiques = (); + foreach my $item (m|(.+?)|go) { + my $langue = ""; + my $numero = ""; + my $ig2 = ""; + my $motcle = ""; + my $type = ""; + my @nf = (); + if ( $item =~ m||o ) { + $langue = $3; + $numero = $1; + $ig2 = $2; + } + if ( $item =~ m|(.+?)|o ) { + $motcle = decode_entities($1); + $motcle =~ s/^\s+//o; + $motcle =~ s/\s+\z//o; + $motcle =~ s/\s\s+/ /go; + } + if ( $item =~ m|.+?|o ) { + next if $item =~ m|PAC|o; + my %tmp = (); + foreach my $nf ($item =~ m|(.+?)|go) { + next if $tmp{$nf} ++; + push(@nf, $nf); + } + } + if ( $item =~ m#(CD|INC)#io ) { + $type = $1; + } + $specifiques{$langue}{$numero}{$motcle}{'voc'} = $ig2; + if ( @nf ) { + push(@{$specifiques{$langue}{$numero}{$motcle}{'nf'}}, @nf); + } + if ( $type ) { + $specifiques{$langue}{$numero}{$motcle}{'type'} = $type; + } + } + foreach my $item (m|(.+?)|go) { + my $langue = ""; + my $numero = ""; + my $ig2 = ""; + my $motcle = ""; + my $type = ""; + my @nf = (); + if ( $item =~ m||o ) { + $langue = $3; + $numero = $1; + $ig2 = $2; + } + if ( $item =~ m|(.+?)|o ) { + $motcle = decode_entities($1); + $motcle =~ s/^\s+//o; + $motcle =~ s/\s+\z//o; + $motcle =~ s/\s\s+/ /go; + } + if ( $item =~ m|.+?|o ) { + my %tmp = (); + foreach my $nf ($item =~ m|(.+?)|go) { + next if $tmp{$nf} ++; + push(@nf, $nf); + } + } + if ( $item =~ m#(CD|INC)#io ) { + $type = $1; + } + $generiques{$langue}{$numero}{$motcle}{'voc'} = $ig2; + if ( @nf ) { + push(@{$generiques{$langue}{$numero}{$motcle}{'nf'}}, @nf); + } + if ( $type ) { + $generiques{$langue}{$numero}{$motcle}{'type'} = $type; + } + } + foreach my $istex (@{$match{$inist}}) { + my ($id, $ark, $doi, $pii, $pmid) = split(/\t/, $istex); + if ( $ready ) { + my $chemin = substr($id, 0, 1); + if ( not -d $chemin ) { + mkdir($chemin, 0755) or die "$!,"; + } + $chemin .= "/" . substr($id, 1, 1); + if ( not -d $chemin ) { + mkdir($chemin, 0755) or die "$!,"; + } + $chemin .= "/" . substr($id, 2, 1); + if ( not -d $chemin ) { + mkdir($chemin, 0755) or die "$!,"; + } + $chemin .= "/$id"; + if ( not -d $chemin ) { + mkdir($chemin, 0755) or die "$!,"; + } + open(TEI, ">:utf8", "$chemin/${id}_ipf.tei") or die "$!,"; + } + else { + open(TEI, ">:utf8", "${id}_ipf.tei") or die "$!,"; + } + print TEI $parties[1]; + my $marge = 24; + print TEI " " x $marge, "$id\n"; + print TEI " " x $marge, "$ark\n"; + print TEI " " x $marge, "$inist\n"; + if ( $doi ) { + print TEI " " x $marge, "$doi\n"; + } + if ( $pii ) { + print TEI " " x $marge, "$pii\n"; + } + if ( $pmid ) { + print TEI " " x $marge, "$pmid\n"; + } + print TEI $parties[3]; + if ( @codes ) { + my @tmp = grep(/:00[12]/o, @codes); + if ( @tmp ) { + @codes = @tmp; + } + (my $tmp = $parties[4]) =~ s/%YEAR%/$annee/; + print TEI $tmp; + my @lignes = codes('FR', @codes); + print TEI @lignes; + print TEI $parties[6]; + ($tmp = $parties[7]) =~ s/%YEAR%/$annee/; + print TEI $tmp; + @lignes = codes('EN', @codes); + print TEI @lignes; + print TEI $parties[9] + # En l’absence de verbalisations espagnoles, + # on ne fait pas (pour l’instant) les parties + # 10 à 12 + } + if ( defined $specifiques{'FRE'} or defined $generiques{'FRE'} ) { + my @lignes = (); + print TEI $parties[13]; + if ( defined $specifiques{'FRE'} ) { + @lignes = motscles('FRE', %specifiques); + print TEI @lignes; + } + if ( defined $generiques{'FRE'} ) { + @lignes = motscles('FRE', %generiques); + print TEI @lignes; + } + print TEI $parties[15] + } + if ( defined $specifiques{'ENG'} or defined $generiques{'ENG'} ) { + my @lignes = (); + print TEI $parties[16]; + if ( defined $specifiques{'ENG'} ) { + @lignes = motscles('ENG', %specifiques); + print TEI @lignes; + } + if ( defined $generiques{'ENG'} ) { + @lignes = motscles('ENG', %generiques); + print TEI @lignes; + } + print TEI $parties[18] + } + if ( defined $specifiques{'SPA'} or defined $generiques{'SPA'} ) { + my @lignes = (); + print TEI $parties[19]; + if ( defined $specifiques{'SPA'} ) { + @lignes = motscles('SPA', %specifiques); + print TEI @lignes; + } + if ( defined $generiques{'SPA'} ) { + @lignes = motscles('SPA', %generiques); + print TEI @lignes; + } + print TEI $parties[21] + } + print TEI $parties[22]; + close TEI; + } + } +close INP; + + +exit 0; + + +sub usage +{ +print STDERR "\n$usage\n"; + +exit shift; +} + +sub niveaux +{ +my $code = shift; + +my @niveaux = (); +$niveaux[0] = 0; + +if ( $code =~ /^2\d\d/o ) { + if ( $code =~ /^(((2\d\d)[A-Z])\d\d)/o ) { + $niveaux[1] = $3; + $niveaux[2] = $2; + $niveaux[3] = $1; + $niveaux[0] = 3; + } + elsif ( $code =~ /^((2\d\d)[A-Z])/o ) { + $niveaux[1] = $2; + $niveaux[2] = $1; + $niveaux[0] = 2; + } + elsif ( $code =~ /^(2\d\d)/o ) { + $niveaux[1] = $1; + $niveaux[0] = 1; + } + } +elsif ( $code =~ /^00[12][A-Z]/o ) { + if ( $code =~ /^(((((((00[12][A-Z])\d\d)[A-Z])\d\d)[A-Z])\d)[A-Z])/o ) { + $niveaux[1] = $7; + $niveaux[2] = $6; + $niveaux[3] = $5; + $niveaux[4] = $4; + $niveaux[5] = $3; + $niveaux[6] = $2; + $niveaux[7] = $1; + $niveaux[0] = 7; + } + elsif ( $code =~ /^((((((00[12][A-Z])\d\d)[A-Z])\d\d)[A-Z])\d)/o ) { + $niveaux[1] = $6; + $niveaux[2] = $5; + $niveaux[3] = $4; + $niveaux[4] = $3; + $niveaux[5] = $2; + $niveaux[6] = $1; + $niveaux[0] = 6; + } + elsif ( $code =~ /^(((((00[12][A-Z])\d\d)[A-Z])\d\d)[A-Z])/o ) { + $niveaux[1] = $5; + $niveaux[2] = $4; + $niveaux[3] = $3; + $niveaux[4] = $2; + $niveaux[5] = $1; + $niveaux[0] = 5; + } + elsif ( $code =~ /^((((00[12][A-Z])\d\d)[A-Z])\d\d)/o ) { + $niveaux[1] = $4; + $niveaux[2] = $3; + $niveaux[3] = $2; + $niveaux[4] = $1; + $niveaux[0] = 4; + } + elsif ( $code =~ /^(((00[12][A-Z])\d\d)[A-Z])/o ) { + $niveaux[1] = $3; + $niveaux[2] = $2; + $niveaux[3] = $1; + $niveaux[0] = 3; + } + elsif ( $code =~ /^((00[12][A-Z])\d\d)/o ) { + $niveaux[1] = $2; + $niveaux[2] = $1; + $niveaux[0] = 2; + } + elsif ( $code =~ /^(00[12][A-Z])/o ) { + $niveaux[1] = $1; + $niveaux[0] = 1; + } + } +elsif ( $code =~ /^([567]\d\d)/o) { + if ( $base{$code} ) { + $niveaux[1] = $base{$code}; + } + else { + $niveaux[1] = $1; + } + $niveaux[0] = 1; + if ( $classe{$code} ) { + $niveaux[2] = $classe{$code}; + $niveaux[0] = 2; + } + if ( $code ne $niveaux[1] ) { + push(@niveaux, $code); + $niveaux[0] ++; + } + } + +return @niveaux; +} + +sub codes +{ +my ($langue, @liste) = @_; + +my $ligne = ""; +my $marge = 16; +my @lignes = (); +my %dejaVu = (); +my %inconnu = (); + +# my $ligne = " " x $marge . "\n"; +# push(@lignes, $ligne); + +foreach my $item (@liste) { + next if not $item; + my ($code) = $item =~ /^.:(.+)/o; + my @niveaux = niveaux($code); + if ( $code =~ /^00[12]/o ) { + for ( my $nb = $niveaux[0] ; $nb > 0 ; $nb -- ) { + my $cc = $niveaux[$nb]; + if ( not defined $equiv{'Pascal'}{$annee}{$cc} and + not defined $verb{$cc} ) { + if ( defined $canon{'Pascal'}{$cc} ) { + my @canon = keys %{$canon{'Pascal'}{$cc}}; + if ( $#canon == 0 ) { + $equiv{'Pascal'}{$annee}{$cc} = $canon[0]; + } + else { + my @items = sort {$canon{'Pascal'}{$b} <=> $canon{'Pascal'}{$a}} @canon; + $equiv{'Pascal'}{$annee}{$cc} = $items[0]; + } + } + else { + $inconnu{$cc} ++; + if ( $nb == $niveaux[0] ) { + $niveaux[0] --; + } + } + } + } + + } + next if $niveaux[0] < 1; + next if $dejaVu{$niveaux[$niveaux[0]]}; + $dejaVu{$niveaux[$niveaux[0]]} ++; + $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + $marge += 4; + for ( my $nb = 1 ; $nb <= $niveaux[0] ; $nb ++ ) { + my $cc = $niveaux[$nb]; + $ligne = " " x $marge . "$verb{$equiv}{$langue}\n"; + push(@lignes, $ligne); + $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + } + elsif ( $equiv{'Francis'}{$annee}{$cc} and + $equiv{'Francis'}{$annee}{$cc} ne $cc and + defined $verb{$equiv{'Francis'}{$annee}{$cc}}{$langue} ) { + my $equiv = $equiv{'Francis'}{$annee}{$cc}; + if ( $lodex{$equiv} ) { + $ligne .= " ref=\"$lodex{$equiv}\""; + } + $ligne .= ">\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 4) . "$verb{$equiv}{$langue}\n"; + push(@lignes, $ligne); + $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + } + elsif ( defined $verb{$cc}{$langue} ) { + if ( $lodex{$cc} ) { + $ligne .= " ref=\"$lodex{$cc}\""; + } + $ligne .= ">\n"; + push(@lignes, $ligne); + if ( $inconnu{$cc} ) { + if ( $langue eq 'FR' ) { + $ligne = " " x ($marge + 4) . "Intitulé absent\n"; + } + elsif ( $langue eq 'EN' ) { + $ligne = " " x ($marge + 4) . "Term not available\n"; + } + } + else { + $ligne = " " x ($marge + 4) . "$verb{$cc}{$langue}\n"; + } + push(@lignes, $ligne); + $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + } + else { + print STDERR "Notice \"$inist\" : pas de verbalisation pour \"$cc\"\n"; + } + } + $marge -= 4; + push(@lignes, " " x $marge . "\n"); + } + +return @lignes +} + +sub motscles +{ +my ($langue, %motscles) = @_; + +my $marge = 20; +my @lignes = (); + +my $rnf = \%nff; +if ( $langue eq 'ENG' ) { + $rnf = \%nfe; + } + +foreach $num (sort {$a <=> $b} keys %{$motscles{$langue}}) { + my ($motcle) = keys %{$motscles{$langue}{$num}}; + my $type = undef; + my @nf = (); + if ( defined $motscles{$langue}{$num}{$motcle}{'type'} ) { + $type = $motscles{$langue}{$num}{$motcle}{'type'}; + } + if ( defined $motscles{$langue}{$num}{$motcle}{'nf'} ) { + @nf = @{$motscles{$langue}{$num}{$motcle}{'nf'}}; + } + my $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + $ligne = " " x $marge . " $motcle\n"; + push(@lignes, $ligne); + if ( @nf or $type ) { + $marge += 4; +# my $ligne = " " x $marge . "\n"; +# push(@lignes, $ligne); + foreach my $nf (@nf) { + my $intitule = $nf; + if ( defined $rnf->{$nf} ) { + $intitule = $rnf->{$nf}; + } + elsif ( defined $rnf->{lc($nf)} ) { + $intitule = $rnf->{lc($nf)}; + } + elsif ( $nf =~ /^[lqt][1-9]\z/io ) { + next; + } + else { + print STDERR "Notice \"$inist\" : NF \"$nf\" pour "; + print STDERR "mot-clé \"$motcle\"\n"; + next; + } + my $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 4) . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 8) . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 4) . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 4) . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 8) . "$intitule\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 4) . "\n"; + push(@lignes, $ligne); + $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + } + if ( $type ) { + my $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 4) . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 8) . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 4) . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 4) . "\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 8) . "$rnf->{$type}\n"; + push(@lignes, $ligne); + $ligne = " " x ($marge + 4) . "\n"; + push(@lignes, $ligne); + $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + } +# $ligne = " " x $marge . "\n"; +# push(@lignes, $ligne); + $marge -= 4; + } + $ligne = " " x $marge . "\n"; + push(@lignes, $ligne); + } + +return @lignes; +} + + + +__DATA__ + +%% partie 1 + + + + + + + + Alignement Pascal/Francis — ISTEX + + enrichissement INIST-CNRS + INIST-CNRS + + + + ISTEX + + +

L’élément standOff de ce document est distribué sous licence + Creative Commons 4.0 non transposée (CC BY 4.0)

+

Ce standOff a été créé dans le cadre du projet ISTEX – Initiative + d’Excellence en Information Scientifique et Technique

+
+
+
+ + + +%% partie 2 # + + D6F3AA7EDD75E1885C4BB439D518F98194569F42 + 95-0599541 + 6340873 + +%% partie 3 + + + +
+ + + + + + + + + Alignement Pascal/Francis — Istex + +
+ +%% partie 4 + + + %YEAR% + + +%% partie 5 # + + + SCIENCES APPLIQUEES. + BATIMENT. TRAVAUX PUBLICS. + Matériaux + + + SCIENCES APPLIQUEES. + INDUSTRIE DES POLYMERES, PEINTURES, BOIS. + Technologie des polymères + + + SCIENCES APPLIQUEES. + INDUSTRIES CHIMIQUE ET PARACHIMIQUE. + Matériaux de construction. Céramique. Verres. + Verres. + Structure, analyse, propriétés + + + SCIENCES APPLIQUEES. + BATIMENT. TRAVAUX PUBLICS. + Calcul des constructions. Sollicitations. + Résistance des matériaux (élasticité, plasticité, flambage, etc.) + + + BATIMENT. TRAVAUX PUBLICS. + + +%% partie 6 + + + + +%% partie 7 + + + %YEAR% + + +%% partie 8 # + + + APPLIED SCIENCES. + BUILDINGS. PUBLIC WORKS. + Materials + + + APPLIED SCIENCES. + POLYMER INDUSTRY, PAINTS, WOOD. + Technology of polymers. + + + APPLIED SCIENCES. + CHEMICAL INDUSTRY AND CHEMICALS. + Building materials. Ceramics. Glasses. + Glasses. + Structure, analysis, properties + + + APPLIED SCIENCES. + BUILDINGS. PUBLIC WORKS. + Structural analysis. Stresses. + Strength of materials (elasticity, plasticity, buckling, etc.) + + + BUILDINGS. PUBLIC WORKS. + + +%% partie 9 + + + + +%% partie 10 + + + + +%% partie 11 # + + + CIENCIAS APLICADAS. + Edificios; Obras publicas + Materiales + + + CIENCIAS APLICADAS. + INDUSTRIA DEL POLIMERO, PINTURAS, MADERAS. + Tecnologia de los polimeros + + + CIENCIAS APLICADAS. + Quimica. Ciencia de los materiales. + Materiales de construcción. Cerámica. Vidrio. + Vidrio + Estructura, análisis, propiedades. + + + CIENCIAS APLICADAS. + Edificios; Obras publicas + Calculo de las construcciones; Solicitaciones + Resistencia de los materiales (elasticidad, plasticidad, pandeo, etc.) + + + Edificios; Obras publicas + + +%% partie 12 + + + + +%% partie 13 + + + + + +%% partie 14 + + + Composite hybride + + + + + + Candidat descripteur + + + + Experience + + Ethylène polymère + + + + + + Composé chimique + + + + Fibre verre + +%% partie 15 + + + +# + + +%% partie 16 + + + + + +%% partie 17 # + + + Hybrid composites + + + + + + Descriptor candidate + + + + + Flexural modulus + + + + + + Uncontrolled index + + + + + Interlaminar shear strength + + + + + + Uncontrolled index + + + + Experiments + + Polyethylenes + + + + + + Chemical compound + + + + Glass fibers + +%% partie 18 + + + + + +%% partie 19 + + + + + +%% partie 20 # + + + Compuesto hibrido + + + + + + Candidato descriptor + + + + Experiencia + + Etileno polímero + + + + + + Compuesto quimico + + + + Fibra vidrio + +%% partie 21 + + + + + +%% partie 22 + +
+
+ +%% FIN + +## +## Liste d’entités caractères +## + +## +## NE PAS MODIFIER ! +## +## DO NOT EDIT! +## + +33 excl +34 dquot +35 num +36 dollar +37 percnt +40 lpar +41 rpar +42 ast +43 plus +44 comma +45 hyphen +46 period +47 sol +58 colon +59 semi +61 equals +63 quest +64 commat +91 lsqb +92 bsol +93 rsqb +95 lowbar +96 grave +123 lcub +124 verbar +125 rcub +256 Amacr; +257 amacr; +258 Abreve; +259 abreve; +260 Aogon; +261 aogon; +262 Cacute; +263 cacute; +264 Ccirc; +265 ccirc; +266 Cdot; +267 cdot; +268 Ccaron; +269 ccaron; +270 Dcaron; +271 dcaron; +272 Dstrok; +273 dstrok; +274 Emacr; +275 emacr; +278 Edot; +279 edot; +280 Eogon; +281 eogon; +282 Ecaron; +283 ecaron; +284 Gcirc; +285 gcirc; +286 Gbreve; +287 gbreve; +288 Gdot; +289 gdot; +290 Gcedil; +291 gcedil; +292 Hcirc; +293 hcirc; +294 Hstrok; +295 hstrok; +296 Itilde; +297 itilde; +298 Imacr; +299 imacr; +302 Iogon; +303 iogon; +304 Idot; +305 inodot; +306 IJlig; +307 ijlig; +308 Jcirc; +309 jcirc; +310 Kcedil; +311 kcedil; +312 kgreen; +313 Lacute; +314 lacute; +315 Lcedil; +316 lcedil; +317 Lcaron; +318 lcaron; +319 Lmidot; +320 lmidot; +321 Lstrok; +322 lstrok; +323 Nacute; +324 nacute; +325 Ncedil; +326 ncedil; +327 Ncaron; +328 ncaron; +329 napos; +330 ENG; +331 eng; +332 Omacr; +333 omacr; +336 Odblac; +337 odblac; +340 Racute; +341 racute; +342 Rcedil; +343 rcedil; +344 Rcaron; +345 rcaron; +346 Sacute; +347 sacute; +348 Scirc; +349 scirc; +350 Scedil; +351 scedil; +354 Tcedil; +355 tcedil; +356 Tcaron; +357 tcaron; +358 Tstrok; +359 tstrok; +360 Utilde; +361 utilde; +362 Umacr; +363 umacr; +364 Ubreve; +365 ubreve; +366 Uring; +367 uring; +368 Udblac; +369 udblac; +370 Uogon; +371 uogon; +372 Wcirc; +373 wcirc; +374 Ycirc; +375 ycirc; +377 Zacute; +378 zacute; +379 Zdot; +380 zdot; +381 Zcaron; +382 zcaron; +501 gacute; +711 caron; +728 breve; +729 dot; +730 ring; +731 ogon; +733 dblac; +902 Aacgr; +904 Eacgr; +905 EEacgr; +906 Iacgr; +908 Oacgr; +910 Uacgr; +911 OHacgr; +912 idiagr; +913 Agr; +914 Bgr; +915 Ggr; +916 Dgr; +917 Egr; +918 Zgr; +919 EEgr; +920 THgr; +921 Igr; +922 Kgr; +923 Lgr; +924 Mgr; +925 Ngr; +926 Xgr; +927 Ogr; +928 Pgr; +929 Rgr; +931 Sgr; +932 Tgr; +933 Ugr; +934 PHgr; +935 KHgr; +936 PSgr; +937 OHgr; +938 Idigr; +939 Udigr; +940 aacgr; +941 eacgr; +942 eeacgr; +943 iacgr; +944 udiagr; +945 agr; +946 bgr; +947 ggr; +948 dgr; +949 egr; +950 zgr; +951 eegr; +952 thgr; +953 igr; +954 kgr; +955 lgr; +956 mgr; +957 ngr; +958 xgr; +959 ogr; +960 pgr; +961 rgr; +962 sfgr; +963 sgr; +964 tgr; +965 ugr; +966 phgr; +967 khgr; +968 psgr; +969 ohgr; +970 idigr; +971 udigr; +972 oacgr; +973 uacgr; +974 ohacgr; +977 thetav; +981 phiv; +988 gammad; +1008 kappav; +1009 rhov; +1025 IOcy; +1026 DJcy; +1027 GJcy; +1028 Jukcy; +1029 DScy; +1030 Iukcy; +1031 YIcy; +1032 Jsercy; +1033 LJcy; +1034 NJcy; +1035 TSHcy; +1036 KJcy; +1038 Ubrcy; +1039 DZcy; +1040 Acy; +1041 Bcy; +1042 Vcy; +1043 Gcy; +1044 Dcy; +1045 IEcy; +1046 ZHcy; +1047 Zcy; +1048 Icy; +1049 Jcy; +1050 Kcy; +1051 Lcy; +1052 Mcy; +1053 Ncy; +1054 Ocy; +1055 Pcy; +1056 Rcy; +1057 Scy; +1058 Tcy; +1059 Ucy; +1060 Fcy; +1061 KHcy; +1062 TScy; +1063 CHcy; +1064 SHcy; +1065 SHCHcy; +1066 HARDcy; +1067 Ycy; +1068 SOFTcy; +1069 Ecy; +1070 YUcy; +1071 YAcy; +1072 acy; +1073 bcy; +1074 vcy; +1075 gcy; +1076 dcy; +1077 iecy; +1078 zhcy; +1079 zcy; +1080 icy; +1081 jcy; +1082 kcy; +1083 lcy; +1084 mcy; +1085 ncy; +1086 ocy; +1087 pcy; +1088 rcy; +1089 scy; +1090 tcy; +1091 ucy; +1092 fcy; +1093 khcy; +1094 tscy; +1095 chcy; +1096 shcy; +1097 shchcy; +1098 hardcy; +1099 ycy; +1100 softcy; +1101 ecy; +1102 yucy; +1103 yacy; +1105 iocy; +1106 djcy; +1107 gjcy; +1108 jukcy; +1109 dscy; +1110 iukcy; +1111 yicy; +1112 jsercy; +1113 ljcy; +1114 njcy; +1115 tshcy; +1116 kjcy; +1118 ubrcy; +1119 dzcy; +8196 emsp13; +8197 emsp14; +8199 numsp; +8200 puncsp; +8202 hairsp; +8208 dash; +8211 ndash; +8212 mdash; +8213 horbar; +8214 Verbar; +8229 nldr; +8244 tprime; +8245 bprime; +8257 caret; +8259 hybull; +8411 tdot; +8412 DotDot; +8453 incare; +8459 hamilt; +8463 planck; +8466 lagran; +8467 ell; +8470 numero; +8471 copysr; +8478 rx; +8486 ohm; +8491 angst; +8492 bernou; +8499 phmmat; +8500 order; +8502 beth; +8503 gimel; +8504 daleth; +8531 frac13; +8532 frac23; +8533 frac15; +8534 frac25; +8535 frac35; +8536 frac45; +8537 frac16; +8538 frac56; +8539 frac18; +8540 frac38; +8541 frac58; +8542 frac78; +8597 varr; +8598 nwarr; +8599 nearr; +8600 drarr; +8601 dlarr; +8602 nlarr; +8603 nrarr; +8605 rarrw; +8606 Larr; +8608 Rarr; +8610 larrtl; +8611 rarrtl; +8614 map; +8617 larrhk; +8618 rarrhk; +8619 larrlp; +8620 rarrlp; +8621 harrw; +8622 nharr; +8624 lsh; +8625 rsh; +8630 cularr; +8631 curarr; +8634 olarr; +8635 orarr; +8636 lharu; +8637 lhard; +8638 uharr; +8639 uharl; +8640 rharu; +8641 rhard; +8642 dharr; +8643 dharl; +8644 rlarr2; +8646 lrarr2; +8647 larr2; +8648 uarr2; +8649 rarr2; +8650 darr2; +8651 lrhar2; +8652 rlhar2; +8653 nlArr; +8654 nhArr; +8655 nrArr; +8661 vArr; +8666 lAarr; +8667 rAarr; +8705 comp; +8708 nexist; +8714 epsis; +8717 bepsi; +8720 coprod; +8722 minus; +8723 mnplus; +8724 plusdo; +8726 setmn; +8728 compfn; +8735 ang90; +8737 angmsd; +8738 angsph; +8739 mid; +8740 nmid; +8741 par; +8742 npar; +8750 conint; +8757 becaus; +8765 bsim; +8768 wreath; +8769 nsim; +8771 sime; +8772 nsime; +8775 ncong; +8777 nap; +8778 ape; +8780 bcong; +8782 bump; +8783 bumpe; +8784 esdot; +8785 eDot; +8786 efDot; +8787 erDot; +8788 colone; +8789 ecolon; +8790 ecir; +8791 cire; +8793 wedgeq; +8796 trie; +8802 nequiv; +8806 lE; +8807 gE; +8808 lne; +8809 gne; +8810 Lt; +8811 Gt; +8812 twixt; +8814 nlt; +8815 ngt; +8816 nle; +8817 nge; +8818 lsim; +8819 gsim; +8822 lg; +8823 gl; +8826 pr; +8827 sc; +8828 pre; +8829 sce; +8830 prsim; +8831 scsim; +8832 npr; +8833 nsc; +8837 nsup; +8840 nsube; +8841 nsupe; +8842 subne; +8843 supne; +8846 uplus; +8847 sqsub; +8848 sqsup; +8849 sqsube; +8850 sqsupe; +8851 sqcap; +8852 sqcup; +8854 ominus; +8856 osol; +8857 odot; +8858 ocir; +8859 oast; +8861 odash; +8862 plusb; +8863 minusb; +8864 timesb; +8865 sdotb; +8866 vdash; +8867 dashv; +8868 top; +8871 models; +8872 vDash; +8873 Vdash; +8874 Vvdash; +8876 nvdash; +8877 nvDash; +8878 nVdash; +8879 nVDash; +8882 vltri; +8883 vrtri; +8884 ltrie; +8885 rtrie; +8888 mumap; +8890 intcal; +8891 veebar; +8892 barwed; +8900 diam; +8902 sstarf; +8903 divonx; +8904 bowtie; +8905 ltimes; +8906 rtimes; +8907 lthree; +8908 rthree; +8909 bsime; +8910 cuvee; +8911 cuwed; +8912 Sub; +8913 Sup; +8914 Cap; +8915 Cup; +8916 fork; +8918 ldot; +8919 gsdot; +8920 Ll; +8921 Gg; +8922 leg; +8923 gel; +8924 els; +8925 egs; +8926 cuepr; +8927 cuesc; +8928 npre; +8929 nsce; +8934 lnsim; +8935 gnsim; +8936 prnsim; +8937 scnsim; +8938 nltri; +8939 nrtri; +8940 nltrie; +8941 nrtrie; +8942 vellip; +8966 Barwed; +8972 drcrop; +8973 dlcrop; +8974 urcrop; +8975 ulcrop; +8981 telrec; +8982 target; +8988 ulcorn; +8989 urcorn; +8990 dlcorn; +8991 drcorn; +8994 frown; +8995 smile; +9251 blank; +9416 oS; +9472 boxh; +9474 boxv; +9484 boxdr; +9488 boxdl; +9492 boxur; +9496 boxul; +9500 boxvr; +9508 boxvl; +9516 boxhd; +9524 boxhu; +9532 boxvh; +9552 boxH; +9553 boxV; +9554 boxdR; +9555 boxDr; +9556 boxDR; +9557 boxdL; +9558 boxDl; +9559 boxDL; +9560 boxuR; +9561 boxUr; +9562 boxUR; +9563 boxuL; +9564 boxUl; +9565 boxUL; +9566 boxvR; +9567 boxVr; +9568 boxVR; +9569 boxvL; +9570 boxVl; +9571 boxVL; +9572 boxHd; +9573 boxhD; +9574 boxHD; +9575 boxHu; +9576 boxhU; +9577 boxHU; +9578 boxvH; +9579 boxVh; +9580 boxVH; +9600 uhblk; +9604 lhblk; +9608 block; +9617 blk14; +9618 blk12; +9619 blk34; +9633 squ; +9642 squf; +9645 rect; +9646 marker; +9651 xutri; +9652 utrif; +9653 utri; +9656 rtrif; +9657 rtri; +9661 xdtri; +9662 dtrif; +9663 dtri; +9666 ltrif; +9667 ltri; +9675 cir; +9733 starf; +9734 star; +9742 phone; +9792 female; +9794 male; +9834 sung; +9837 flat; +9838 natur; +9839 sharp; +10003 check; +10007 cross; +10016 malt; +10022 lozf; +10038 sext; +64256 fflig; +64257 filig; +64258 fllig; +64259 ffilig; + +## +## The End! +##