#!/usr/bin/perl
# Déclaration des pragmas
use strict;
use utf8;
use open qw/:std :utf8/;
## use warnings::unused;
## use warnings 'once';
# Appel des modules externes de base
use Encode;
use Getopt::Long;
# Appel des modules spécifiques à l'application
use HTML::Entities qw(decode_entities %entity2char);
use HTTP::CookieJar::LWP;
use JSON;
use LWP::UserAgent;
use Number::Convert::Roman;
use Text::Unidecode;
use URI::Encode qw(uri_encode uri_decode);
my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
my $substitut = " " x length($programme);
my $version = "15.3.1";
my $dateModif = "4 Octobre 2021";
my $usage = "Usage : \n" .
" $programme -f (fichier|-) [ -d ] [ -v nombre ] [ -r id ] \n" .
" $substitut [ -n notices ] [ -c corpus[,corpus]* ]* \n" .
" $programme -h fichier_HFD [ -d ] [ -v nombre ] [ -r id ] \n" .
" $substitut [ -n notices ] [ -c corpus[,corpus]* ]* \n" .
" $programme -i \n";
my $debug = undef;
my $fichier = undef;
my $hfd = undef;
my $info = undef;
my $notices = undef;
my $reprise = undef;
my $verbeux = undef;
my @corpus = ();
eval {
$SIG{__WARN__} = sub {usage(1);};
GetOptions(
"corpus=s" => \@corpus,
"notices=s" => \$notices,
"info" => \$info,
"debug" => \$debug,
"fichier=s" => \$fichier,
"hfd=s" => \$hfd,
"reprise=s" => \$reprise,
"verbeux=i" => \$verbeux,
);
};
$SIG{__WARN__} = sub {warn $_[0];};
if ( $info ) {
print "Programme : \n";
print " “$programme”, version $version ($dateModif)\n";
print " Permet de chercher les correspondances entre les notices bibliographiques des bases Pascal \n";
print " et Francis de l’INIST et les documents de la base ISTEX. \n";
print "\n";
print $usage;
print "\nOptions : \n";
print " -c indique le nom du ou des corpus de la base ISTEX à interroger (on peut soit répéter \n";
print " cette option, soit mettre tous les noms de corpus séparés par des virgules, mais sans \n";
print " espace) \n";
print " -d active le mode “débogage” \n";
print " -f indique le nom du fichier d’entrée (qui peut être un fichier compressé avec “gzip” ou \n";
print " “bzip2”). Pour utiliser l’entrée standard, mettre un tiret “-” \n";
print " comme argument \n";
print " -h indique le nom du fichier HFD servant d’entrée au programme \n";
print " -i affiche cette aide. \n";
print " -n indique le nom du fichier contenant les notices Pascal ou Francis modifiées parce \n";
print " que les notices originales provoquaient une erreur de syntaxe dans la requête à l’API \n";
print " ISTEX (N.B. : l’absence du fichier indiqué n’entraîne pas l’arrêt du programme) \n";
print " -r indique le numéro de la dernière notice Pascal ou Francis traitée précédemment, pour \n";
print " permettre la reprise de l’alignement si celui-ci a été prématurément arrêté \n";
print " -v permet de suivre la progression du traitement en affichant sur la sortie erreur l’heure \n";
print " de début, l’heure de fin et l’heure chaque fois qu’un lot de notices, correspondant au \n";
print " nombre de notices donné en argument, a été traité \n";
print " \n";
print "N.B. : pour ne pas passer par le proxy sur le réseau interne de l’INIST, il faut effacer les \n";
print " variables globales du proxy par la commande “unset http_proxy https_proxy no_proxy” \n";
print " avant de lancer le programme “$programme”. \n";
print " Également, l’absence du fichier donné par l’option “-n” n’entraîne pas l’arrêt du \n";
print " programme, seulement un message d’erreur. \n";
print " \n";
exit 0;
}
usage(2) if not $fichier and not $hfd;
usage(2) if $fichier and $hfd;
if ( @corpus ) {
@corpus = split(/,/, join(",", @corpus));
}
if ( $verbeux ) {
if ( $verbeux < 0 ) {
print STDERR "\n$programme : l’option “-v” doit être un entier positif !\n";
exit(3);
}
}
# Complétion de la table des entitées HTML
while(<DATA>) {
next if /^\s*$/o or /^#/o;
chomp;
my ($num, $sgml) = split(/\t+/);
next if $entity2char{$sgml};
$entity2char{$sgml} = chr($num);
}
close DATA;
# Paramètres de l'API ISTEX
my $base = "https://api.istex.fr";
my $url = "$base/document/?q=";
my $out = "output=author,copyrightDate,doi,host,pii,pmid,publicationDate,serie,title";
my $size = 100;
my $echec = 0;
# Initialisation de l'agent
my $agent = LWP::UserAgent->new(
cookie_jar => HTTP::CookieJar::LWP->new,
);
$agent->agent("$programme/$version");
$agent->default_header("Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
$agent->default_header("Accept-Language" => "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3");
$agent->default_header("Accept-Encoding" => "gzip, deflate");
$agent->default_header("Connection" => "keep-alive");
# Allongement du temps d'attente
$agent->timeout(300);
$agent->env_proxy;
# Déclaration des variables concernant la notice à chercher
my $date = "";
my $fascicule = "";
my $isbn = "";
my $issn = "";
my $journal = "";
my $label = "";
my $livre = "";
my $nivbib = ""; # Niveau bibliographique (A, M ou C);
my $nom1 = ""; # Nom du premier auteur
my $num = "";
my $pagedebut = "";
my $pagefin = "";
my $prenom1 = ""; # Prénom du premier auteur
my $titre = "";
my $volume = "";
my @autres = (); # Auteurs (autres que le premier auteur)
my @isbns = ();
my @issns = ();
my @journaux = ();
my @langues = ();
my @livres = ();
my @pages = ();
my @titres = ();
my %modifs = ();
my %notice = ();
my %pages = ();
my %pb = ();
my %rdvfp = ();
my %source = ();
# Déclaration d'un hachage général pour éviter de répéter les
# messages plusieurs fois pour la même notice
my @alertes = ();
my %alerte = ();
# Nombre de notices traitées
my $notice = 0;
# Initialisation du convertisseur de chiffres romains
my $convert = Number::Convert::Roman->new;
# Sortie sans “buffer”
select STDERR;
$| = 1;
select STDOUT;
$| = 1;
# Récupération des notices alternatives du fichier indiqué
# par l’option “- a”
if ( $notices ) {
if ( -f $notices ) {
open(INP, "<:raw", $notices) or die "$!,";
while(<INP>) {
next if /^\s*$/o or /^#/o;
($num) = m|<fA47 dir=\d\d[12]000><s0>(.+?)</s0>|o;
$notice{$num} = $_;
}
close INP;
if ( $verbeux ) {
print STDERR "*** Fichier de notices corrigées : $notices *** \n";
}
}
elsif ( $verbeux ) {
print STDERR "*** Fichier de notices corrigées “$notices” absent *** \n";
}
}
if ( defined $fichier ) {
if ( $fichier eq '-' ) {
open(INP, "<&STDIN") or die "$!,";
binmode(INP, ":raw");
}
elsif ( $fichier =~ /\.gz\z/o ) {
open(INP, "gzip -cd $fichier |") or die "$!,";
binmode(INP, ":raw");
}
elsif ( $fichier =~ /\.bz2\z/o ) {
open(INP, "bzip2 -cd $fichier |") or die "$!,";
binmode(INP, ":raw");
}
else {
open(INP, "<:raw", $fichier) or die "$!,";
}
}
elsif ( defined $hfd ) {
open(INP, "IhfdCat $hfd |") or die "$!,";
binmode(INP, ":raw");
}
if ( $verbeux ) {
print STDERR "==> $programme, version $version ($dateModif) <== \n";
if ( $reprise ) {
print STDERR "*** Reprise du traitemment après la notice \"$reprise\" *** \n";
}
else {
if ( defined $hfd ) {
print STDERR "*** Notices traitées : $hfd *** \n";
}
elsif ( defined $fichier ) {
if ( $fichier eq '-' ) {
print STDERR "*** Notices traitées : entrée standard *** \n";
}
else {
print STDERR "*** Notices traitées : $fichier *** \n";
}
}
my $moment = date();
print STDERR " -> $moment : début\n";
}
}
if ( defined $reprise ) {
while(<INP>) {
next if /^\s*$/o;
# Recherche du numero de notice.
($num) = m|<fA47 dir=\d\d[12]000><s0>(.+?)</s0>|o;
$notice ++;
if ( $num eq $reprise ) {
if ( $verbeux ) {
my $moment = date();
print STDERR " -> $moment : reprise = $notice notices\n";
}
last;
}
}
}
# Modification de l’URL de base si la requête est restreinte à un
# (ou plusieurs) corpus plutôt qu’à l’ensemble de la base ISTEX
if ( @corpus ) {
if ( $#corpus == 0 ) {
$url .= "corpusName:$corpus[0]+AND+"
}
else {
$url .= "corpusName:(" . join("+", @corpus) . ")+AND+"
}
}
while(<INP>) {
next if /^\s*$/o;
next if /^#/o;
# Message pour chaque "$verbeux" notices traitées
$notice ++;
if ( $verbeux and $notice % $verbeux == 0 ) {
my $moment = date();
print STDERR " -> $moment : $notice notices\n";
}
# Remise à zéro des alertes
%alerte = ();
# Recherche du niveau bibbliographique.
($nivbib) = m|<fA61 dir=000000><s0>([AMC])</s0>|o;
# Recherche du numero de notice.
($num) = m|<fA47 dir=\d\d[12]000><s0>(.+?)</s0>|o;
if ( not $num ) {
print STDERR "pas de numéro INIST pour la notice #$notice\n";
next;
}
if ( $notice{$num} ) {
$_ = $notice{$num};
}
# Prétraitement de la notice avant découpe
chomp;
s/\r//o;
s|</record>||o;
# Problème de l'entité "&dquot;" et des trémas
s/&dquot;/"/go;
s/&(\w)\.die;/&${1}uml;/go;
s/&Lstok;/Ł/go;
# Découpe des champs.
my @champs = split(/<f/);
# Recherche de la langue du document.
@langues = ();
foreach my $champ (grep(/^A23 /, @champs)) {
my ($langue) = $champ =~ m|<s0>(.+?)</s0>|o;
push(@langues, $langue);
}
# Recherche du titre en fonction du niveau bib.
$titre = "";
if ( $nivbib eq "A" ) {
my @titres = grep(/^A[06]8 /, @champs);
($titre) = $titres[0] =~ m|<s1>(.+?)</s1>|o;
}
elsif ( $nivbib eq "M" ) {
my @titres = grep(/^A[06]9 /, @champs);
($titre) = $titres[0] =~ m|<s1>(.+?)</s1>|o;
}
elsif ( $nivbib eq "C" ) {
my @titres = grep(/^A[17]0 /, @champs);
($titre) = $titres[0] =~ m|<s1>(.+?)</s1>|o;
}
else {
print STDERR "Attention : pas de titre pour notice n° $num, ligne $.\n";
}
# On pense à mettre le titre INIST en UTF-8, mais une seul fois
$titre = decode_entities($titre);
# Recherche du titre de journal.
my @tmp = ();
my %tmp = ();
$journal = "";
@journaux = grep(/^A64 /, @champs);
if ( $#journaux == -1 ) {
@journaux = grep(/^A03 /, @champs);
}
foreach my $item (@journaux) {
($journal) = $item =~ m|<s0>(.+?)</s0>|o;
push(@tmp, $journal);
}
@journaux = grep(/./, @tmp);
foreach my $item (@journaux) {
if ( $item =~ /^([A-Z]{3,})\. (\p{isUpper}(\p{isAlpha}|&\w+;)+.*)/o ) {
push(@journaux, $1);
push(@journaux, $2);
}
$item =~ s/ : .*//o;
$item =~ s/^\s*\(([^)]+)\) +/$1 /o;
$item =~ s/^(.+) +\(([^)]+)\)\s*\z/$2 $1/o;
if ( $tmp{$item} ) {
$item = "";
}
else {
$tmp{$item} ++;
}
}
@journaux = grep(/./, @journaux);
if ( $#journaux >= 0 ) {
$journal = $journaux[0];
}
# Recherche du titre du livre.
%tmp = ();
$livre = "";
@livres = grep(/^A09 /, @champs);
foreach my $item (@livres) {
($livre) = $item =~ m|<s1>(.+?)</s1>|o;
if ( $tmp{$livre} ) {
$item = "";
}
else {
$item = $livre;
$tmp{$livre} ++;
}
}
@livres = grep(/./, @livres);
if ( $#livres >= 0 ) {
$livre = $livres[0];
}
# Recherche de l'ISSN.
%tmp = ();
$issn = "";
@issns = grep(/^A01 /, @champs);
foreach my $item (@issns) {
($issn) = $item =~ m|<s0>(.+?)</s0>|o;
if ( $issn =~ /^(\d\d\d\d)[^-](\d\d\d[0-9Xx])\z/o ) {
$issn = "$1-$2";
}
elsif ( $issn =~ /^(\d\d\d\d)/(\d\d\d[0-9Xx])\z/o ) {
$issn = "$1-$2";
}
if ( $tmp{$issn} ) {
$item = "";
}
else {
$item = $issn;
$tmp{$issn} ++;
}
}
@issns = grep(/./, @issns);
if ( $#issns >= 0 ) {
$issn = $issns[0];
}
# Recherche de l'ISBN.
%tmp = ();
$isbn = "";
@isbns = grep(/^A26 /, @champs);
foreach my $item (@isbns) {
($isbn) = $item =~ m|<s0>(.+?)</s0>|o;
if ( $tmp{$isbn} ) {
$item = "";
}
else {
$item = $isbn;
$tmp{$isbn} ++;
}
}
@isbns = grep(/./, @isbns);
if ( $#isbns >= 0 ) {
$isbn = $isbns[0];
}
# Recherche de la date de publication
if ( m|<fA21 dir=000000><s1>(.+?)</s1>|o ) {
my $tmp = $1;
if ( $tmp =~ /^[12]\d\d\d/ ) {
$date = substr($tmp, 0, 4);
}
else {
$date = undef;
}
}
# Recherche du numéro de volume
$volume = "";
if ( m|<fA05 dir=000000><s2>(.+?)</s2></fA05>|o ) {
$volume = $1;
$volume =~ s/^\s+//o;
$volume =~ s/\s+\z//o;
$volume =~ s|/|/|go;
if ( $volume =~ m|^(\d+)[-/](\d+)\z|o ) {
my $nb1 = $1;
my $nb2 = $2;
if ( $nb1 > $nb2 and length($nb1) > length($nb2) ) {
my $diff = length($nb1) - length($nb2);
$nb2 = substr($nb1, 0, $diff) . $nb2;
$volume = "$nb1-$nb2";
}
}
}
# Recherche du numéro de fascicule
$fascicule = "";
if ( m|<fA06 dir=000000><s2>(.+?)</s2>.*</fA06>|o ) {
$fascicule = $1;
$fascicule =~ s/^\s+//o;
$fascicule =~ s/\s+\z//o;
$fascicule =~ s/&(horbar|mdash|minus);/-/go;
$fascicule =~ s/ *, */-/go;
$fascicule =~ s/-\s*\z//o;
$fascicule =~ s|/|/|go;
}
# Recherche des numéros de page
@pages = ();
($pagedebut, $pagefin) = ("", "");
if ( m|<fA20 dir=000000>(.+?)</fA20>|o ) {
my $champ = $1;
my $pages = "";
if ( $champ =~ m|<s1>(.+?)</s1>|o ) {
$pages = $1;
}
elsif ( $champ =~ m|<s2>(.+?)</s2>|o ) {
$pages = $1;
}
if ( $pages !~ /^\s*\z/o ) {
$pages =~ s/•/./go;
$pages =~ s/&(horbar|mdash|minus);/-/go;
$pages =~ s/ / /go;
$pages =~ s/^ +//o;
$pages =~ s|/|/|go;
if ( $pages =~ /^(&\w+gr; )?vol\. *\S+?, */io or
$pages =~ /^(&\w+gr; )?vol \S+?, */io or
$pages =~ /^(&\w+gr; )?vol\d+, */io or
$pages =~ /^&\w+gr;,? */o ) {
$pages = $';
}
if ( $pages =~ / *[.+?(].*)?.*\z/io or
$pages =~ / +\d+ p\.].*/io ) {
$pages = $`;
}
if ( $pages =~ / *\(.+?\).*\z/io ) {
$pages = $`;
}
if ( $pages =~ /^([0ivx][ivx]*(-[ivx]+)?,? *([ivx]+(-[ivx]+)?,? *)*) (.+)\z/io ) {
$pages = $5;
}
if ( $pages =~ /^ *(.+?)(, *[ivx]+(-[ivx]+)?((, *| +)[ivx]+(-[ivx]+)?)*,? *)\z/io ) {
my $tmp = $1;
if ( $pages !~ /^ *[ivx]+(-[ivx]+)?((, *| +)[ivx]+(-[ivx]+)?)*,? *\z/io ) {
$pages = $tmp;
}
}
if ( $pages =~ /^s\.p\.\s*\z/io ) {
$pagedebut = undef;
}
elsif ( $pages =~ /^(\d+)-(\d+) +p\./io ) {
$pagedebut = $1;
$pagefin = $2;
}
elsif ( $pages =~ /^\S+ +p\./io ) {
$pagedebut = undef;
}
elsif ( $pages =~ /^p\. *(\S+)\z/io ) {
$pagedebut = $1;
}
elsif ( $pages =~ /^(\S+-\S+)-(\S+-\S+)\z/o ) {
$pagedebut = $1;
$pagefin = $2;
}
elsif ( $pages =~ /^(\S+?)(-\S+)?,( *(\S+-)?(\S+),)* *(\S+-)?(\S+) *\z/o ) {
$pages[0] = { 'debut' => $1, 'fin' => $7 };
my @items = split(/, */, $pages);
while(my $item = shift @items) {
if ( $item =~ /^(\S+?)-(\S+) *\z/o ) {
push(@pages, {'debut' => $1, 'fin' => $2,});
}
else {
if ( @items ) {
push(@pages, {'debut' => $item, 'fin' => "",});
}
else {
push(@pages, {'debut' => "", 'fin' => $item});
}
}
}
}
elsif ( $pages =~ /^(\S+?)-(\S+) *\z/o ) {
$pagedebut = $1;
$pagefin = $2;
}
elsif ( $pages =~ /^(\S+) *\z/o ) {
$pagedebut = $1;
}
}
}
# Recherche du premier auteur
$nom1 = "";
$prenom1 = "";
my @auteurs = ();
if ( $nivbib eq "A" ) {
@auteurs = grep(!/<s9>/, grep(/^A11 /, @champs));
if ( $#auteurs < 0 ) {
@auteurs = grep(/^A11 /, @champs);
}
}
elsif ( $nivbib eq "M" ) {
@auteurs = grep(!/<s9>/, grep(/^A12 /, @champs));
if ( $#auteurs < 0 ) {
@auteurs = grep(/^A12 /, @champs);
}
}
elsif ( $nivbib eq "C" ) {
@auteurs = grep(!/<s9>/, grep(/^A13 /, @champs));
if ( $#auteurs < 0 ) {
@auteurs = grep(/^A13 /, @champs);
}
}
if ( $auteurs[0] ) {
my ($auteur) = $auteurs[0] =~ m|<s1>(.+?)</s1>|o;
if ( $auteur =~ /^(.+?) *\((.+?)\) *\z/ ) {
$nom1 = $1;
$prenom1 = $2;
}
else {
$nom1 = $auteur;
}
shift @auteurs;
}
# Autres auteurs
@autres = ();
foreach my $suivant (@auteurs) {
my ($auteur) = $suivant =~ m|<s1>(.+?)</s1>|o;
if ( $auteur =~ /^(.+?) *\((.+?)\) *\z/ ) {
push(@autres, "$1, $2");
}
else {
push(@autres, $auteur);
}
}
my $autres = join("|", @autres);
# Recherche ISTEX
# Cas n° 1 : article de revue
if ( $nivbib eq "A" and ( $journal or $issn ) ) {
if ( @pages ) {
my $max = 0.0;
my %alt = ();
$label = "\nURI";
foreach my $item (@pages) {
$pagedebut = $item->{'debut'};
$pagefin = $item->{'fin'};
my ($score, $id, $ark, $doi, $pii, $pmid, @pb) = traite();
$label = "ETC";
alerte();
if ( $id ) {
$score *= 5.0;
my $nb = int($score);
my $note = "*" x $nb;
if ( $score - $nb > 0.5 ) {
$note .= "+";
}
$note = "." if not $note;
$note .= "\x{A0}" x (5 - length($note));
$score = sprintf("%.3f", $score);
if ( $score < 3.490 and $rdvfp{$id} ) {
$score .= $rdvfp{$id};
}
if ( $score > $max or
( $score =~ /^\d\.\d+\!\z/o and not
$max =~ /^\d\.\d+\!\z/o ) ) {
$max = $score;
}
if ( $alt{$id} ) {
if ( $alt{$id}{'score'} < $score or
( $score =~ /^\d\.\d+\!\z/o and not
$alt{$id}{'score'} =~ /^\d\.\d+\!\z/o ) ) {
$alt{$id}{'score'} = $score;
$alt{$id}{'note'} = $note;
$alt{$id}{'debut'} = $pagedebut;
$alt{$id}{'fin'} = $pagefin;
$alt{$id}{'ark'} = $ark;
$alt{$id}{'doi'} = $doi;
$alt{$id}{'pii'} = $pii;
$alt{$id}{'pmid'} = $pmid;
}
}
else {
$alt{$id}{'score'} = $score;
$alt{$id}{'note'} = $note;
$alt{$id}{'debut'} = $pagedebut;
$alt{$id}{'fin'} = $pagefin;
$alt{$id}{'ark'} = $ark;
$alt{$id}{'doi'} = $doi;
$alt{$id}{'pii'} = $pii;
$alt{$id}{'pmid'} = $pmid;
}
}
else {
$alt{-1}{'score'} = 0.000;
$alt{-1}{'note'} = 0.000;
$alt{-1}{'debut'} = $pages[0]->{'debut'};
$alt{-1}{'fin'} = $pages[0]->{'fin'};
if ( $pb[0] == -1 ) {
$alt{-1}{'blanc'} = "\n";
}
else {
$alt{-1}{'blanc'} = "";
}
$alt{-1}{'ark'} = undef;
$alt{-1}{'doi'} = undef;
$alt{-1}{'pii'} = undef;
$alt{-1}{'pmid'} = undef;
}
}
foreach my $id (keys %alt) {
if ( $id eq "-1" and $max == 0.0 ) {
print "$alt{-1}{'blanc'}";
print "0\t0.000\t$nivbib\t$num\t$titre\t$journal\t$livre\t$issn\t$isbn\t$date";
print "\t$volume\t$fascicule\t$pagedebut\t$pagefin\t$nom1\t$prenom1\t$autres\t\t\t\t\t\n";
delete $alt{-1};
last;
}
elsif ( $alt{$id}{'score'} eq $max ) {
print "$alt{$id}{'note'}\t$alt{$id}{'score'}\t$nivbib\t$num\t$titre\t$journal";
print "\t$livre\t$issn\t$isbn\t$date\t$volume\t$fascicule\t$pages[0]->{'debut'}";
print "\t$pages[0]->{'fin'}\t$nom1\t$prenom1\t$autres\t$id\t$alt{$id}{'ark'}";
print "\t$alt{$id}{'doi'}\t$alt{$id}{'pii'}\t$alt{$id}{'pmid'}\n";
delete $alt{$id};
last;
}
}
foreach my $id (keys %alt) {
if ( $alt{$id}{'score'} > 3.490 or $alt{$id}{'score'} =~ /^\d\.\d+\!\z/o ) {
print " --> \t$alt{$id}{'debut'}\t$alt{$id}{'fin'}\t$alt{$id}{'score'}\t$id";
print "\t$alt{$id}{'ark'}\t$alt{$id}{'doi'}\t$alt{$id}{'pii'}\t$alt{$id}{'pmid'}\n";
}
}
}
else {
$label = "\nURI";
my ($score, $id, $ark, $doi, $pii, $pmid, @pb) = traite();
alerte();
if ( $id ) {
$score *= 5.0;
my $nb = int($score);
my $note = "*" x $nb;
if ( $score - $nb > 0.5 ) {
$note .= "+";
}
$note = "." if not $note;
$note .= "\x{A0}" x (5 - length($note));
$score = sprintf("%.3f", $score);
if ( $score < 3.490 and $rdvfp{$id} ) {
$score .= $rdvfp{$id};
}
print "$note\t$score\t$nivbib\t$num\t$titre\t$journal\t$livre\t$issn\t$isbn\t$date";
print "\t$volume\t$fascicule\t$pagedebut\t$pagefin\t$nom1\t$prenom1\t$autres\t$id";
print "\t$ark\t$doi\t$pii\t$pmid\n";
if ( @pb ) {
foreach my $pb (sort {$a <=> $b} @pb) {
print " ~~> \t$pb\n";
}
}
}
else {
print "\n" if $pb[0] == -1;
print "0\t0.000\t$nivbib\t$num\t$titre\t$journal\t$livre\t$issn\t$isbn\t$date";
print "\t$volume\t$fascicule\t$pagedebut\t$pagefin\t$nom1\t$prenom1\t$autres\t\t\t\t\t\n";
}
}
}
else {
print "\n_\t-\t$nivbib\t$num\t$titre\t$journal\t$livre\t$issn\t$isbn\t$date";
print "\t$volume\t$fascicule\t$pagedebut\t$pagefin\t$nom1\t$prenom1\t$autres\t\t\t\t\t\n";
}
}
if ( $verbeux ) {
my $moment = date();
print STDERR " -> $moment : fin = $notice notices\n";
}
exit 0;
sub usage
{
print STDERR "\n$usage\n";
exit shift;
}
sub date
{
my @time = localtime();
my $jour = (qw(Dimanche Lundi Mardi Mercredi Jeudi Vendredi Samedi))[$time[6]];
my $mois = (qw(Janvier Février Mars Avril Mai juin Juillet Août Septembre Octobre Novembre Décembre))[$time[4]];
my $annee = $time[5] + 1900;
my $moment = "$jour $time[3] $mois $annee ";
$moment .= sprintf("%02d:%02d:%02D", $time[2], $time[1], $time[0]);
return $moment;
}
sub alerte
{
my $message = shift;
if ( not $message ) {
print @alertes;
@alertes = ();
%alerte = ();
}
elsif ( not $alerte{$message} ) {
push(@alertes, "\t => $message !\n");
$alerte{$message} ++;
}
}
sub max
{
my ($num1, $num2) = @_;
if ( $num1 > $num2 ) {
return $num1;
}
return $num2;
}
sub get_hits
{
my $query = shift;
$query =~ s/ /+/go;
my $uri = "$url$query&$out&size=$size";
$uri .= "&sid=tdm-alignement-pf";
my ($code, $json) = mon_get("$uri");
my $perl = undef;
my $suivant = undef;
my $total = undef;
my @hits = ();
if ( defined $json ) {
eval {
$perl = decode_json $json;
};
if ( $@ ) {
alerte("ERREUR CONVERSION JSON -> PERL");
return(undef);
}
my %top = %{$perl};
if ( $top{'_error'} ) {
alerte("ERREUR \"$top{'_error'}\"");
return(undef);
}
$total = $top{'total'};
if ( $total > 0 ) {
if ( $total > 10000 ) {
alerte("RÉPONSE SUPÉRIEURE À 10 000");
}
if ( defined $top{'nextPageURI'} ) {
$suivant = $top{'nextPageURI'};
}
push(@hits, @{$top{'hits'}});
while($suivant) {
($code, $json) = mon_get("$suivant");
eval {
$perl = decode_json $json;
};
if ( $@ ) {
alerte("ERREUR CONVERSION JSON -> PERL");
return(undef);
}
%top = %{$perl};
if ( defined $top{'nextPageURI'} ) {
$suivant = $top{'nextPageURI'};
}
else {
$suivant = undef;
}
push(@hits, @{$top{'hits'}});
}
}
%top = ("total" => $total, "hits" => \@hits);
return \%top;
}
else {
return undef;
}
}
sub get_total
{
my $query = shift;
$query =~ s/ /+/go;
my $uri = "$url$query&$out&size=0";
$uri .= "&sid=tdm-alignement-pf";
my ($code, $json) = mon_get("$uri");
my $perl = undef;
my $total = undef;
if ( defined $json ) {
eval {
$perl = decode_json $json;
};
if ( $@ ) {
alerte("ERREUR CONVERSION JSON -> PERL");
if ( $json =~ /\"total\": (d+),/o ) {
return($1);
}
else {
return(undef);
}
}
my %top = %{$perl};
if ( $top{'_error'} ) {
alerte("ERREUR \"$top{'_error'}\"");
return(undef);
}
return($top{'total'});
}
}
sub mon_get
{
my $cible = shift;
my $requete = HTTP::Request->new(GET => "$cible");
my $reponse = $agent->request($requete);
my $code = $reponse->code;
# Vérification de la réponse
if ($reponse->is_success) {
return ($code, $reponse->decoded_content);
}
else {
my $echec = 0;
my $message = $reponse->status_line;
if ( $message =~ /\b(read timeout|Proxy Error)\b/o and $echec < 10 ) {
$echec ++;
alerte("INTERRUPTION #$echec");
sleep 2;
return mon_get($cible);
}
else {
alerte("ERREUR $code \"$message\"");
return($code, undef);
}
}
}
sub traite
{
my $id = undef;
my $ark = undef;
my $doi = undef;
my $pii = undef;
my $pmid = undef;
my $score = 0;
my $query = "";
my $altq = 0;
my @attendus = ();
my @mac = ();
my @tmp = ();
my %ok = ();
%alerte = ();
%pages = ();
%pb = ();
%rdvfp = ();
foreach my $item (@journaux) {
push(@tmp, "host.title:\"".propre($item)."\"");
}
foreach my $item (@issns) {
push(@tmp, "host.issn:\"$item\"");
push(@tmp, "host.eissn:\"$item\"");
push(@tmp, "serie.issn:\"$item\"");
push(@tmp, "serie.eissn:\"$item\"");
}
if ( @tmp ) {
if ( $#tmp == 0 ) {
$query .= $tmp[0];
}
elsif ( $#tmp > 0 ) {
$query .= "(" . join(" OR ", @tmp) . ")";
}
if ( defined $source{$query} ) {
if ( $source{$query} == 0 ) {
return(0, undef, undef, undef, undef, undef, -1);
}
}
else {
my $tmp = undef;
while( not defined $tmp ) {
$tmp = get_total($query);
if ( defined $tmp ) {
$source{$query} = $tmp;
if ( $tmp == 0 ) {
return(0, undef, undef, undef, undef, undef, -1);
}
}
}
}
}
else {
alerte("PAS DE REVUE OU ISSN");
return(0, undef, undef, undef, undef, undef, -1);
}
my %query = ();
$query{'journal'} = $query;
if ( $date ) {
$query{'date'} = "publicationDate:$date OR copyrightDate:$date";
$query{'date'} .= " OR host.publicationDate:$date OR host.copyrightDate:$date";
$query{'date'} .= " OR serie.publicationDate:$date OR serie.copyrightDate:$date";
}
if ( $volume ) {
if ( $volume =~ /^\d+\z/o ) {
$query{'volume'} = "host.volume:$volume";
}
elsif ( $volume =~ m|^(\d+)[-/](\d+)\z|o ) {
my $inf = $1;
my $sup = $2;
$query{'volume'} = "host.volume.raw:\"$volume\"";
if ( $inf < $sup ) {
$query{'volume'} .= " OR host.volume:(".join(" ", ($inf .. $sup)).")";
}
}
elsif ( $volume =~ /^(\d+)[A-Za-z]+\z/o ) {
$query{'volume'} = "host.volume.raw:\"$volume\" OR host.volume:$1";
}
elsif ( $volume =~ /^[A-Za-z]+(\d+)\z/o ) {
$query{'volume'} = "host.volume.raw:\"$volume\" OR host.volume:$1";
}
else {
my $tmp = propre($volume);
$query{'volume'} = "host.volume.raw:\"$tmp\"";
alerte("VOLUME \"$tmp\"");
}
}
if ( $fascicule ) {
if ( $fascicule =~ /^\d+\z/o ) {
$query{'fascicule'} = "host.issue:$fascicule";
}
elsif ( $fascicule =~ m|^(\d+)[-/,](\d+)\z|o ) {
my $inf = $1;
my $sup = $2;
$query{'fascicule'} = "host.issue.raw:\"$fascicule\"";
if ( $inf < $sup ) {
$query{'fascicule'} .= " OR host.issue:(".join(" ", ($inf .. $sup)).")";
}
}
elsif ( $fascicule =~ /^(\d+)([A-Za-z]+)\z/o ) {
$query{'fascicule'} = "host.issue.raw:(\"$1$2\" OR \"$2$1\") OR host.issue:$1";
}
elsif ( $fascicule =~ /^([A-Za-z]+)(\d+)\z/o ) {
$query{'fascicule'} = "host.issue.raw:(\"$1$2\" OR \"$2$1\") OR host.issue:$2";
}
elsif ( $fascicule =~ /^\d*[A-Za-z]+\d*[A-Za-z]*\z/o or
$fascicule =~ /^[A-Za-z]+-[A-Za-z]+\z/o ) {
$query{'fascicule'} = "host.issue.raw:\"$fascicule\"";
}
else {
my $tmp = propre($fascicule);
$query{'fascicule'} = "host.issue.raw:\"$tmp\"";
alerte("FASCICULE \"$tmp\"");
}
}
if ( $nom1 ) {
my $auteurs = "";
my $tmp = decode_entities($nom1);
my %tmp = ($tmp => 1);
my @tmp1 = ();
my @mots = $tmp =~ /(\p{IsAlpha}+)/go;
foreach my $mot (@mots) {
next if $mot =~ /^\p{IsAlpha}\z/o;
if ( $mot =~ /[^\0-\177]/ ) {
my $umot = unidecode($mot);
push(@tmp1, "\"".propre($mot)."\"");
push(@tmp1, "\"".propre($umot)."\"");
}
else {
push(@tmp1, "\"".propre($mot)."\"");
}
}
$auteurs = join(" OR ", @tmp1);
@tmp1 = ();
foreach my $item (@autres) {
my $nom = $item;
if ( $item =~ /^(.+?), / ) {
$nom = $1;
}
push @tmp1, decode_entities($nom);
}
my @tmp2 = grep {not $tmp{$_} ++;} @tmp1;
@tmp1 = ($auteurs);
foreach my $nom (@tmp2[0 .. 4]) {
next if not $nom;
next if $nom =~ /^\p{IsAlpha}\z/o;
if ( $nom =~ /[^\0-\177]/ ) {
$tmp = unidecode($nom);
push(@tmp1, "\"".propre($nom)."\" OR \"".propre($tmp)."\"");
}
else {
push(@tmp1, "\"".propre($nom)."\"");
}
}
$auteurs = join(" OR ", grep(/\S+/, @tmp1));
if ( $auteurs ) {
$query{'auteur'} = "author.name:($auteurs)";
}
}
if ( $pagedebut ) {
@tmp = ();
my %tmp = ();
$pages{'first'}{$pagedebut} ++;
if ( $pagedebut =~ /^(.+)-\z/o ) {
$pagedebut = $1;
$pages{'first'}{$pagedebut} ++;
}
if ( $pagefin ) {
$pages{'last'}{$pagefin} ++;
if ( $pagedebut =~ /^\d+\z/o ) {
if ( $pagefin =~ /^\d+\z/o ) {
push(@{$tmp{'first'}}, "[$pagedebut TO $pagefin]");
push(@{$tmp{'last'}}, "[$pagedebut TO $pagefin]");
}
else {
push(@{$tmp{'first'}}, $pagedebut);
push(@{$tmp{'last.raw'}}, "\"$pagefin\"");
my $tmp = $pagefin;
if ( $fascicule and $pagefin =~ /^$fascicule[-.](.+)\z/ ) {
$tmp = $1;
$pages{'last'}{$tmp} ++;
}
elsif ( $pagefin =~ /^\w+[-.](.+)\z/ ) {
$tmp = $1;
$pages{'last'}{$tmp} ++;
}
if ( $tmp =~ /^\d+\z/o ) {
push(@{$tmp{'last'}}, $tmp);
$pages{'last'}{$pagefin} ++;
}
elsif ( $tmp =~ /^(\d+)([A-Za-z]+)\z/o ) {
push(@{$tmp{'last'}}, $1);
push(@{$tmp{'last.raw'}}, "\"$tmp\"");
push(@{$tmp{'last.raw'}}, "\"$2$1\"");
$pages{'last'}{"$1"} ++;
$pages{'last'}{"$2$1"} ++;
}
elsif ( $tmp =~ /^([A-Za-z]+)(\d+)\z/o ) {
push(@{$tmp{'last'}}, $2);
push(@{$tmp{'last.raw'}}, "\"$tmp\"");
push(@{$tmp{'last.raw'}}, "\"$2$1\"");
$pages{'last'}{"$2"} ++;
$pages{'last'}{"$2$1"} ++;
}
elsif ( $tmp ne $pagefin ) {
push(@{$tmp{'last.raw'}}, "\"$tmp\"");
}
}
}
else {
my $tmp = $pagedebut;
if ( $fascicule and $pagedebut =~ /^$fascicule[-.](.+)\z/ ) {
$tmp = $1;
# push(@{$tmp{'first.raw'}}, "\"$tmp\"");
}
elsif ( $pagedebut =~ /^\w+[-.](.+)\z/ ) {
$tmp = $1;
# push(@{$tmp{'first.raw'}}, "\"$tmp\"");
}
if ( $tmp =~ /^\d+\z/o ) {
push(@{$tmp{'first'}}, $tmp);
$pages{'first'}{$tmp} ++;
}
elsif ( $tmp =~ /^(\d+)([A-Za-z]+)\z/o ) {
push(@{$tmp{'first'}}, $1);
push(@{$tmp{'first.raw'}}, "\"$tmp\"");
push(@{$tmp{'first.raw'}}, "\"$2$1\"");
$pages{'first'}{"$1"} ++;
$pages{'first'}{"$2$1"} ++;
}
elsif ( $tmp =~ /^([A-Za-z]+)(\d+)\z/o ) {
push(@{$tmp{'first'}}, $2);
push(@{$tmp{'first.raw'}}, "\"$tmp\"");
push(@{$tmp{'first.raw'}}, "\"$2$1\"");
$pages{'first'}{"$2"} ++;
$pages{'first'}{"$2$1"} ++;
}
else {
push(@{$tmp{'first.raw'}}, "\"$tmp\"");
$pages{'first'}{$tmp} ++;
}
$tmp = $pagefin;
if ( $fascicule and $pagefin =~ /^$fascicule[-.](.+)\z/ ) {
$tmp = $1;
push(@{$tmp{'last.raw'}}, "\"$tmp\"");
}
elsif ( $pagefin =~ /^\w+[-.](.+)\z/ ) {
$tmp = $1;
push(@{$tmp{'last.raw'}}, "\"$tmp\"");
}
$pages{'last'}{$tmp} ++;
if ( $tmp =~ /^\d+\z/o ) {
push(@{$tmp{'last'}}, $tmp);
}
elsif ( $tmp =~ /^(\d+)([A-Za-z]+)\z/o ) {
push(@{$tmp{'last'}}, $1);
push(@{$tmp{'last.raw'}}, "\"$tmp\"");
push(@{$tmp{'last.raw'}}, "\"$2$1\"");
$pages{'last'}{"$1"} ++;
$pages{'last'}{"$2$1"} ++;
}
elsif ( $tmp =~ /^([A-Za-z]+)(\d+)\z/o ) {
push(@{$tmp{'last'}}, $2);
push(@{$tmp{'last.raw'}}, "\"$tmp\"");
push(@{$tmp{'last.raw'}}, "\"$2$1\"");
$pages{'last'}{"$2"} ++;
$pages{'last'}{"$2$1"} ++;
}
else {
push(@{$tmp{'last.raw'}}, "\"$tmp\"");
}
}
}
else {
if ( $pagedebut =~ /^\d+\z/o ) {
push(@{$tmp{'first'}}, "\"$pagedebut\"");
}
else {
my $tmp = $pagedebut;
if ( $fascicule and $pagedebut =~ /^$fascicule[-.](.+)\z/ ) {
$tmp = $1;
push(@{$tmp{'first.raw'}}, "\"$tmp\"");
}
elsif ( $pagedebut =~ /^\w+[-.](.+)\z/ ) {
$tmp = $1;
push(@{$tmp{'first.raw'}}, "\"$tmp\"");
}
$pages{'first'}{$tmp} ++;
if ( $tmp =~ /^\d+\z/o ) {
push(@{$tmp{'first'}}, $tmp);
}
elsif ( $tmp =~ /^(\d+)([A-Za-z]+)\z/o ) {
push(@{$tmp{'first'}}, $1);
push(@{$tmp{'first.raw'}}, "\"$tmp\"");
push(@{$tmp{'first.raw'}}, "\"$2$1\"");
$pages{'first'}{"$1"} ++;
$pages{'first'}{"$2$1"} ++;
}
elsif ( $tmp =~ /^([A-Za-z]+)(\d+)\z/o ) {
push(@{$tmp{'first'}}, $2);
push(@{$tmp{'first.raw'}}, "\"$tmp\"");
push(@{$tmp{'first.raw'}}, "\"$2$1\"");
$pages{'first'}{"$2"} ++;
$pages{'first'}{"$2$1"} ++;
}
else {
push(@{$tmp{'first.raw'}}, "\"$tmp\"");
}
}
}
foreach my $tmp (sort keys %tmp) {
my @items = @{$tmp{$tmp}};
next if $#items < 0;
if ( $#items == 0 ) {
push(@tmp, "host.pages.$tmp:$items[0]");
}
else {
push(@tmp, "host.pages.$tmp:(".join(" OR ", @items).")");
}
}
if ( @tmp ) {
$query{'page'} = join(" OR ", @tmp);
}
}
if ( $titre ) {
@tmp = grep(!/^(and|not|or)\z/io, ($titre =~ /(\w+)/go));
my @mots = sort {length($b) <=> length($a) or $a cmp $b} @tmp;
splice (@mots, 3);
$query{'titre'} = "title:(" . join(" ", @mots) . ")";
}
# Composition de la requête
if ( $query{'date'} or $query{'volume'} or $query{'fascicule'} ) {
$query .= " AND (".join(" OR ", grep {defined $_;} $query{'date'},
$query{'volume'}, $query{'fascicule'}).")";
}
$query{'base'} = $query;
if ( $query{'auteur'} or $query{'page'} ) {
$query .= " AND (".join(" OR ", grep {defined $_;} $query{'auteur'},
$query{'page'}).")";
}
else {
$query .= " AND " . $query{'titre'};
$query{'base'} = undef;
}
# Envoi de la requête
print "$label : \"$url$query&$out\"\n";
my $perl = get_hits($query);
if ( defined $perl ) {
%modifs = ();
my %top = %{$perl};
printf "\t => %d\n", $top{'total'};
if ( $top{'total'} >= 1 ) {
@attendus = ();
if ( $nom1 ) {
push @attendus, decode_entities($nom1) ;
}
foreach my $item (@autres) {
my $nom = $item;
if ( $item =~ /^(.+?), / ) {
$nom = $1;
}
push @attendus, decode_entities($nom);
}
@mac = ();
my $mac = 0;
foreach my $item (@attendus) {
if ( $item =~ /\bma?c ?(\w+)/oi ) {
my $canon = "$`Mc$1$'";
$mac ++ if $canon ne $item;
push(@mac, $canon);
}
else {
push(@mac, $item);
}
}
if ( $mac == 0 ) {
@mac = ();
}
my @hits = @{$top{'hits'}};
foreach my $hit (@hits) {
my ($valeur, $idCourant, $arkCourant, $refId, $ok) = evalue($hit, \@attendus, \@mac);
if ( $valeur > $score ) {
$score = $valeur;
$id = $idCourant;
$ark = $arkCourant;
$doi = defined $refId->{'doi'} ? $refId->{'doi'} : undef;
$pii = defined $refId->{'pii'} ? $refId->{'pii'} : undef;
$pmid = defined $refId->{'pmid'} ? $refId->{'pmid'} : undef;
print STDERR "\t=> score = $score\n\n" if $debug;
}
elsif ( $debug ) {
print STDERR "\n";
}
if ( $ok =~ /D[VIFvif]+/o ) {
$ok{$ok} ++;
}
last if $score == 1.000;
}
}
}
else {
alerte("ERREUR");
}
if ( $score == 0 and $query{'base'} and $query{'titre'} ) {
alerte();
$query = $query{'base'};
$query .= " AND " . $query{'titre'};
# Envoi de la requête
print "RAC : \"$url$query&$out\"\n";
my $perl = get_hits($query);
if ( defined $perl ) {
my %top = %{$perl};
printf "\t => %d\n", $top{'total'};
if ( $top{'total'} >= 1 ) {
my @hits = @{$top{'hits'}};
foreach my $hit (@hits) {
my ($valeur, $idCourant, $arkCourant, $refId, $ok) = evalue($hit, \@attendus, \@mac);
if ( $valeur > $score ) {
$score = $valeur;
$id = $idCourant;
$ark = $arkCourant;
$doi = defined $refId->{'doi'} ? $refId->{'doi'} : undef;
$pii = defined $refId->{'pii'} ? $refId->{'pii'} : undef;
$pmid = defined $refId->{'pmid'} ? $refId->{'pmid'} : undef;
print STDERR "\t=> score = $score\n\n" if $debug;
}
elsif ( $debug ) {
print STDERR "\n";
}
if ( $ok =~ /D[VIFvif]+/o ) {
$ok{$ok} ++;
}
last if $score == 1.000;
}
}
}
else {
alerte("ERREUR");
}
}
my @liste = groupe($pb{$id}) if not @pages;
return ($score, $id, $ark, $doi, $pii, $pmid, @liste);
}
sub evalue
{
my $hit = shift;
my $ref = shift;
my @attendus = @{$ref};
$ref = shift;
my @mac1 = @{$ref};
my $proche = "";
my $score = 0;
my $succes = 0;
my $total = 0;
my %hit = %{$hit};
my $id = $hit{'id'};
my $ark = $hit{'arkIstex'};
my %id = ();
$id{'doi'} = defined $hit{'doi'} ? $hit{'doi'}->[0] : 'N/A';
$id{'pii'} = defined $hit{'pii'} ? $hit{'pii'}->[0] : undef;
$id{'pmid'} = defined $hit{'pmid'} ? $hit{'pmid'}->[0] : undef;
my %host = ();
if ( $hit{'host'} ) {
%host = %{$hit{'host'}};
}
my %serie = ();
if ( $hit{'serie'} ) {
%serie = %{$hit{'serie'}};
}
# Test de l'ISSN
my %bib = ();
my @nums = ();
if ( $host{'issn'} ) {
@nums = @{$host{'issn'}};
}
if ( @nums and @issns ) {
$total ++;
my $trouve = 0;
foreach my $issn1 (@nums) {
foreach my $issn2 (@issns) {
if ( $issn1 eq $issn2 or
uc($issn1) eq uc($issn2) ) {
$trouve = $issn1;
}
}
}
if ( $trouve ) {
$succes ++ ;
$bib{'issn'} = $trouve;
print STDERR "ISSN : +1\n" if $debug;
}
elsif ( $debug ) {
print STDERR "ISSN : 0\n" if $debug;
}
}
elsif ( $debug ) {
print STDERR "ISSN : N.A.\n" if $debug;
}
my $revue = $host{'title'};
my $serie = defined $serie{'title'} ? $serie{'title'} : undef;
# Test du nom de la revue
if ( @journaux and ( $revue or $serie ) ) {
$total ++;
my $trouve = 0;
if ( $revue ) {
$trouve = revue($revue, @journaux);
if ( $trouve ) {
$succes ++ ;
$bib{'revue'} ++;
}
}
if ( $serie and not $trouve ) {
$trouve = revue($serie, @journaux);
if ( $trouve ) {
$succes ++ ;
$bib{'serie'} ++;
}
}
if ( $trouve ) {
print STDERR "Revue : +1\n" if $debug;
}
else {
print STDERR "Revue : 0\n" if $debug;
}
}
elsif ( $debug ) {
print STDERR "Revue : N.A.\n" if $debug;
}
# Récupération de la date
$bib{'publicationDate'} = $hit{'publicationDate'} ? $hit{'publicationDate'} : undef;
$bib{'copyrightDate'} = $hit{'copyrightDate'} ? $hit{'copyrightDate'} : undef;
$bib{'host.publicationDate'} = $host{'publicationDate'} ? $host{'publicationDate'} : undef;
$bib{'host.copyrightDate'} = $host{'copyrightDate'} ? $host{'copyrightDate'} : undef;
$bib{'serie.publicationDate'} = $serie{'publicationDate'} ? $serie{'publicationDate'} : undef;
$bib{'serie.copyrightDate'} = $serie{'copyrightDate'} ? $serie{'copyrightDate'} : undef;
if ( defined $bib{'publicationDate'} and
$bib{'publicationDate'} eq $date ) {
$bib{'date'} ++;
}
if ( defined $bib{'copyrightDate'} and
$bib{'copyrightDate'} eq $date ) {
$bib{'date'} ++;
}
if ( defined $bib{'host.publicationDate'} and
$bib{'host.publicationDate'} eq $date ) {
$bib{'date'} ++;
}
if ( defined $bib{'host.copyrightDate'} and
$bib{'host.copyrightDate'} eq $date ) {
$bib{'date'} ++;
}
if ( defined $bib{'serie.publicationDate'} and
$bib{'serie.publicationDate'} eq $date ) {
$bib{'date'} ++;
}
if ( defined $bib{'serie.copyrightDate'} and
$bib{'serie.copyrightDate'} eq $date ) {
$bib{'date'} ++;
}
if ( $bib{'date'} ) {
$proche .= "D";
}
# Test du volume
my $vol = $host{'volume'};
my $masque = "";
if ( $vol and $volume ) {
$total ++;
$proche .= "v";
if ( $vol eq $volume or biniou($vol, $volume, "VOLUME") ) {
$succes ++;
$bib{'volume'} ++;
$proche =~ s/v/V/o;
print "Volume : +1\n" if $debug;
}
elsif ( unidecode($vol) eq unidecode($volume) ) {
$succes ++;
$bib{'volume'} ++;
$proche =~ s/v/V/o;
print "Volume : +1\n" if $debug;
}
elsif ( $debug ) {
print "Volume : 0\n" if $debug;
}
}
elsif ( $vol and $fascicule ) {
$total ++;
$proche .= "i";
if ( $vol eq $fascicule or biniou($vol, $fascicule, "VOLUME/FASCICULE") ) {
$succes ++;
$bib{'volume'} ++;
$proche =~ s/i/I/o;
$masque = $fascicule;
$fascicule = "";
alerte("VOLUME = ISSUE");
print "Volume : +1\n" if $debug;
}
elsif ( unidecode($vol) eq unidecode($fascicule) ) {
$succes ++;
$bib{'volume'} ++;
$proche =~ s/i/I/o;
$masque = $fascicule;
$fascicule = "";
alerte("VOLUME = ISSUE");
print "Volume : +1\n" if $debug;
}
elsif ( $debug ) {
print "Volume : 0\n" if $debug;
}
}
elsif ( $debug ) {
print STDERR "Volume : N.A.\n";
}
# Test du numéro de fascicule
my $issue = $host{'issue'};
if ( $issue and $fascicule ) {
$total ++;
$proche .= "f";
my $trouve = 0;
if ( $issue eq $fascicule or biniou($issue, $fascicule, "FASCICULE") ) {
$trouve ++;
}
elsif ( unidecode($issue) eq unidecode($fascicule) ) {
$trouve ++;
}
if ( not $trouve ) {
if ( $fascicule =~ /^\p{IsAlpha}+(\d+)\z/ ) {
my $tmp = $1;
$trouve ++ if $issue eq $tmp;
}
elsif ( $fascicule =~ /^(\d+)\p{IsAlpha}+\z/ ) {
my $tmp = $1;
$trouve ++ if $issue eq $tmp;
}
}
if ( $trouve ) {
$succes ++;
$proche =~ s/f/F/o;
$bib{'fascicule'} ++;
print "Numéro : +1\n" if $debug;
}
elsif ( $debug ) {
print "Numéro : 0\n" if $debug;
}
}
elsif ( $debug ) {
print STDERR "Numéro : N.A.\n";
}
if ( $masque and not $fascicule ) {
$fascicule = $masque;
$masque = "";
}
my $okdebut = undef;
my $okfin = undef;
my $first = $host{'pages'}{'first'};
# Test du numéro de première page
if ( $first and $pagedebut ) {
$total ++;
my @tmp = ();
my $safe = $first;
$safe =~ s/(\W)/\\$1/go;
if ( defined $pages{'first'} ) {
@tmp = sort keys %{$pages{'first'}};
}
if ( $first eq $pagedebut ) {
$succes ++;
$bib{'pagedebut'} ++;
$okdebut = $pagedebut;
print STDERR "Page début : +1\n" if $debug;
}
elsif ( grep(/^$safe\z/, @tmp) > 0 ) {
$succes ++;
$bib{'pagedebut'} ++;
$okdebut = (grep(/^$first\z/, @tmp))[0];
print STDERR "Page début : +1\n" if $debug;
}
elsif ( $debug ) {
print STDERR "Page début : 0\n";
}
}
elsif ( $debug ) {
print STDERR "Page début : N.A.\n";
}
my $last = $host{'pages'}{'last'};
# Test du numéro de dernière page
if ( $last and $pagefin ) {
$total ++;
my @tmp = ();
my $safe = $last;
$safe =~ s/(\W)/\\$1/go;
if ( defined $pages{'last'} ) {
@tmp = sort keys %{$pages{'last'}};
}
if ( $last eq $pagefin ) {
$succes ++;
$bib{'pagefin'} ++;
$okfin = $pagefin;
print STDERR "Page fin : +1\n" if $debug;
}
elsif ( grep(/^$safe\z/, @tmp) > 0 ) {
$succes ++;
$bib{'pagefin'} ++;
$okfin = (grep(/^$last\z/, @tmp))[0];
print STDERR "Page fin : +1\n" if $debug;
}
elsif ( $debug ) {
print STDERR "Page fin : 0\n";
}
}
elsif ( $debug ) {
print STDERR "Page fin : N.A.\n";
}
# Indication que les données bibliographiques
# sont a priori correctes
if ( ( $bib{'revue'} or $bib{'issn'} ) and
( $bib{'volume'} or $bib{'fascicule'} ) and
$bib{'date'} and $okdebut and $okfin ) {
$rdvfp{$id} = "!";
}
# Test du pb des notices Pascal (et Francis ?)
# groupant plusieurs articles
$pb{$id} = undef;
if ( $first and $pagedebut and $last and $pagefin ) {
if ( ( $first > $pagedebut and $first <= $pagefin ) or
( $last < $pagefin and $last >= $pagedebut ) ) {
my %tmp = ();
if ( ( $bib{'revue'} or $bib{'issn'} ) and
( $bib{'volume'} or $bib{'fascicule'} or
$bib{'publicationDate'} or $bib{'copyrightDate'} or
$bib{'host.publicationDate'} or $bib{'host.copyrightDate'} or
$bib{'serie.publicationDate'} or $bib{'serie.copyrightDate'} ) ) {
$tmp{'revue'} = $revue if $revue;
$tmp{'serie'} = $serie if $serie;
$tmp{'issn'} = $bib{'issn'} if $bib{'issn'};
$tmp{'volume'} = $vol if $vol;
$tmp{'fascicule'} = $issue if $issue;
$tmp{'publicationDate'} = $bib{'publicationDate'};
$tmp{'copyrightDate'} = $bib{'copyrightDate'};
$tmp{'host.publicationDate'} = $bib{'host.publicationDate'};
$tmp{'host.copyrightDate'} = $bib{'host.copyrightDate'};
$tmp{'serie.publicationDate'} = $bib{'serie.publicationDate'};
$tmp{'serie.copyrightDate'} = $bib{'serie.copyrightDate'};
$tmp{'pagedebut'} = $pagedebut;
$tmp{'pagefin'} = $pagefin;
$pb{$id} = \%tmp;
}
}
}
# Le test devrait être plus circonstancié !!!
# Seulement si on a assez d'éléments pour être sûr !!!
my $bib = 0;
if ( $total >= 4 and $total == $succes ) {
if ( ($bib{'revue'} or $bib{'issn'}) and
($bib{'volume'} or $bib{'fascicule'}) and
($bib{'pagedebut'} or $bib{'pagefin'}) ) {
$bib = 1;
}
}
my $poids = 2;
# Commenté pour l'instant en attendant de voir
# si c'est aussi futé que je le croyais !
# if ( $total <= 4 ) {
# $poids = 1;
# }
# if ( not $bib ) {
# $total = max(4, $total + 1);
# }
if ( $debug ) {
print STDERR "\n\t=> total = $total\n";
print STDERR "\t=> bib = $bib\n";
print STDERR "\t=> poids = $poids\n\n";
}
# Test du titre
my $title = $hit{'title'};
if ( $title and $titre ) {
$total ++;
my $trouve = 0;
my $pot = 0;
if ( $titre =~ /^Pro and con\s*:\s*/o and
$title =~ /^(Pro|Con)\s*:/o ) {
my ($tmp1) = $titre =~ /^Pro and con\s*:\s*(\S.+)/o;
my ($tmp2) = $title =~ /^(?:Pro|Con)\s*:\s*(\S.+)/o;
if ( $pot = titre($tmp2, $tmp1, $bib) ) {
$trouve ++;
}
}
elsif ( $pot = titre($title, $titre, $bib) ) {
$trouve ++;
}
if ( not $trouve and $titre =~ / [IVXLCDM]+ *:/o ) {
if ( $pot = romain($title, $titre) ) {
$trouve ++;
}
}
if ( $trouve ) {
$succes += $pot;
print STDERR "Titre : +$pot\n" if $debug;
}
elsif ( $debug ) {
print STDERR "Titre : 0\n";
}
}
elsif ( $debug ) {
print STDERR "Titre : N.A.\n";
}
#
my @auteurs = ();
if ( $hit{'author'} ) {
@auteurs = @{$hit{'author'}};
}
# Test des noms d'auteur
my @mac2 = ();
my $mac2 = 0;
if ( ( @auteurs and not @attendus ) or
( @attendus and not @auteurs ) ) {
$total += 1;
}
elsif ( @auteurs and @attendus ) {
my $ld = 0;
my $trouve = 0;
my @names = ();
foreach my $auteur (@auteurs) {
my %auteur = %{$auteur};
next unless $auteur{'name'}; # Est-ce possible et si oui, que fait-on ?
my $name = $auteur{'name'};
if ( $name =~ /\bma?c ?(\w+)/oi ) {
my $canon = "$`Mc$1$'";
$mac2 ++ if $canon ne $name;
push(@mac2, $canon);
}
else {
push(@mac2, $name);
}
push(@names, $name);
}
if ( $mac2 == 0 ) {
@mac2 = ();
}
($trouve, $ld) = ld_au(\@attendus, \@names);
if ( $ld and (@mac1 or @mac2) ) {
my $trouve2 = undef;
my $ld2 = undef;
if ( @mac1 and @mac2 ) {
($trouve2, $ld2) = ld_au(\@mac1, \@mac2);
}
elsif ( @mac1 ) {
($trouve2, $ld2) = ld_au(\@mac1, \@names);
}
else {
($trouve2, $ld2) = ld_au(\@attendus, \@mac2);
}
if ( $trouve2 >= $trouve and $ld2 <= $ld ) {
$trouve = $trouve2;
$ld = $ld2;
alerte("CORRECTION MAC");
}
}
if ( $trouve == ($#attendus + 1) and $ld and $bib ) {
alerte("RATTRAPAGE");
$ld = 0;
}
my $ild = 1 - ($ld / ($#auteurs + $#attendus + 2));
$total += $poids;
if ( $#auteurs == $#attendus ) {
$poids = $poids * $trouve * $ild / ($#auteurs + 1);
# $succes += $poids * $trouve * $ild / ($#auteurs + 1);
$succes += $poids;
}
elsif ( $#auteurs > $#attendus ) {
if ( $#attendus > 34 and $#auteurs > 59 and $#auteurs - $#attendus > 2 ) {
$poids = $poids * $trouve * $ild / ($#attendus + 1);
}
else {
$poids = $poids * $trouve * $ild / ($#auteurs + 1);
}
$succes += $poids;
}
else {
$poids = $poids * $trouve * $ild / ($#attendus + 1);
$succes += $poids;
}
print STDERR "Auteurs : +$poids\n\n" if $debug;
}
elsif ( $debug ) {
print STDERR "Auteurs : N.A.\n\n";
}
print STDERR "\t=> total = $total\n" if $debug;
if ( $total ) {
$score = $succes/$total;
print STDERR "\t=> succes = $succes\n" if $debug;
print STDERR "\t=> score = $score\n\n" if $debug;
return($score, $id, $ark, \%id, $proche);
}
else {
alerte("TOTAL NUL");
return(0, $id, $ark, \%id, $proche);
}
}
sub propre
{
my $chaine = shift;
my $tmp1 = "";
my $tmp2 = "";
# Conversion vers UTF-8 et échappement des caractères réservés
$tmp1 = decode_entities($chaine);
$tmp1 =~ s#([+&|!(){}^"~*?:\/])#\\$1#go;
$tmp1 =~ s#([][])#\\$1#go;
# URLencodage
$tmp2 = uri_encode($tmp1);
$tmp2 =~ s/%20/ /go;
$tmp2 =~ s/&/%26/go;
return $tmp2;
}
sub titre
{
my ($t1, $t2, $sb) = @_;
if ( $t1 eq $t2 or
lc($t1) eq lc($t2) or
lc(unidecode($t1)) eq lc(unidecode($t2))) {
return 1;
}
$t2 =~ s|</?su[bp]>||go;
my $tmp1 = lc(join(" ", ($t1 =~ /(\w+)/go)));
my $tmp2 = lc(join(" ", ($t2 =~ /(\w+)/go)));
if ( $tmp1 eq $tmp2 ) {
return 1;
}
elsif ( unidecode($tmp1) eq unidecode($tmp2) ) {
return 1;
}
else {
$tmp1 = unidecode($tmp1);
$tmp2 = unidecode($tmp2);
my $ld = levenshtein_damereau($tmp1, $tmp2);
if ( $ld == 1 and length($tmp2) > 20 ) {
return 1;
}
elsif ( $ld == 2 and length($tmp2) > 40 ) {
return 1;
}
}
if ( $sb ) {
if ( $tmp1 =~ /\b$tmp2\b/ or $tmp2 =~ /\b$tmp1\b/ ) {
return 0.5;
}
}
if ( $t1 =~ /: /o or $t2 =~ /: /o ) {
my ($tmp1) = $t1 =~ /^(.+?)\s*: /o;
my ($tmp2) = $t2 =~ /^(.+?)\s*: /o;
if ( $tmp1 eq $tmp2 ) {
alerte("TITRE RÉDUIT 1");
return 1;
}
else {
$tmp1 = lc(join(" ", ($tmp1 =~ /(\w+)/go)));
$tmp2 = lc(join(" ", ($tmp2 =~ /(\w+)/go)));
if ( $tmp1 eq $tmp2 ) {
alerte("TITRE RÉDUIT 2");
return 1;
}
elsif ( unidecode($tmp1) eq unidecode($tmp2) ) {
alerte("TITRE RÉDUIT 3");
return 1;
}
}
}
## Test partie de titre
($tmp1 = $t1) =~ s/([^\w ])/\\$1/go;
($tmp2 = $t2) =~ s/([^\w ])/\\$1/go;
if ( length($t1) > length($t2) ) {
if ( $tmp1 =~ /^$tmp2\. .+/i ) {
alerte("TITRE RÉDUIT 4");
return 0.5;
}
}
elsif ( length($t1) < length($t2) ) {
if ( $tmp2 =~ /^$tmp1\. .+/i ) {
alerte("TITRE RÉDUIT 4");
return 0.5;
}
}
return 0;
}
sub ld_au # Levenshtein-Damereau adapté aux listes d'auteurs
{
my ($ref1, $ref2) = @_;
my @liste1 = @{$ref1};
my @liste2 = @{$ref2};
my $len1 = $#liste1 + 1;
my $len2 = $#liste2 + 1;
return $len2 if $len1 == 0;
return $len1 if $len2 == 0;
my %mat = ();
for ( my $i = 0 ; $i <= $len1 ; $i ++ ) {
for ( my $j = 0 ; $j <= $len2 ; $j ++ ) {
$mat{$i}{$j} = 0;
$mat{0}{$j} = $j;
}
$mat{$i}{0} = $i;
}
my $cost = 0;
my %trouve = ();
for ( my $i = 1 ; $i <= $len1 ; $i ++ ) {
my $trouve = 0;
for ( my $j = 1 ; $j <= $len2 ; $j ++ ) {
$cost = 1 - compare($liste1[$i-1], $liste2[$j-1]);
$trouve ++ if $cost == 0;
$mat{$i}{$j} = min ([$mat{$i-1}{$j} + 1,
$mat{$i}{$j-1} + 1,
$mat{$i-1}{$j-1} + $cost]);
if ( $i > 1 and $j > 1 and
compare($liste1[$i - 1], $liste2[$j - 2]) and
compare($liste1[$i - 2], $liste2[$j - 1]) ) {
$mat{$i}{$j} = min ([$mat{$i}{$j},
$mat{$i-2}{$j-2} + $cost]);
}
}
$trouve{$liste1[$i-1]} ++ if $trouve;
}
my $trouve = 0;
foreach my $item (keys %trouve) {
$trouve += $trouve{$item};
}
# print "\t -> $trouve ; $mat{$len1}{$len2}\n";
if ( $len1 > 35 and $len2 > 60 and $len2 - $len1 > 2 ) {
return ($trouve, $mat{$len1}{$len1});
}
else {
return ($trouve, $mat{$len1}{$len2});
}
}
sub levenshtein_damereau
{
my ($s1, $s2) = @_;
my $len1 = length $s1;
my $len2 = length $s2;
return $len2 if $len1 == 0;
return $len1 if $len2 == 0;
my %mat = ();
for ( my $i = 0 ; $i <= $len1 ; $i ++ ) {
for ( my $j = 0 ; $j <= $len2 ; $j ++ ) {
$mat{$i}{$j} = 0;
$mat{0}{$j} = $j;
}
$mat{$i}{0} = $i;
}
my @ar1 = split(//, $s1);
my @ar2 = split(//, $s2);
my $cost = 0;
for ( my $i = 1 ; $i <= $len1 ; $i ++ ) {
for ( my $j = 1 ; $j <= $len2 ; $j ++ ) {
$cost = $ar1[$i-1] eq $ar2[$j-1] ? 0 : 1;
$mat{$i}{$j} = min ([$mat{$i-1}{$j} + 1,
$mat{$i}{$j-1} + 1,
$mat{$i-1}{$j-1} + $cost]);
if ( $i > 1 and $j > 1 and
$ar1[$i - 1] eq $ar2[$j - 2] and
$ar1[$i - 2] eq $ar2[$j - 1] ) {
$mat{$i}{$j} = min ([$mat{$i}{$j}, $mat{$i-2}{$j-2} + $cost]);
}
}
}
return $mat{$len1}{$len2};
}
sub min
{
my @liste = @{$_[0]};
my $min = shift @liste;
foreach my $i (@liste) {
$min = $i if $i < $min;
}
return $min;
}
sub romain
{
my ($t2, $t1) = @_;
my $p2 = join(" ", ($t2 =~ /(\p{IsAlnum}+)/go));
while( $t1 =~ / ([IVXLCDM]+) *: */go ) {
my $avant = $`;
my $apres = $';
my $numR = $1;
(my $tav = $avant) =~ s/([^\p{IsAlnum} ])/\\$1/go;
(my $tap = $apres) =~ s/([^\p{IsAlnum} ])/\\$1/go;
my $val = $convert->arabic($numR);
if ( $t2 =~ /^$tav +(\p{IsAlnum}+ )?($val|$numR) *[:.] *$tap\z/i ) {
return 1;
}
else {
my $pav = join(" ", ($avant =~ /(\p{IsAlnum}+)/go));
my $pap = join(" ", ($apres =~ /(\p{IsAlnum}+)/go));
if ( $p2 =~ /^$pav +(\p{IsAlnum}+ )?($val|$numR) *$pap\z/i ) {
return 1;
}
elsif ( $p2 =~ /^$pav +(\p{IsAlnum}+ )?\p{IsAlnum}+ *$pap\z/i ) {
return 0.5;
}
}
}
return 0;
}
sub compare
{
my ($attendu, $name) = @_;
(my $safe = $attendu) =~ s/([^\p{IsAlnum} ])/\\$1/go;
if ( $name =~ /\b$safe\b/i ) {
# print "\t - $attendu <=> $name : 1\n";
return 1;
}
$name =~ s/[\x{60}\x{B4}\x{B8}]//go;
my $tmp1 = lc(unidecode($name));
my $tmp2 = lc(unidecode($attendu));
($safe = $tmp2) =~ s/([^\p{IsAlnum} ])/\\$1/go;
if ( $tmp1 =~ /\b$safe\b/ ) {
# print "\t - $attendu <=> $name : 1\n";
return 1;
}
my @tmp1 = split(/\s+/o, $tmp1);
my $premier = shift @tmp1;
$tmp1 = join(" ", @tmp1, $premier);
if ( $tmp1 =~ /\b$safe\b/ ) {
return 1;
}
unshift @tmp1, $premier;
my $dernier = pop @tmp1;
$tmp1 = join(" ", $dernier, @tmp1);
if ( $tmp1 =~ /\b$safe\b/ ) {
return 1;
}
# print "\t - $attendu <=> $name : 0\n";
return 0;
}
sub biniou
{
my ($val1, $val2, $champ) = @_;
if ( $val1 =~ /^\d+\z/o ) {
if ( $val2 =~ /^(\d+)[-\x{2010}-\x{2015}\x{2212}](\d+)\z/o ) {
my $inf = $1;
my $sup = $2;
if ( $inf < $sup and $val1 >= $inf and $val1 <= $sup ) {
return 1;
}
}
}
elsif ( $val1 =~ m|^(\d+)[-\x{2010}-\x{2015}\x{2212}/](\d+)\z|o ) {
my $inf = $1;
my $sup = $2;
if ( $val2 =~ /^\d+\z/o ) {
if ( $inf < $sup and $val2 >= $inf and $val2 <= $sup ) {
return 1;
}
}
elsif ( $val2 =~ m|^(\d+)[-\x{2010}-\x{2015}\x{2212}/](\d+)\z|o ) {
my $inf2 = $1;
my $sup2 = $2;
if ( $inf == $inf2 and $sup == $sup2 ) {
return 1;
}
}
}
elsif ( $val1 =~ /^([A-Za-z]+)(\d+)\z/o ) {
my $vl = $1;
my $vn = $2;
if ( $val2 =~ /^([A-Za-z])(\d+)[-\x{2010}-\x{2015}\x{2212}]([A-Za-z])(\d+)\z/o ) {
my $infl = $1;
my $infn = $2;
my $supl = $3;
my $supn = $4;
if ( $infl eq $supl ) {
if ( $vl eq $infl and $vn >= $infn and $vn <= $supl ) {
return 1;
}
}
elsif ( $infl lt $supl ) {
if ( $vl eq $infl and $vn >= $infn ) {
return 1;
}
if ( $vl eq $supl and $vn <= $supn ) {
return 1;
}
if ( $vl gt $infl and $vl lt $supl ) {
return 1
}
}
else {
alerte("FORMAT $champ ISTEX BIZARRE \"$val2\"")
}
}
}
elsif ( $val1 =~ /^([A-Za-z])(\d+)[-\x{2010}-\x{2015}\x{2212}]([A-Za-z])(\d+)\z/o ) {
my $infl = $1;
my $infn = $2;
my $supl = $3;
my $supn = $4;
if ( $val2 =~ /^([A-Za-z]+)(\d+)\z/o ) {
my $vl = $1;
my $vn = $2;
if ( $infl eq $supl ) {
if ( $vl eq $infl and $vn >= $infn and $vn <= $supl ) {
return 1;
}
}
elsif ( $infl lt $supl ) {
if ( $vl eq $infl and $vn >= $infn ) {
return 1;
}
if ( $vl eq $supl and $vn <= $supn ) {
return 1;
}
if ( $vl gt $infl and $vl lt $supl ) {
return 1
}
}
else {
alerte("FORMAT $champ INIST BIZARRE \"$val1\"")
}
}
}
else {
alerte("FORMAT $champ INIST INATTENDU \"$val1\"");
if ( $val2 !~ /^\d+\z/o and
$val2 !~ /^([A-Za-z]+)(\d+)\z/o and
$val2 !~ /^(\d+)[-\x{2010}-\x{2015}\x{2212}](\d+)\z/o and
$val2 !~ /^([A-Za-z])(\d+)[-\x{2010}-\x{2015}\x{2212}]([A-Za-z])(\d+)\z/o ) {
alerte("FORMAT $champ ISTEX INATTENDU \"$val2\"");
}
}
return 0;
}
sub groupe
{
my $ref = shift;
my $query = "";
my @liste = ();
return @liste if not defined $ref or ref($ref) ne 'HASH';
alerte();
my %hash = %{$ref};
if ( defined $hash{'revue'} ) {
my $revue = propre($hash{'revue'});
$query = "host.title:\"$revue\"";
}
if ( defined $hash{'serie'} ) {
my $revue = propre($hash{'serie'});
if ( $revue !~ /#/o ) {
$query .= " AND " if $query;
$query = "serie.title:\"$revue\"";
}
}
if ( defined $hash{'issn'} ) {
my $issn = propre($hash{'issn'});
$query .= " AND " if $query;
$query .= "host.issn:\"$issn\"";
}
if ( defined $hash{'publicationDate'} ) {
my $pdate = propre($hash{'publicationDate'});
$query .= " AND " if $query;
$query .= "host.publicationDate:\"$pdate\"";
}
elsif ( defined $hash{'copyrightDate'} ) {
my $cdate = propre($hash{'copyrightDate'});
$query .= " AND " if $query;
$query .= "host.copyrightDate:\"$cdate\"";
}
if ( defined $hash{'volume'} ) {
my $volume = propre($hash{'volume'});
$query .= " AND " if $query;
if ( $volume =~ /^\d+\z/o ) {
$query .= "host.volume:\"$volume\"";
}
else {
$query .= "host.volume.raw:\"$volume\"";
}
}
if ( defined $hash{'fascicule'} ) {
my $fascicule = propre($hash{'fascicule'});
$query .= " AND " if $query;
if ( $fascicule =~ /^\d+\z/o ) {
$query .= "host.issue:$fascicule";
}
else {
$query .= "host.issue.raw:\"$fascicule\"";
}
}
if ( defined $hash{'pagedebut'} and
defined $hash{'pagefin'} ) {
my $pagedebut = propre($hash{'pagedebut'});
my $pagefin = propre($hash{'pagefin'});
$query .= " AND " if $query;
if ( $pagedebut =~ /^\d+\z/o and $pagefin =~ /^\d+\z/o ) {
$query .= "host.pages.first:[$pagedebut TO $pagefin]";
$query .= " AND host.pages.last:[$pagedebut TO $pagefin]";
}
else {
$query .= "host.pages.first.raw:[$pagedebut TO $pagefin]";
$query .= " AND host.pages.last.raw:[$pagedebut TO $pagefin]";
}
}
## Envoi de la requête
print "ALT : \"$url$query&$out\"\n";
my $perl = get_hits($query);
if ( defined $perl ) {
my %top = %{$perl};
if ( $top{'total'} >= 1 ) {
foreach my $hit (@{$top{'hits'}}) {
my %hit = %{$hit};
my $id = $hit{'id'};
my $ark = $hit{'arkIstex'};
my $titre = $hit{'title'};
my $doi = 'N/A';
if ( defined $hit{'doi'} ) {
$doi = $hit{'doi'}->[0];
}
my @auteurs = ();
if ( $hit{'author'} ) {
foreach my $auteur (@{$hit{'author'}}) {
if ( defined $auteur->{'name'} ) {
push(@auteurs, $auteur->{'name'});
}
}
}
if ( $hit{'host'} ) {
my %host = %{$hit{'host'}};
if ( defined $host{'pages'} ) {
my ($debut, $fin) = (undef, undef);
if ( defined $host{'pages'}->{'first'} ) {
$debut = $host{'pages'}->{'first'};
}
if ( defined $host{'pages'}->{'last'} ) {
$fin = $host{'pages'}->{'last'};
}
if ( $debut and $fin ) {
my $tmp = "$debut\t$fin\t";
$tmp .= join("|", @auteurs);
$tmp .= "\t$titre\t$id\t$ark\t$doi";
push(@liste, $tmp);
}
}
}
}
}
}
return @liste;
}
sub revue
{
my ($title, @liste) = @_;
my $match = 0;
my $rv = lc(join(" ", ($title =~ /(\w+)/go)));
foreach my $item (@liste) {
my $jn = decode_entities($item);
if ( $jn eq $title or lc($jn) eq lc($title) ) {
$match ++;
}
else {
my $tmp = lc(join(" ", ($jn =~ /(\w+)/go)));
if ( $tmp eq $rv ) {
$match ++;
}
elsif ( $tmp =~ / [a-z]\z/o and $rv =~ /^$tmp / ) {
$match ++;
print "\t -> $num\t$tmp\t$rv\n" if not $modifs{"$tmp\t$rv"};
$modifs{"$tmp\t$rv"} ++;
}
elsif ( $rv =~ / [a-z]\z/o and $tmp =~ /^$rv / ) {
$match ++;
print "\t -> $num\t$tmp\t$rv\n" if not $modifs{"$tmp\t$rv"};
$modifs{"$tmp\t$rv"} ++;
}
else {
my $tmp1 = join(" ", grep(!/^(and|et|und|e|y)\z/, split(/ +/, $tmp)));
my $tmp2 = join(" ", grep(!/^(and|et|und|e|y)\z/, split(/ +/, $rv)));
if ( $tmp1 eq $tmp2 ) {
$match ++;
}
else {
$tmp1 =~ s/^(the|die|das|les?|la?|du|[ei]l) //o;
$tmp2 =~ s/^(the|die|das|les?|la?|du|[ei]l) //o;
$match ++ if $tmp1 eq $tmp2;
}
}
}
if ( not $match ) {
if ( $jn =~ /\s*: /o ) {
my ($tmp) = ($jn =~ /^(.+)\s*:/o);
if ( $tmp eq $title or lc($tmp) eq lc($title) ) {
$match ++;
}
}
if ( $title =~ /\s*: /o ) {
my ($tmp) = ($title =~ /^(.+)\s*:/o);
if ( $jn eq $tmp or lc($jn) eq lc($tmp) ) {
$match ++;
}
}
}
}
return $match;
}
__DATA__
##
## NE PAS MODIFIER !
##
## DO NOT EDIT!
##
33 excl
34 dquot
35 num
36 dollar
37 percnt
40 lpar
41 rpar
42 ast
43 plus
44 comma
45 hyphen
46 period
47 sol
58 colon
59 semi
61 equals
63 quest
64 commat
91 lsqb
92 bsol
93 rsqb
95 lowbar
96 grave
123 lcub
124 verbar
256 Amacr;
257 amacr;
258 Abreve;
259 abreve;
260 Aogon;
261 aogon;
262 Cacute;
263 cacute;
264 Ccirc;
265 ccirc;
266 Cdot;
267 cdot;
268 Ccaron;
269 ccaron;
270 Dcaron;
271 dcaron;
272 Dstrok;
273 dstrok;
274 Emacr;
275 emacr;
278 Edot;
279 edot;
280 Eogon;
281 eogon;
282 Ecaron;
283 ecaron;
284 Gcirc;
285 gcirc;
286 Gbreve;
287 gbreve;
288 Gdot;
289 gdot;
290 Gcedil;
291 gcedil;
292 Hcirc;
293 hcirc;
294 Hstrok;
295 hstrok;
296 Itilde;
297 itilde;
298 Imacr;
299 imacr;
302 Iogon;
303 iogon;
304 Idot;
305 inodot;
306 IJlig;
307 ijlig;
308 Jcirc;
309 jcirc;
310 Kcedil;
311 kcedil;
312 kgreen;
313 Lacute;
314 lacute;
315 Lcedil;
316 lcedil;
317 Lcaron;
318 lcaron;
319 Lmidot;
320 lmidot;
321 Lstrok;
322 lstrok;
323 Nacute;
324 nacute;
325 Ncedil;
326 ncedil;
327 Ncaron;
328 ncaron;
329 napos;
330 ENG;
331 eng;
332 Omacr;
333 omacr;
336 Odblac;
337 odblac;
340 Racute;
341 racute;
342 Rcedil;
343 rcedil;
344 Rcaron;
345 rcaron;
346 Sacute;
347 sacute;
348 Scirc;
349 scirc;
350 Scedil;
351 scedil;
354 Tcedil;
355 tcedil;
356 Tcaron;
357 tcaron;
358 Tstrok;
359 tstrok;
360 Utilde;
361 utilde;
362 Umacr;
363 umacr;
364 Ubreve;
365 ubreve;
366 Uring;
367 uring;
368 Udblac;
369 udblac;
370 Uogon;
371 uogon;
372 Wcirc;
373 wcirc;
374 Ycirc;
375 ycirc;
377 Zacute;
378 zacute;
379 Zdot;
380 zdot;
381 Zcaron;
382 zcaron;
501 gacute;
711 caron;
728 breve;
729 dot;
730 ring;
731 ogon;
733 dblac;
902 Aacgr;
904 Eacgr;
905 EEacgr;
906 Iacgr;
908 Oacgr;
910 Uacgr;
911 OHacgr;
912 idiagr;
913 Agr;
914 Bgr;
915 Ggr;
916 Dgr;
917 Egr;
918 Zgr;
919 EEgr;
920 THgr;
921 Igr;
922 Kgr;
923 Lgr;
924 Mgr;
925 Ngr;
926 Xgr;
927 Ogr;
928 Pgr;
929 Rgr;
931 Sgr;
932 Tgr;
933 Ugr;
934 PHgr;
935 KHgr;
936 PSgr;
937 OHgr;
938 Idigr;
939 Udigr;
940 aacgr;
941 eacgr;
942 eeacgr;
943 iacgr;
944 udiagr;
945 agr;
946 bgr;
947 ggr;
948 dgr;
949 egr;
950 zgr;
951 eegr;
952 thgr;
953 igr;
954 kgr;
955 lgr;
956 mgr;
957 ngr;
958 xgr;
959 ogr;
960 pgr;
961 rgr;
962 sfgr;
963 sgr;
964 tgr;
965 ugr;
966 phgr;
967 khgr;
968 psgr;
969 ohgr;
970 idigr;
971 udigr;
972 oacgr;
973 uacgr;
974 ohacgr;
977 thetav;
981 phiv;
988 gammad;
1008 kappav;
1009 rhov;
1025 IOcy;
1026 DJcy;
1027 GJcy;
1028 Jukcy;
1029 DScy;
1030 Iukcy;
1031 YIcy;
1032 Jsercy;
1033 LJcy;
1034 NJcy;
1035 TSHcy;
1036 KJcy;
1038 Ubrcy;
1039 DZcy;
1040 Acy;
1041 Bcy;
1042 Vcy;
1043 Gcy;
1044 Dcy;
1045 IEcy;
1046 ZHcy;
1047 Zcy;
1048 Icy;
1049 Jcy;
1050 Kcy;
1051 Lcy;
1052 Mcy;
1053 Ncy;
1054 Ocy;
1055 Pcy;
1056 Rcy;
1057 Scy;
1058 Tcy;
1059 Ucy;
1060 Fcy;
1061 KHcy;
1062 TScy;
1063 CHcy;
1064 SHcy;
1065 SHCHcy;
1066 HARDcy;
1067 Ycy;
1068 SOFTcy;
1069 Ecy;
1070 YUcy;
1071 YAcy;
1072 acy;
1073 bcy;
1074 vcy;
1075 gcy;
1076 dcy;
1077 iecy;
1078 zhcy;
1079 zcy;
1080 icy;
1081 jcy;
1082 kcy;
1083 lcy;
1084 mcy;
1085 ncy;
1086 ocy;
1087 pcy;
1088 rcy;
1089 scy;
1090 tcy;
1091 ucy;
1092 fcy;
1093 khcy;
1094 tscy;
1095 chcy;
1096 shcy;
1097 shchcy;
1098 hardcy;
1099 ycy;
1100 softcy;
1101 ecy;
1102 yucy;
1103 yacy;
1105 iocy;
1106 djcy;
1107 gjcy;
1108 jukcy;
1109 dscy;
1110 iukcy;
1111 yicy;
1112 jsercy;
1113 ljcy;
1114 njcy;
1115 tshcy;
1116 kjcy;
1118 ubrcy;
1119 dzcy;
8196 emsp13;
8197 emsp14;
8199 numsp;
8200 puncsp;
8202 hairsp;
8208 dash;
8211 ndash;
8212 mdash;
8213 horbar;
8214 Verbar;
8229 nldr;
8244 tprime;
8245 bprime;
8257 caret;
8259 hybull;
8411 tdot;
8412 DotDot;
8453 incare;
8459 hamilt;
8463 planck;
8466 lagran;
8467 ell;
8470 numero;
8471 copysr;
8478 rx;
8486 ohm;
8491 angst;
8492 bernou;
8499 phmmat;
8500 order;
8502 beth;
8503 gimel;
8504 daleth;
8531 frac13;
8532 frac23;
8533 frac15;
8534 frac25;
8535 frac35;
8536 frac45;
8537 frac16;
8538 frac56;
8539 frac18;
8540 frac38;
8541 frac58;
8542 frac78;
8597 varr;
8598 nwarr;
8599 nearr;
8600 drarr;
8601 dlarr;
8602 nlarr;
8603 nrarr;
8605 rarrw;
8606 Larr;
8608 Rarr;
8610 larrtl;
8611 rarrtl;
8614 map;
8617 larrhk;
8618 rarrhk;
8619 larrlp;
8620 rarrlp;
8621 harrw;
8622 nharr;
8624 lsh;
8625 rsh;
8630 cularr;
8631 curarr;
8634 olarr;
8635 orarr;
8636 lharu;
8637 lhard;
8638 uharr;
8639 uharl;
8640 rharu;
8641 rhard;
8642 dharr;
8643 dharl;
8644 rlarr2;
8646 lrarr2;
8647 larr2;
8648 uarr2;
8649 rarr2;
8650 darr2;
8651 lrhar2;
8652 rlhar2;
8653 nlArr;
8654 nhArr;
8655 nrArr;
8661 vArr;
8666 lAarr;
8667 rAarr;
8705 comp;
8708 nexist;
8714 epsis;
8717 bepsi;
8720 coprod;
8722 minus;
8723 mnplus;
8724 plusdo;
8726 setmn;
8728 compfn;
8735 ang90;
8737 angmsd;
8738 angsph;
8739 mid;
8740 nmid;
8741 par;
8742 npar;
8750 conint;
8757 becaus;
8765 bsim;
8768 wreath;
8769 nsim;
8771 sime;
8772 nsime;
8775 ncong;
8777 nap;
8778 ape;
8780 bcong;
8782 bump;
8783 bumpe;
8784 esdot;
8785 eDot;
8786 efDot;
8787 erDot;
8788 colone;
8789 ecolon;
8790 ecir;
8791 cire;
8793 wedgeq;
8796 trie;
8802 nequiv;
8806 lE;
8807 gE;
8808 lne;
8809 gne;
8810 Lt;
8811 Gt;
8812 twixt;
8814 nlt;
8815 ngt;
8816 nle;
8817 nge;
8818 lsim;
8819 gsim;
8822 lg;
8823 gl;
8826 pr;
8827 sc;
8828 pre;
8829 sce;
8830 prsim;
8831 scsim;
8832 npr;
8833 nsc;
8837 nsup;
8840 nsube;
8841 nsupe;
8842 subne;
8843 supne;
8846 uplus;
8847 sqsub;
8848 sqsup;
8849 sqsube;
8850 sqsupe;
8851 sqcap;
8852 sqcup;
8854 ominus;
8856 osol;
8857 odot;
8858 ocir;
8859 oast;
8861 odash;
8862 plusb;
8863 minusb;
8864 timesb;
8865 sdotb;
8866 vdash;
8867 dashv;
8868 top;
8871 models;
8872 vDash;
8873 Vdash;
8874 Vvdash;
8876 nvdash;
8877 nvDash;
8878 nVdash;
8879 nVDash;
8882 vltri;
8883 vrtri;
8884 ltrie;
8885 rtrie;
8888 mumap;
8890 intcal;
8891 veebar;
8892 barwed;
8900 diam;
8902 sstarf;
8903 divonx;
8904 bowtie;
8905 ltimes;
8906 rtimes;
8907 lthree;
8908 rthree;
8909 bsime;
8910 cuvee;
8911 cuwed;
8912 Sub;
8913 Sup;
8914 Cap;
8915 Cup;
8916 fork;
8918 ldot;
8919 gsdot;
8920 Ll;
8921 Gg;
8922 leg;
8923 gel;
8924 els;
8925 egs;
8926 cuepr;
8927 cuesc;
8928 npre;
8929 nsce;
8934 lnsim;
8935 gnsim;
8936 prnsim;
8937 scnsim;
8938 nltri;
8939 nrtri;
8940 nltrie;
8941 nrtrie;
8942 vellip;
8966 Barwed;
8972 drcrop;
8973 dlcrop;
8974 urcrop;
8975 ulcrop;
8981 telrec;
8982 target;
8988 ulcorn;
8989 urcorn;
8990 dlcorn;
8991 drcorn;
8994 frown;
8995 smile;
9251 blank;
9416 oS;
9472 boxh;
9474 boxv;
9484 boxdr;
9488 boxdl;
9492 boxur;
9496 boxul;
9500 boxvr;
9508 boxvl;
9516 boxhd;
9524 boxhu;
9532 boxvh;
9552 boxH;
9553 boxV;
9554 boxdR;
9555 boxDr;
9556 boxDR;
9557 boxdL;
9558 boxDl;
9559 boxDL;
9560 boxuR;
9561 boxUr;
9562 boxUR;
9563 boxuL;
9564 boxUl;
9565 boxUL;
9566 boxvR;
9567 boxVr;
9568 boxVR;
9569 boxvL;
9570 boxVl;
9571 boxVL;
9572 boxHd;
9573 boxhD;
9574 boxHD;
9575 boxHu;
9576 boxhU;
9577 boxHU;
9578 boxvH;
9579 boxVh;
9580 boxVH;
9600 uhblk;
9604 lhblk;
9608 block;
9617 blk14;
9618 blk12;
9619 blk34;
9633 squ;
9642 squf;
9645 rect;
9646 marker;
9651 xutri;
9652 utrif;
9653 utri;
9656 rtrif;
9657 rtri;
9661 xdtri;
9662 dtrif;
9663 dtri;
9666 ltrif;
9667 ltri;
9675 cir;
9733 starf;
9734 star;
9742 phone;
9792 female;
9794 male;
9834 sung;
9837 flat;
9838 natur;
9839 sharp;
10003 check;
10007 cross;
10016 malt;
10022 lozf;
10038 sext;
64256 fflig;
64257 filig;
64258 fllig;
64259 ffilig;
##
## The End!
##