#!/usr/bin/perl
# Déclaration des pragmas
use strict;
use utf8;
use open qw/:std :utf8/;
# Appel des modules externes de base
use Encode qw(decode_utf8 encode_utf8 is_utf8);
use Getopt::Long;
# Appel des modules spécifiques à l'application
use URI::Encode qw(uri_encode uri_decode);
use LWP::Simple;
use JSON;
use Text::Unidecode;
## use XML::Twig;
my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
my $version = "2.2.1";
my $dateModif = "05 Octobre 2017";
# Variables
my $destination = "";
my $help = 0;
my $mail = "";
my $numero = 0;
my $quiet = 0;
my $requete = "";
eval {
$SIG{__WARN__} = sub {usage(1);};
GetOptions(
"destination=s" => \$destination,
# "help" => \$help,
"mail=s" => \$mail,
# "numero=i" => \$numero,
"quiet" => \$quiet,
"requete=s" => \$requete,
);
};
$SIG{__WARN__} = sub {warn $_[0];};
usage(0) if $help;
usage(2) if not $destination;
my $sender = "";
if ( $mail ) {
if ( $mail !~ /^\w+(\.\w+)*\@\w+(\.\w+)+\z/ ) {
print STDERR "Erreur : adresse e-mail \"$mail\" incorrect\n";
exit 3;
}
$mail =~ s/(\W)/\\$1/go;
my $hostname = `hostname`;
my $domain = `hostname -d`;
$sender = "$ENV{'USER'}\@$hostname.$domain";
$sender =~ s/[\n\r]+//go;
}
if ( $destination eq "-" ) {
open(OUT, ">&STDOUT") or die "$!,";
}
elsif ( $destination =~ /\.gz\z/ ) {
open(OUT, "| gzip -c -9 > $destination") or die "$!,";
}
elsif ( $destination =~ /\.bz2\z/ ) {
open(OUT, "| bzip2 -c -9 > $destination") or die "$!,";
}
else {
$destination =~ s/\W*\z/.gz/;
open(OUT, "| gzip -c -9 > $destination") or die "$!,";
}
# Paramètres de l'API ISTEX
my $base = "https://api.istex.fr";
my $url = "$base/document/?q=";
my $req = "*";
my $out = "output=*";
my $size = 300;
if ( $requete ) {
$req = decode_utf8($requete, Encode::FB_QUIET);
if ( $requete ) {
die "Erreur : La requête n'est pas en UTF-8\n";
}
$req = propre($req);
}
#if ( $req =~ / /o ) {
# $req =~ s/ /+/go;
# }
my $uri = "$url$req&$out&size=$size&scroll=167s";
# Variables concernant les documents
my $corpus = "";
my $etape = 1000000;
my $id = "";
my $num = $numero;
my $suivant = "";
my $total = 0;
# Première itération
my $heure = date();;
if ( not $quiet ) {
print STDERR "Processus n° $$\n\n";
print STDERR "Démarrage le $heure\n";
}
message("Processus n° $$\nDémarrage le $heure");
my $nb = 10;
while( $nb ) {
print STDERR "URI : \"$uri\"\n" if not $quiet;
my $json = get("$uri");
my $perl = undef;
if ( defined $json ) {
eval {
$perl = decode_json $json;
};
if ( $@ ) {
print STDERR "[" . pretty($num) . "] Erreur JSON => Perl : $@ \n";
exit 6;
# print STDERR "[", pretty($num), "] Erreur JSON => Perl : $@ \n";
# sleep 30;
# next;
}
my %top = %{$perl};
$total = $top{'total'};
if ( $total > 0 ) {
print STDERR "Total : $total\n" if not $quiet;
if ( $top{'nextScrollURI'} ) {
$suivant = $top{'nextScrollURI'};
}
else {
$suivant = "";
}
my @hits = @{$top{'hits'}};
foreach my $hit (@hits) {
$num ++;
traite($hit);
}
$nb = 0;
}
else {
print STDERR "Aucun document pour la requête \'$req\'\n";
exit 4;
}
}
else {
$heure = date();;
if ( $nb ) {
$nb --;
print STDERR "Pause (10 sec.) à la notice ", pretty($num), " le $heure\n";
sleep 10;
}
else {
print STDERR "Aucune réponse du serveur \"$base\" le $heure\n";
exit 5;
}
}
}
# Itérations suivantes
$nb = 10;
while ( $suivant ) {
my $json = get("$suivant");
my $perl = undef;
if ( defined $json ) {
$nb = 10;
eval {
$perl = decode_json $json;
};
if ( $@ ) {
print STDERR "[", pretty($num), "] Erreur JSON => Perl : $@ \n";
sleep 30;
next;
}
$perl = decode_json $json;
my %top = %{$perl};
if ( $top{'nextScrollURI'} ) {
$suivant = $top{'nextScrollURI'};
}
else {
$suivant = "";
}
my @hits = @{$top{'hits'}};
foreach my $hit (@hits) {
$num ++;
traite($hit);
}
}
else {
$heure = date();;
if ( $nb ) {
$nb --;
print STDERR "Pause (10 sec.) à la notice ", pretty($num), " le $heure\n";
sleep 10;
}
else {
print STDERR "Aucune réponse du serveur \"$base\" le $heure\n";
sleep 20;
$heure = date();;
print STDERR "Reprise le $heure\n";
$nb = 10;
}
}
}
close OUT;
$heure = date();;
if ( not $quiet ) {
print STDERR "Arrêt le $heure\n";
}
if ( $num > 1 ) {
message("Arrêt le $heure\n$num/$total enregistrements traités");
}
else {
message("Arrêt le $heure\n$num/$total enregistrement traité");
}
exit 0;
sub usage
{
my $code = shift;
print STDERR "Usage : $programme -d destination [ -r 'requête' ] [ -m adresse_mail ] [ -q ]\n";
exit $code;
}
sub traite
{
my $hit = shift;
my %hit = %{$hit};
$id = $hit{'id'};
$corpus = defined $hit{'corpusName'} ? $hit{'corpusName'} : "Inconnu";
my $lien = "";
my $type = "";
my $langue = "";
my $resume = 0;
my $pdf = "?";
my $titre = "";
my $issn = "";
my $eissn = "";
my $isbn = "";
my $volume = "";
my $issue = "";
my $date = "";
my @catWoS = ();
my @catSM = ();
my @catPF = (); # Pascal & Francis
my @sujets = ();
my @types = ();
my $genre = "";
if ( defined $hit{'genre'} ) {
$genre = join(", ", @{$hit{'genre'}});
}
if ( defined $hit{'publicationDate'} ) {
$date = $hit{'publicationDate'};
}
if ( defined $hit{'copyrightDate'} ) {
$date = $hit{'copyrightDate'};
}
if ( defined $hit{'language'} ) {
$langue = join(", ", @{$hit{'language'}});
}
if ( defined $hit{'abstract'} ) {
$resume = length($hit{'abstract'});
}
if ( defined $hit{'qualityIndicators'} ) {
my %indicateurs = %{$hit{'qualityIndicators'}};
if ( defined $indicateurs{'pdfVersion'} ) {
$pdf = $indicateurs{'pdfVersion'};
}
}
if ( defined $hit{'categories'} ) {
my %categories = %{$hit{'categories'}};
if ( defined $categories{'wos'} ) {
@catWoS = @{$categories{'wos'}};
}
if ( defined $categories{'scienceMetrix'} ) {
@catSM = @{$categories{'scienceMetrix'}};
}
if ( defined $categories{'inist'} ) {
@catPF = @{$categories{'inist'}};
}
}
if ( defined $hit{'subject'} ) {
foreach my $subject (@{$hit{'subject'}}) {
my $valeur = $subject->{'value'};
if ( $valeur =~ /[\n\t]+/o ) {
$valeur =~ s/[\n\t]+//go;
print STDERR "Valeur de descripteur incorrect pour la notice n° $num [$id]\n";
}
push(@sujets, $valeur);
}
}
if ( defined $hit{'host'} ) {
my %host = %{$hit{'host'}};
if ( defined $host{'issn'} ) {
$issn = join(", ", @{$host{'issn'}});
}
if ( defined $host{'eissn'} ) {
$eissn = join(", ", @{$host{'eissn'}});
}
if ( defined $host{'isbn'} ) {
$isbn = join(", ", @{$host{'isbn'}});
}
if ( defined $host{'title'} ) {
$titre = $host{'title'};
}
if ( defined $host{'volume'} ) {
$volume = $host{'volume'};
}
if ( defined $host{'issue'} ) {
$issue = $host{'issue'};
}
if ( defined $host{'genre'} ) {
if ( $genre ) {
$genre = " ; $genre";
}
$genre = join(", ", @{$host{'genre'}}) . "$genre";
}
}
if ( defined $hit{'enrichments'} ) {
my %enrichment = %{$hit{'enrichments'}};
push(@types, sort keys %enrichment);
}
print OUT pretty($num) , "\t$corpus\t$id\t$genre\t$issn\t$eissn\t$isbn\t";
print OUT "$titre\t$date\t$volume\t$issue\t$langue\t$resume car.\t$pdf\t";
print OUT join(" ; ", @types), "\t", join(" ; ", @catWoS), "\t";
print OUT join(" ; ", @catSM), "\t", join(" ; ", @catPF), "\t";
print OUT join(" ; ", @sujets), "\n";
if ( $num % $etape == 0 ) {
my $heure = date();;
# my $tmp = $num;
# 1 while $tmp =~ s/(\d+)(\d\d\d)\b/$1.$2/o;
my $nombre = pretty($num);
print STDERR "Barre des $nombre notices atteinte le $heure\n" if not $quiet;
message("Barre des $nombre notices atteinte le $heure");
}
}
sub propre
{
my $chaine = shift;
# Vérification de jeu de caractères (doit être UTF-8)
if ( is_utf8($chaine, Encode::FB_QUIET) ) {
# URLencodage
$chaine = uri_encode($chaine);
$chaine =~ s/&/%26/go;
return $chaine;
}
else {
die "la chaîne de caractères \"$chaine\" n'est pas en UTF-8";
}
}
sub pretty
{
my $valeur = shift;
1 while $valeur =~ s/(\d+)(\d\d\d)\b/$1.$2/o;
return $valeur;
}
sub message
{
my $texte = shift;
return if not $mail;
open(MAIL, "| /usr/sbin/sendmail $mail") or die "$!,";
binmode(MAIL, ":utf8");
my $ancien = select MAIL;
$| = 1;
select $ancien;
print MAIL "Mime-Version: 1.0\nSubject: Commentaires sur l'extraction de données ISTEX\n";
print MAIL "From: \"M2M\" <$sender>\n";
print MAIL "Content-Type: text/plain; charset=utf-8\n";
print MAIL "Content-Transfer-Encoding: 8bit\n\n";
print MAIL "$texte\n\n.\n";
close MAIL;
}
sub date
{
my @time = localtime();
my $jour = (qw(Dimanche Lundi Mardi Mercredi Jeudi Vendredi Samedi))[$time[6]];
my $mois = (qw(Janvier Février Mars Avril Mai juin Juillet Août Septembre Octobre Novembre Décembre))[$time[4]];
my $annee = $time[5] + 1900;
my $date = "$jour $time[3] $mois $annee ";
$date .= sprintf("%02d:%02d:%02d", $time[2], $time[1], $time[0]);
return $date;
}