#!/usr/bin/perl # Déclaration des pragmas use strict; use utf8; use open qw/:std :utf8/; # Appel des modules externes de base use Encode qw(decode_utf8 encode_utf8 is_utf8); use Getopt::Long; # Appel des modules spécifiques à l'application use URI::Encode qw(uri_encode uri_decode); use LWP::Simple; use JSON; use Text::Unidecode; ## use XML::Twig; my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; my $version = "2.3.0"; my $dateModif = "19 Janvier 2018"; # Variables my $destination = ""; my $help = 0; my $mail = ""; my $numero = 0; my $quiet = 0; my $requete = ""; eval { $SIG{__WARN__} = sub {usage(1);}; GetOptions( "destination=s" => \$destination, # "help" => \$help, "mail=s" => \$mail, # "numero=i" => \$numero, "quiet" => \$quiet, "requete=s" => \$requete, ); }; $SIG{__WARN__} = sub {warn $_[0];}; usage(0) if $help; usage(2) if not $destination; my $sender = ""; if ( $mail ) { if ( $mail !~ /^\w+(\.\w+)*\@\w+(\.\w+)+\z/ ) { print STDERR "Erreur : adresse e-mail \"$mail\" incorrect\n"; exit 3; } $mail =~ s/(\W)/\\$1/go; my $hostname = `hostname`; my $domain = `hostname -d`; $sender = "$ENV{'USER'}\@$hostname.$domain"; $sender =~ s/[\n\r]+//go; } if ( $destination eq "-" ) { open(OUT, ">&STDOUT") or die "$!,"; } elsif ( $destination =~ /\.gz\z/ ) { open(OUT, "| gzip -c -9 > $destination") or die "$!,"; } elsif ( $destination =~ /\.bz2\z/ ) { open(OUT, "| bzip2 -c -9 > $destination") or die "$!,"; } else { $destination =~ s/\W*\z/.gz/; open(OUT, "| gzip -c -9 > $destination") or die "$!,"; } # Paramètres de l'API ISTEX my $base = "https://api.istex.fr"; my $url = "$base/document/?q="; my $req = "*"; my $out = "output=*"; my $size = 300; if ( $requete ) { $req = decode_utf8($requete, Encode::FB_QUIET); if ( $requete ) { die "Erreur : La requête n'est pas en UTF-8\n"; } $req = propre($req); } #if ( $req =~ / /o ) { # $req =~ s/ /+/go; # } my $uri = "$url$req&$out&size=$size&scroll=167s"; # Variables concernant les documents my $corpus = ""; my $etape = 1000000; my $id = ""; my $num = $numero; my $suivant = ""; my $total = 0; # Première itération my $heure = date();; if ( not $quiet ) { print STDERR "Processus n° $$\n\n"; print STDERR "Démarrage le $heure\n"; } message("Processus n° $$\nDémarrage le $heure"); my $nb = 10; while( $nb ) { print STDERR "URI : \"$uri\"\n" if not $quiet; my $json = get("$uri"); my $perl = undef; if ( defined $json ) { eval { $perl = decode_json $json; }; if ( $@ ) { print STDERR "[" . pretty($num) . "] Erreur JSON => Perl : $@ \n"; exit 6; # print STDERR "[", pretty($num), "] Erreur JSON => Perl : $@ \n"; # sleep 30; # next; } my %top = %{$perl}; $total = $top{'total'}; if ( $total > 0 ) { print STDERR "Total : $total\n" if not $quiet; if ( $top{'nextScrollURI'} ) { $suivant = $top{'nextScrollURI'}; } else { $suivant = ""; } my @hits = @{$top{'hits'}}; foreach my $hit (@hits) { $num ++; traite($hit); } $nb = 0; } else { print STDERR "Aucun document pour la requête \'$req\'\n"; exit 4; } } else { $heure = date();; if ( $nb ) { $nb --; print STDERR "Pause (10 sec.) à la notice ", pretty($num), " le $heure\n"; sleep 10; } else { print STDERR "Aucune réponse du serveur \"$base\" le $heure\n"; exit 5; } } } # Itérations suivantes $nb = 10; while ( $suivant ) { my $json = get("$suivant"); my $perl = undef; if ( defined $json ) { $nb = 10; eval { $perl = decode_json $json; }; if ( $@ ) { print STDERR "[", pretty($num), "] Erreur JSON => Perl : $@ \n"; sleep 30; next; } $perl = decode_json $json; my %top = %{$perl}; if ( $top{'nextScrollURI'} ) { $suivant = $top{'nextScrollURI'}; } else { $suivant = ""; } my @hits = @{$top{'hits'}}; foreach my $hit (@hits) { $num ++; traite($hit); } } else { $heure = date();; if ( $nb ) { $nb --; print STDERR "Pause (10 sec.) à la notice ", pretty($num), " le $heure\n"; sleep 10; } else { print STDERR "Aucune réponse du serveur \"$base\" le $heure\n"; sleep 20; $heure = date();; print STDERR "Reprise le $heure\n"; $nb = 10; } } } close OUT; $heure = date();; if ( not $quiet ) { print STDERR "Arrêt le $heure\n"; } if ( $num > 1 ) { message("Arrêt le $heure\n$num/$total enregistrements traités"); } else { message("Arrêt le $heure\n$num/$total enregistrement traité"); } exit 0; sub usage { my $code = shift; print STDERR "Usage : $programme -d destination [ -r 'requête' ] [ -m adresse_mail ] [ -q ]\n"; exit $code; } sub traite { my $hit = shift; my %hit = %{$hit}; $id = $hit{'id'}; $corpus = defined $hit{'corpusName'} ? $hit{'corpusName'} : "Inconnu"; my $ark = '?'; my $lien = ""; my $type = ""; my $langue = ""; my $resume = 0; my $pdf = "?"; my $titre = ""; my $issn = ""; my $eissn = ""; my $isbn = ""; my $volume = ""; my $issue = ""; my $date = ""; my @catWoS = (); my @catSM = (); my @catPF = (); # Pascal & Francis my @sujets = (); my @types = (); my $genre = ""; if ( defined $hit{'arkIstex'} ) { $ark = substr($hit{'arkIstex'}, 4); } if ( defined $hit{'genre'} ) { $genre = join(", ", @{$hit{'genre'}}); } if ( defined $hit{'publicationDate'} ) { $date = $hit{'publicationDate'}; } if ( defined $hit{'copyrightDate'} ) { $date = $hit{'copyrightDate'}; } if ( defined $hit{'language'} ) { $langue = join(", ", @{$hit{'language'}}); } if ( defined $hit{'abstract'} ) { $resume = length($hit{'abstract'}); } if ( defined $hit{'qualityIndicators'} ) { my %indicateurs = %{$hit{'qualityIndicators'}}; if ( defined $indicateurs{'pdfVersion'} ) { $pdf = $indicateurs{'pdfVersion'}; } } if ( defined $hit{'categories'} ) { my %categories = %{$hit{'categories'}}; if ( defined $categories{'wos'} ) { @catWoS = @{$categories{'wos'}}; } if ( defined $categories{'scienceMetrix'} ) { @catSM = @{$categories{'scienceMetrix'}}; } if ( defined $categories{'inist'} ) { @catPF = @{$categories{'inist'}}; } } if ( defined $hit{'subject'} ) { foreach my $subject (@{$hit{'subject'}}) { my $valeur = $subject->{'value'}; if ( $valeur =~ /[\n\t]+/o ) { $valeur =~ s/[\n\t]+//go; print STDERR "Valeur de descripteur incorrect pour la notice n° $num [$id]\n"; } push(@sujets, $valeur); } } if ( defined $hit{'host'} ) { my %host = %{$hit{'host'}}; if ( defined $host{'issn'} ) { $issn = join(", ", @{$host{'issn'}}); } if ( defined $host{'eissn'} ) { $eissn = join(", ", @{$host{'eissn'}}); } if ( defined $host{'isbn'} ) { $isbn = join(", ", @{$host{'isbn'}}); } if ( defined $host{'title'} ) { $titre = $host{'title'}; } if ( defined $host{'volume'} ) { $volume = $host{'volume'}; } if ( defined $host{'issue'} ) { $issue = $host{'issue'}; } if ( defined $host{'genre'} ) { if ( $genre ) { $genre = " ; $genre"; } $genre = join(", ", @{$host{'genre'}}) . "$genre"; } } if ( defined $hit{'enrichments'} ) { my %enrichment = %{$hit{'enrichments'}}; push(@types, sort keys %enrichment); } print OUT pretty($num) , "\t$corpus\t$ark\t$id\t$genre\t$issn\t$eissn\t$isbn"; print OUT "\t$titre\t$date\t$volume\t$issue\t$langue\t$resume car.\t$pdf\t"; print OUT join(" ; ", @types), "\t", join(" ; ", @catWoS), "\t"; print OUT join(" ; ", @catSM), "\t", join(" ; ", @catPF), "\t"; print OUT join(" ; ", @sujets), "\n"; if ( $num % $etape == 0 ) { my $heure = date();; # my $tmp = $num; # 1 while $tmp =~ s/(\d+)(\d\d\d)\b/$1.$2/o; my $nombre = pretty($num); print STDERR "Barre des $nombre notices atteinte le $heure\n" if not $quiet; message("Barre des $nombre notices atteinte le $heure"); } } sub propre { my $chaine = shift; # Vérification de jeu de caractères (doit être UTF-8) if ( is_utf8($chaine, Encode::FB_QUIET) ) { # URLencodage $chaine = uri_encode($chaine); $chaine =~ s/&/%26/go; return $chaine; } else { die "la chaîne de caractères \"$chaine\" n'est pas en UTF-8"; } } sub pretty { my $valeur = shift; 1 while $valeur =~ s/(\d+)(\d\d\d)\b/$1.$2/o; return $valeur; } sub message { my $texte = shift; return if not $mail; open(MAIL, "| /usr/sbin/sendmail $mail") or die "$!,"; binmode(MAIL, ":utf8"); my $ancien = select MAIL; $| = 1; select $ancien; print MAIL "Mime-Version: 1.0\nSubject: Commentaires sur l'extraction de données ISTEX\n"; print MAIL "From: \"M2M\" <$sender>\n"; print MAIL "Content-Type: text/plain; charset=utf-8\n"; print MAIL "Content-Transfer-Encoding: 8bit\n\n"; print MAIL "$texte\n\n.\n"; close MAIL; } sub date { my @time = localtime(); my $jour = (qw(Dimanche Lundi Mardi Mercredi Jeudi Vendredi Samedi))[$time[6]]; my $mois = (qw(Janvier Février Mars Avril Mai juin Juillet Août Septembre Octobre Novembre Décembre))[$time[4]]; my $annee = $time[5] + 1900; my $date = "$jour $time[3] $mois $annee "; $date .= sprintf("%02d:%02d:%02d", $time[2], $time[1], $time[0]); return $date; }