#!/usr/bin/env node

var program = require('commander');
var request = require('superagent');
var uuid = require('uuid');
var fs = require('fs');
var mkdirp = require('mkdirp');
var async = require('async');
// `package` is a reserved word in strict mode, hence the shorter local name.
var pkg = require('./package.json');

// NOTE(review): commander short flags are normally a single character, so
// '-ft' may not be matchable from the command line (only '--fulltext') —
// confirm against the commander version in use before changing it.
program
  .version(pkg.version)
  .option('-q, --query [requete]', "La requete (?q=) ", '*')
  .option('-c, --corpus [corpus]', "Le corpus souhaité (ex: springer, ecco, ...)", 'istex')
  .option('-s, --size [size]', "Quantité de documents à télécharger", 10)
  .option('-ft, --fulltext [0|1]', "Pour retourner ou pas le plein texte", 0)
  .option('-v, --verbose', "Affiche plus d'informations", false)
  .parse(process.argv);

// Downloaded files are written to <cwd>/<corpus>.
var dstPath = process.cwd() + '/' + program.corpus;
mkdirp.sync(dstPath);

// Not referenced anywhere else in the visible source; kept in case other
// code relies on it.
var zipName = process.cwd() + '/' + uuid.v1() + '.zip';

// Option values coming from the command line are strings (or `true` when the
// flag is given without a value), so convert once and validate before doing
// arithmetic with it.
var size = Number(program.size);
if (!isFinite(size) || size < 0) {
  console.error('Invalid --size value: ' + program.size);
  process.exit(1);
}
size = Math.floor(size);

// The download is split into pages to avoid issuing one huge request.
var nbHitPerPage = 100;

/**
 * Splits `total` documents into [from, count] pairs of at most `perPage`
 * documents each.
 *
 * A trailing partial page is only added when it is non-empty, so an exact
 * multiple of `perPage` does not produce a useless request with a count of 0.
 * When `total` is smaller than one page (including 0) a single range is still
 * returned so that one request is always made.
 *
 * @param {number} total   Number of documents wanted (non-negative integer).
 * @param {number} perPage Maximum number of documents per request.
 * @returns {Array<Array<number>>} List of [from, count] pairs.
 */
function buildRanges(total, perPage) {
  var fullPages = Math.floor(total / perPage);
  var remainder = total - (fullPages * perPage);
  var result = [];
  for (var page = 0; page < fullPages; page++) {
    result.push([page * perPage, perPage]);
  }
  if (remainder > 0 || fullPages === 0) {
    result.push([fullPages * perPage, remainder]);
  }
  return result;
}

var ranges = buildRanges(size, nbHitPerPage);

// Start the searches and the downloads.
console.log("Téléchargement des " + size + " premiers documents (metadata & fulltext) ici : " + dstPath);

// Download page by page (one page at a time).
var firstPage = true;
async.mapLimit(ranges, 1, function (range, cb) {
  downloadPage(range, cb, function (body) {
    // The corpus total is only reported once, from the first response.
    if (firstPage) {
      console.log("Nombre de documents dans le corpus " + program.corpus + " : " + body.total);
      firstPage = false;
    }
    console.log('Téléchargement de la page ' + (range[0] / nbHitPerPage + 1) + ' (' + (range[0] + range[1]) + ' documents)');
  });
}, function (err) {
  if (err) {
    // Report failures on stderr and make the process exit with a non-zero
    // status so that calling scripts can detect them.
    console.error(err);
    process.exitCode = 1;
    return;
  }
  console.log('Téléchargements terminés');
});

// File types accepted as a value of --fulltext.
var supportedTypes = ['all','txt','raw','pdf','tei','mods','tif','tiff','png','jpg','jpeg','zip','xml','html','gif'];

//
// Page download function
//
/**
 * Downloads one page of search results from the ISTEX API, then downloads
 * the MODS record (and optionally the fulltext files) of every hit on that
 * page, one hit at a time.
 *
 * Reads the module-level `program` options (query, corpus, fulltext, verbose).
 *
 * @param {Array<number>} range  [from, size] pair sent as the `from` and
 *                               `size` query parameters.
 * @param {function(Error=, Object=)} cb  Called once the page is done, with
 *                               an error or with the response body.
 * @param {function(Object)} cbBody  Called with the response body as soon as
 *                               the search response arrives, before any
 *                               document is downloaded.
 */
function downloadPage(range, cb, cbBody) {
  // Loose comparison on purpose: the option is the number 0 by default but
  // the string "0" when passed on the command line; both mean "no fulltext".
  // The same flag drives the URL and the downloads so they cannot disagree.
  var wantsFulltext = program.fulltext != 0;

  // NOTE(review): the query and corpus are inserted without URL-encoding;
  // left as is because callers may already pass encoded values — confirm.
  var url = 'https://api.istex.fr/document/?q=' + program.query + '&output=metadata'
          + (wantsFulltext ? ',fulltext' : '')
          + ((program.corpus == 'istex') ? '' : ('&corpus=' + program.corpus))
          + '&from=' + range[0] + '&size=' + range[1];
  console.log(url);

  // SECURITY NOTE(review): this disables TLS certificate verification for
  // the whole process (it was added to ignore a bad https certificate).
  // Kept for compatibility; remove once the certificate issue is confirmed
  // fixed.
  process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0";

  request.agent().get(url).end(function (err, res) {
    if (err) {
      // Pass the original error through so its stack and fields are kept.
      return cb(err);
    }
    if (!res || !res.body || !res.body.hits) {
      // `res` itself may be missing here, so do not dereference it blindly.
      return cb(new Error('Response error: statusCode=' + (res ? res.statusCode : 'unknown')));
    }

    // Hand the body over for the progress messages.
    cbBody(res.body);

    // Download the hits sequentially.
    async.mapLimit(res.body.hits, 1, function (hit, hitDone) {
      downloadHit(hit, wantsFulltext, hitDone);
    }, function (err) {
      console.log('');
      // Page downloaded.
      cb(err, res.body);
    });
  });
}

/**
 * Downloads the MODS record of one search hit and, when requested, its
 * fulltext files of the wanted type.
 *
 * @param {Object} hit  One element of the `hits` array of the response.
 * @param {boolean} wantsFulltext  Whether fulltext files must be downloaded.
 * @param {function(Error=)} done  Called when the hit has been processed.
 */
function downloadHit(hit, wantsFulltext, done) {
  // Extract the MODS location from the returned JSON.
  var mods = { url: '', filename: hit.id + '.mods.xml' };
  (Array.isArray(hit.metadata) ? hit.metadata : []).forEach(function (meta) {
    if (meta.type && meta.type == 'mods') {
      mods.url = meta.uri;
    }
  });

  // Extract the fulltext locations from the returned JSON.
  var fulltext = [];
  if (wantsFulltext) {
    // String() because the option is `true` when the flag has no value.
    var requested = String(program.fulltext).toLowerCase();
    // Anything that is not a known type (e.g. "1") falls back to pdf.
    var wantedType = (supportedTypes.indexOf(requested) >= 0) ? requested : "pdf";
    (Array.isArray(hit.fulltext) ? hit.fulltext : []).forEach(function (file) {
      if (wantedType == 'all' || file.type == wantedType) {
        fulltext.push({
          url: '' + file.uri,
          filename: '' + hit.id + '.' + file.type
        });
      }
    });
  }

  async.series([
    // Download the MODS.
    function (callback) {
      if (!mods.url) {
        // Nothing to fetch: requesting an empty URL could only fail.
        console.error('No MODS record found for document ' + hit.id);
        return callback(null);
      }
      downloadToFile(mods.url, mods.filename, callback);
    },
    // Download the fulltext.
    function (callback) {
      if (!wantsFulltext) return callback(null);
      if (program.verbose) {
        console.log(fulltext);
      }
      async.map(fulltext, function (file, fileDone) {
        downloadToFile(file.url, file.filename, function (err) {
          // A failed fulltext file does not abort the whole run, but the
          // failure is reported instead of being silently dropped.
          if (err) {
            console.error('Failed to download ' + file.filename + ': ' + (err.message || err));
          }
          fileDone(null);
        });
      }, function () {
        callback(null);
      });
    }
  ], function (err) {
    // MODS and fulltext downloaded.
    process.stdout.write('.');
    done(err);
  });
}

/**
 * Streams the content found at `url` into `dstPath`/`filename`.
 *
 * @param {string} url  Location to download.
 * @param {string} filename  Name of the file to create inside `dstPath`.
 * @param {function(Error=)} callback  Called exactly once, with an error if
 *                               either the request or the file write failed.
 */
function downloadToFile(url, filename, callback) {
  var finished = false;
  // Both the request and the file stream can report an outcome; make sure
  // the callback only ever fires once.
  function finish(err) {
    if (finished) return;
    finished = true;
    if (!err && program.verbose) {
      console.log(filename);
    }
    callback(err || null);
  }

  var stream = fs.createWriteStream(dstPath + '/' + filename);
  stream.on('finish', function () { finish(null); });
  stream.on('error', finish);

  var req = request.get(url);
  // Without this listener a network failure would leave the callback
  // uncalled (the write stream never finishes on its own).
  req.on('error', function (err) {
    finish(err);
    stream.end();
  });
  req.pipe(stream);
}