Newer
Older
istex-api-harvester / nodejs / istex-api-harvester.njs
#!/usr/bin/env node

var program   = require('commander');
var request   = require('superagent');
var uuid      = require('uuid');
var fs        = require('fs');
var mkdirp    = require('mkdirp');
var async     = require('async');
var package   = require('./package.json');

program
  .version(package.version)
  .option('-q, --query [requete]', "La requete (?q=) ", '*')
  .option('-c, --corpus [corpus]', "Le corpus souhaité (ex: springer, ecco, ...)", 'istex')
  .option('-s, --size [size]',     "Quantité de documents à télécharger", 10)
  .option('-ft, --fulltext [0|1]', "Pour retourner ou pas le plein texte", 0)
  .option('-v, --verbose',         "Affiche plus d'informations", false)
  .parse(process.argv);

// Output locations, both under the directory the script is launched from.
// The per-corpus download directory is created right away if it is missing.
var dstPath = process.cwd().concat('/', program.corpus);
mkdirp.sync(dstPath);
// Uniquely named zip path (time-based UUID); nothing in the visible code reads it.
var zipName = process.cwd().concat('/', uuid.v1(), '.zip');

// Split the download into pages so that no single API request is huge.
// Each entry of `ranges` is a [from, size] pair: the offset of the first hit
// of the page and the number of hits to ask for.
var nbHitPerPage = 100;
// The size is a string when it comes from the command line (its default is the
// number 10), so turn it into an integer and reject anything unusable instead
// of sending `from=NaN&size=NaN` to the API.
var nbWanted     = parseInt(program.size, 10);
if (isNaN(nbWanted) || nbWanted < 0) {
  console.error("L'option --size doit être un entier positif ou nul : " + program.size);
  process.exit(1);
}
var nbPages      = Math.floor(nbWanted / nbHitPerPage);
var nbLastPage   = nbWanted - (nbPages * nbHitPerPage);
var ranges       = [];
for (var page = 0; page < nbPages; page++) {
  ranges.push([ page * nbHitPerPage, nbHitPerPage ]);
}
// Only add the trailing partial page when it is not empty: when the size is a
// multiple of nbHitPerPage there is nothing left to fetch after the full pages.
if (nbLastPage > 0) {
  ranges.push([ nbPages * nbHitPerPage, nbLastPage ]);
}

// Start the searches and the downloads.
console.log("Téléchargement des " + program.size +
            " premiers documents (metadata & fulltext) ici : " + dstPath);

// Download page by page, strictly one page at a time (concurrency limit of 1).
var firstPage = true;
async.mapLimit(ranges, 1, function (range, cb) {
  downloadPage(range, cb, function (body) {
    // The corpus total is the same on every page: print it only once.
    if (firstPage) {
      console.log("Nombre de documents dans le corpus " + program.corpus + " : " + body.total);
      firstPage = false;
    }
    // range[0] / nbHitPerPage is the zero-based page index; range[0] + range[1]
    // is the cumulated number of documents once this page is done.
    console.log('Téléchargement de la page ' +
                (range[0] / nbHitPerPage +1 ) + ' (' + (range[0] + range[1]) + ' documents)');
  });
}, function (err) {
  if (err) {
    // Report the failure on stderr and make the process exit with a non-zero
    // status so that calling scripts can detect it.
    console.error(err);
    process.exitCode = 1;
    return;
  }
  console.log('Téléchargements terminés');
});

// Fulltext type names accepted through the --fulltext option. downloadPage()
// falls back to 'pdf' for any other value and treats 'all' as "every type".
var supportedTypes = [
  'all',
  'txt',
  'raw',
  'pdf',
  'tei',
  'mods',
  'tif',
  'tiff',
  'png',
  'jpg',
  'jpeg',
  'zip',
  'xml',
  'html',
  'gif'
];


/**
 * Downloads one page of search results from the ISTEX API, then saves the
 * MODS record (and, when requested with --fulltext, the fulltext files) of
 * every hit into `dstPath`, one hit at a time.
 *
 * @param {Array}    range   [from, size] pair sent as the `from` and `size`
 *                           query parameters
 * @param {Function} cb      Node-style callback, called as cb(err) when the
 *                           search request fails and as cb(err, body) once
 *                           every hit of the page has been processed
 * @param {Function} cbBody  called with the parsed response body before the
 *                           downloads of the page start
 */
function downloadPage(range, cb, cbBody) {
  // `--fulltext 0` arrives as the string "0", which is truthy. Decide once
  // whether fulltext is wanted and use that single answer for both the URL
  // and the downloads, so the two can never disagree.
  var wantsFulltext = Boolean(program.fulltext) && program.fulltext != 0;

  // The query and the corpus are user input: encode them so that characters
  // such as '&' or '#' cannot break the query string.
  var url = 'https://api.istex.fr/document/?q=' + encodeURIComponent(program.query) + '&output=metadata'
            + (wantsFulltext ? ',fulltext' : '')
            + ((program.corpus == 'istex') ? '' : ('&corpus=' + encodeURIComponent(program.corpus)))
            + '&from=' + range[0] + '&size=' + range[1];
  console.log(url);

  // to ignore bad https certificate
  // NOTE(review): this turns off TLS certificate verification for the whole
  // process, not only for this request. Kept as-is; remove it once the API
  // certificate can be trusted.
  process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0";

  var agent = request.agent();
  agent
  .get(url)
  .end(function (err, res) {
    if (err) {
      // pass real Error objects through untouched to keep their stack
      return cb(err instanceof Error ? err : new Error(err));
    }
    if (!res || !res.body || !res.body.hits) {
      // `res` itself may be missing here, so do not dereference it blindly
      return cb(new Error('Response error: statusCode=' + (res ? res.statusCode : 'unknown')));
    }

    // hand the body over so the caller can print its progress messages
    cbBody(res.body);

    // Fulltext type to keep: the requested one when it is a supported type
    // name, 'pdf' otherwise (e.g. `--fulltext 1` or `--fulltext` alone).
    // Computed here rather than at the top of the function because
    // `supportedTypes` is assigned after the first synchronous call.
    var wantedType = 'pdf';
    if (typeof program.fulltext === 'string' &&
        supportedTypes.indexOf(program.fulltext.toLowerCase()) >= 0) {
      wantedType = program.fulltext.toLowerCase();
    }

    // process the hits sequentially
    async.mapLimit(res.body.hits, 1, function (item1, cb2) {
      // extract the MODS from the returned JSON
      var mods = { url: '', filename: item1.id + '.mods.xml' };
      (item1.metadata || []).forEach(function (item2) {
        if (item2.type && item2.type == 'mods') {
          mods.url = item2.uri;
        }
      });

      // extract the wanted fulltext files from the returned JSON
      var fulltext = [];
      if (wantsFulltext) {
        (item1.fulltext || []).forEach(function (item2) {
          if (wantedType == 'all' || item2.type == wantedType) {
            fulltext.push({
              url: '' + item2.uri,
              filename: '' + item1.id + '.' + item2.type
            });
          }
        });
      }

      // download the document (MODS first, then fulltext)
      async.series([
        // download the MODS
        function (callback) {
          if (!mods.url) {
            // nothing to fetch: skip rather than request an empty URL
            console.error('No MODS record listed for document ' + item1.id);
            return callback(null);
          }
          downloadToFile(mods.url, mods.filename, callback);
        },
        // download the fulltext files of this hit (all started together)
        function (callback) {
          if (!wantsFulltext) return callback(null);
          if (program.verbose) {
            console.log(fulltext);
          }
          async.map(fulltext, function (fulltextItem, cbMap) {
            downloadToFile(fulltextItem.url, fulltextItem.filename, cbMap);
          }, function (err) {
            // propagate a failed download instead of swallowing it
            callback(err);
          });
        }
      ], function (err) {
        // MODS and fulltext downloaded: one progress dot per hit
        process.stdout.write('.');
        cb2(err);
      });
    }, function (err) {
      console.log('');
      // page downloaded
      cb(err, res.body);
    });

  });
}

/**
 * Streams the content found at `url` into the file `filename` of `dstPath`.
 * The file name is printed once written when --verbose is set.
 *
 * @param {String}   url       address to download
 * @param {String}   filename  name of the file to create inside `dstPath`
 * @param {Function} done      called exactly once: done(null) when the file is
 *                             fully written, done(err) on a request or write
 *                             error
 */
function downloadToFile(url, filename, done) {
  // both the request and the file stream can fail: report only the first outcome
  var settled = false;
  function settle(err) {
    if (settled) return;
    settled = true;
    done(err || null);
  }

  var stream = fs.createWriteStream(dstPath + '/' + filename);
  stream.on('finish', function () {
    if (program.verbose) {
      console.log(filename);
    }
    settle(null);
  });
  stream.on('error', settle);

  var req = request.get(url);
  req.on('error', function (err) {
    // without this listener a failed request would never call back
    settle(err);
    stream.end();
  });
  req.pipe(stream);
}