// li-harvest-hal / index.js
'use strict';

const fetch = require('node-fetch');
const fs = require('graceful-fs');
const xmlFormatter = require('xml-formatter');

const utils = require('./lib/utils.js');
const config = require('./conf/conf.json');

// Object exported by this module; the job handlers are attached to it.
const business = {};

// Folder that currently receives the harvested files.
let outputPath;
// Root folder of the harvest.
let harvestPath;

// Documents handed to a write stream so far.
let processedFileCount = 0;
// Documents whose write stream has emitted 'finish'.
let writtenFilesCount = 0;

// Two-level counter identifying the current output sub-folder.
const output = { level1: 1, level2: 1 };

/**
 * finalJob : report the outcome of the harvest.
 * @param  {*} docObjects : not used by this implementation.
 * @callback  cbFinalJob : called with (null, logs); `logs.processLogs` holds the
 *                         written-file count and the harvest path, `logs.errLogs` is empty.
 */
business.finalJob = function (docObjects, cbFinalJob) {
  const processLogs = [
    `Harvested TEI-XML file count : ${writtenFilesCount}`,
    `Harvest path : ${harvestPath}`
  ];
  return cbFinalJob(null, { processLogs, errLogs: [] });
};

/**
 * doTheJob : harvest the documents modified between the two dates carried by
 * the docObject into a new harvest folder.
 * @param  {Object} docObject : reads `corpusName`, `halHarvestModifiedDateFrom` and
 *                              `halHarvestModifiedDateTo` (both date fields are deleted
 *                              before the harvest starts); receives `error` on failure.
 * @callback  cb : called with an error object ({ code: 1, message }) when the harvest
 *                 folder already exists; otherwise handed over to requestByQuery.
 */
business.doTheJob = function (docObject, cb) {
  const dateBegin = docObject.halHarvestModifiedDateFrom;
  const dateEnd = docObject.halHarvestModifiedDateTo;

  // Folder name: <corpusName>-<from>-to-<until>, empty parts being skipped.
  const folderName = [
    docObject.corpusName,
    utils.getDateYYYYMMDD(new Date(dateBegin)),
    'to',
    utils.getDateYYYYMMDD(new Date(dateEnd))
  ].filter(Boolean).join('-');
  harvestPath = utils.getHarvestPath(folderName);

  // Never harvest twice into the same folder.
  if (fs.existsSync(harvestPath)) {
    const err = {
      code: 1,
      message: `${harvestPath} harvest folder already exist`
    };
    docObject.error = err;
    return cb(err);
  }

  // Start every harvest from a clean state: the counters live at module level
  // and would otherwise carry the values of a previous call, which prevents
  // the first output folder from being created and breaks end detection.
  processedFileCount = 0;
  writtenFilesCount = 0;
  output.level1 = 1;
  output.level2 = 1;

  const query = utils.getQueryUrl();
  // Restrict the search to [dateBegin TO dateEnd].
  query.searchParams.append('fq', `modifiedDate_tdate:[${dateBegin} TO ${dateEnd}]`);
  // The date bounds are consumed here: remove them from the docObject.
  delete docObject.halHarvestModifiedDateFrom;
  delete docObject.halHarvestModifiedDateTo;
  requestByQuery(docObject, query, null, 0, cb);
};

/**
 * requestByQuery : fetch one page of search results, write every returned
 * TEI-XML document to disk, then request the next page until all documents
 * have been processed.
 * @param  {Object} docObject : the docObject; its `error` field is set when the harvest fails.
 * @param  {URL} query : query generated from conf/query.json.
 * @param  {String} cursorMark : used for scrolling; a falsy value requests the first page.
 * @param  {Number} retryCount : retries already spent on this page, bounded by `config.max_retry`.
 * @callback  cb : called with an error object ({ code, message }) on failure
 *                 (code 3: no result, code 2: request/processing failure); on success it is
 *                 invoked by checkHarvestEnd once every file has been written.
 */
function requestByQuery (docObject, query, cursorMark, retryCount, cb) {
  // A new output folder is started every `filesPerFolder` files; the
  // second-level counter rolls over into the first-level one at `level2Limit`.
  const filesPerFolder = 100;
  const level2Limit = 1000;

  const target = cursorMark ? utils.getScrollUrl(query, cursorMark) : query.href;
  const fetchOpts = {
    headers: {
      'Content-Type': 'application/json'
    }
  };

  const agent = utils.getProxyAgent();
  if (agent) {
    fetchOpts.agent = agent;
  }

  return fetch(target, fetchOpts).then(function (res) {
    // Reject HTTP errors explicitly so the retry logic reports the status
    // instead of an obscure parsing failure.
    if (!res.ok) {
      throw new Error(`HTTP ${res.status} ${res.statusText}`);
    }
    return res.json();
  }).then(async function (res) {
    const nextCursorMark = res.nextCursorMark;
    const docs = res.response.docs;
    const totalResult = Number.parseInt(res.response.numFound, 10);
    if (totalResult === 0) {
      const _err = {
        code: 3,
        message: 'no result found'
      };
      docObject.error = _err;
      return cb(_err);
    }

    // An empty page cannot advance the harvest: fail (through the retry
    // path below) rather than requesting the same cursor forever.
    if (docs.length === 0 && processedFileCount < totalResult) {
      throw new Error('empty result page received before the end of the harvest');
    }

    // First page of the harvest: create the initial output folder.
    if (processedFileCount === 0) {
      outputPath = harvestPath + '/001/001';
      fs.mkdirSync(outputPath, { recursive: true });
    }

    for (const doc of docs) {
      // Rotation is driven by the overall processed count, not by a per-page
      // counter, so folders keep holding `filesPerFolder` files across pages.
      if (processedFileCount > 0 && processedFileCount % filesPerFolder === 0) {
        output.level2++;
        if (output.level2 === level2Limit) {
          output.level1++;
          output.level2 = 1;
        }
        outputPath = harvestPath + '/' + utils.padFolderName(output);
        fs.mkdirSync(outputPath, { recursive: true });
      }
      const writeStream = fs.createWriteStream(outputPath + '/' + doc.halId_s + '.xml', { flags: 'a' });
      const xmlContent = xmlFormatter(doc.label_xml);
      await utils.writeFile(writeStream, xmlContent);
      // A file only counts as written once its stream has finished; the
      // harvest end is evaluated after each of them.
      writeStream.on('finish', function () {
        writtenFilesCount++;
        checkHarvestEnd(docObject, totalResult, cb);
      });
      writeStream.end();
      processedFileCount++;
    }

    // More documents to come: scroll to the next page with a fresh retry budget.
    if (processedFileCount < totalResult) {
      requestByQuery(docObject, query, nextCursorMark, 0, cb);
    }
  }).catch(function (err) {
    if (retryCount < config.max_retry) {
      // Same page again, one retry spent.
      requestByQuery(docObject, query, cursorMark, retryCount + 1, cb);
    } else {
      const _err = {
        code: 2,
        message: err.message
      };
      docObject.error = _err;
      return cb(_err);
    }
  });
}

/**
 * checkHarvestEnd : signal the end of the harvest once every expected file is written.
 * @param  {Object} docObject : receives `corpusRoot` (the harvest path) when the harvest is complete.
 * @param  {Number} total : number of files expected.
 * @callback  cb : invoked without argument when `writtenFilesCount` equals `total`.
 */
function checkHarvestEnd (docObject, total, cb) {
  const isComplete = writtenFilesCount === total;
  if (!isComplete) {
    return undefined;
  }
  docObject.corpusRoot = harvestPath;
  return cb();
}

// Export the business object (carrying doTheJob and finalJob) as this module's interface.
module.exports = business;