// li-harvest-hal / index.js
'use strict';

const fetch = require('node-fetch');
const fs = require('graceful-fs');
const path = require('path');
const utils = require('./lib/utils');
const config = require('./config/config');

// Object exported by this module; the job entry points are attached to it below.
const business = {};

// Folder currently receiving the written files.
let outputPath;
// Root folder of the running harvest.
let harvestPath;

// Number of documents handed to a write stream so far.
let processedFileCount = 0;
// Number of write streams that have emitted 'finish' so far.
let writtenFilesCount = 0;

// Indexes of the current two-level output sub-folder (both start at 1).
const output = { level1: 1, level2: 1 };

/**
 * doTheJob : validate the environment, compute the harvest folder and start
 * the harvest for the modification-date range carried by the docObject.
 * @param  {Object} docObject : reads `harvestModifiedDateFrom`,
 *   `harvestModifiedDateTo` and `corpusName`; receives `error` on failure.
 * @callback  cb : called with an error object when a precondition fails,
 *   otherwise handed over to requestByQuery.
 */
business.doTheJob = function (docObject, cb) {
  const {
    harvestModifiedDateFrom: dateBegin,
    harvestModifiedDateTo: dateEnd
  } = docObject;

  // Record the failure on the docObject and forward it to the callback.
  const abort = (code, _errMsg) => {
    const failure = { code, _errMsg };
    docObject.error = failure;
    return cb(failure);
  };

  const corpusesRoot = process.env.CORPUSES_ROOT;
  if (!corpusesRoot) {
    return abort('MissingEnvironmentVariable', 'Environment variable $CORPUSES_ROOT must be set');
  }

  // Empty parts (e.g. a missing corpus name) are dropped from the folder name.
  const nameParts = [
    docObject.corpusName,
    utils.getDateYYYYMMDD(new Date(dateBegin)),
    'to',
    utils.getDateYYYYMMDD(new Date(dateEnd))
  ];
  harvestPath = path.join(corpusesRoot, 'hal', nameParts.filter(Boolean).join('-'));

  // Refuse to harvest into a folder that is already present.
  if (fs.existsSync(harvestPath)) {
    return abort('FolderExistError', `${harvestPath} harvest folder already exist`);
  }

  // Restrict the search to documents modified within [dateBegin TO dateEnd].
  const query = utils.getQueryUrl();
  const modifiedRange = `[${dateBegin} TO ${dateEnd}]`;
  query.searchParams.append('fq', `modifiedDate_tdate:${modifiedRange}`);
  requestByQuery(docObject, query, null, 0, cb);
};

/**
 * finalJob : report the harvest summary.
 * @param  {Array} docObjects : not used by this step.
 * @callback  cb : called with (null, logs) where logs holds `processLogs`
 *   (written file count and harvest path) and an empty `errLogs`.
 */
business.finalJob = function (docObjects, cb) {
  const summary = {
    processLogs: [
      `Harvested TEI-XML file count : ${writtenFilesCount}`,
      `Harvest path : ${harvestPath}`
    ],
    errLogs: []
  };
  return cb(null, summary);
};

/**
 * requestByQuery : fetch one page of search results, write each returned
 * XML-TEI document to disk, then request the following page until every
 * announced result has been processed.
 *
 * Files are spread over sub-folders of the harvest path: a new folder is opened
 * every 100 files, counted over the whole harvest (not per page).
 *
 * @param  {Object} docObject : the docObject; receives `error` on failure.
 * @param  {URL} query : search URL; `href` is requested for the first page.
 * @param  {String} cursorMark : used for scrolling, falsy (null) for the first page.
 * @param  {Number} retryCount : number of retries already done for this page;
 *   a failed request is retried while it is below `config.maxRetry`.
 * @callback  cb : called with an error object on failure; on success it is
 *   called through checkHarvestEnd once a write stream finishes.
 * @returns {Promise} settles once this page has been handled.
 */
function requestByQuery (docObject, query, cursorMark, retryCount, cb) {
  const FILES_PER_FOLDER = 100;
  const target = cursorMark ? utils.getScrollUrl(query, cursorMark) : query.href;
  const fetchOpts = {
    headers: {
      'Content-Type': 'application/json'
    }
  };

  const agent = utils.getProxyAgent();
  if (agent) {
    fetchOpts.agent = agent;
  }

  // Set once the page has been fetched and parsed. A failure after that point
  // must not trigger a retry: files are opened in append mode, so replaying
  // the page would duplicate their content and skew the counters.
  let pageReceived = false;

  return fetch(target, fetchOpts).then((res) => res.json()).then(async function (res) {
    const nextCursorMark = res.nextCursorMark;
    const docs = res.response.docs;
    const docCount = docs.length;
    const totalResult = Number.parseInt(res.response.numFound, 10);
    pageReceived = true;

    if (totalResult === 0) {
      const _err = {
        code: 'NoResultFound',
        _errMsg: 'no result found to harvest'
      };
      docObject.error = _err;
      return cb(_err);
    }

    // Very first document of the harvest: create the initial output folder.
    if (processedFileCount === 0) {
      outputPath = harvestPath + '/001/001';
      fs.mkdirSync(outputPath, { recursive: true });
    }

    for (let i = 0; i < docCount; i++) {
      const doc = docs[i];

      // Open the next folder every FILES_PER_FOLDER files, based on the
      // harvest-wide counter so the rotation carries on across pages.
      if (processedFileCount > 0 && processedFileCount % FILES_PER_FOLDER === 0) {
        output.level2++;
        if (output.level2 === 1000) {
          output.level1++;
          output.level2 = 1;
        }
        outputPath = harvestPath + '/' + utils.padFolderName(output);
        fs.mkdirSync(outputPath, { recursive: true });
      }

      const writeStream = fs.createWriteStream(outputPath + '/' + doc.halId_s + '.xml', { flags: 'a' });
      await utils.writeFile(writeStream, doc.label_xml);
      // The harvest is complete when the last stream has been flushed.
      writeStream.on('finish', function () {
        writtenFilesCount++;
        checkHarvestEnd(docObject, totalResult, cb);
      });
      writeStream.end();
      processedFileCount++;
    }

    if (processedFileCount < totalResult) {
      requestByQuery(docObject, query, nextCursorMark, 0, cb);
    }
  }).catch(function (err) {
    if (!pageReceived && retryCount < config.maxRetry) {
      requestByQuery(docObject, query, cursorMark, retryCount + 1, cb);
      return;
    }
    const _err = {
      code: 'FetchError',
      _errMsg: err.message
    };
    docObject.error = _err;
    return cb(_err);
  });
}

/**
 * checkHarvestEnd : finish the job once every expected file has been written.
 * @param  {Object} docObject : receives `corpusRoot` (the harvest path) when done.
 * @param  {Number} total : number of files expected for the harvest.
 * @callback  cb : called without argument only when the written file count
 *   equals `total`.
 */
function checkHarvestEnd (docObject, total, cb) {
  const harvestComplete = writtenFilesCount === total;
  if (!harvestComplete) {
    return;
  }
  docObject.corpusRoot = harvestPath;
  return cb();
}

module.exports = business;