'use strict';

const fetch = require('node-fetch');
const fs = require('graceful-fs');
const path = require('path');
const xmlFormatter = require('xml-formatter');
const utils = require('./lib/utils');
const config = require('./config/config');

const business = {};

// Maximum number of files written into one leaf folder before moving to the next one.
const MAX_FILES_PER_FOLDER = 100;
// Value of `output.level2` at which it wraps back to 1 and `output.level1` is incremented.
const LEVEL2_ROLLOVER = 1000;

// Harvest state shared by every function of this module.
// NOTE(review): this state is never reset, so the module presumably handles a
// single harvest per process — confirm against the caller before reusing it.
let outputPath;
let harvestPath;
let processedFileCount = 0;
let writtenFilesCount = 0;
// Files already written into the current leaf folder. Kept at module level so
// the per-folder limit holds across result pages, not just within one page.
let filesInCurrentFolder = 0;
// Indexes of the current two-level output folder, formatted by utils.padFolderName.
const output = {
  level1: 1,
  level2: 1
};

/**
 * Harvest every document modified inside the date range carried by docObject.
 *
 * The harvest folder is `$CORPUSES_ROOT/hal/<corpusName>-<from>-to-<to>` and
 * must not exist yet.
 *
 * @param {Object} docObject - reads `corpusName`, `halHarvestModifiedDateFrom`
 *   and `halHarvestModifiedDateTo`; receives `error` on failure and
 *   `corpusRoot` on success.
 * @param {Function} cb - called with an error object `{ code, _errMsg }` on
 *   failure, or without argument once the harvest is complete.
 */
business.doTheJob = function (docObject, cb) {
  const dateBegin = docObject.halHarvestModifiedDateFrom;
  const dateEnd = docObject.halHarvestModifiedDateTo;

  if (!process.env.CORPUSES_ROOT) {
    return failHarvest(
      docObject,
      'MissingEnvironmentVariable',
      'Environment variable $CORPUSES_ROOT must be set',
      cb
    );
  }

  const folderName = [
    docObject.corpusName,
    utils.getDateYYYYMMDD(new Date(dateBegin)),
    'to',
    utils.getDateYYYYMMDD(new Date(dateEnd))
  ].filter(Boolean).join('-');

  harvestPath = path.join(process.env.CORPUSES_ROOT, 'hal', folderName);

  if (fs.existsSync(harvestPath)) {
    return failHarvest(
      docObject,
      'FolderExistError',
      `${harvestPath} harvest folder already exist`,
      cb
    );
  }

  const query = utils.getQueryUrl();
  // Restrict the search to the range [dateBegin TO dateEnd].
  query.searchParams.append('fq', `modifiedDate_tdate:[${dateBegin} TO ${dateEnd}]`);

  // Not returned on purpose: the outcome is reported through `cb` only.
  requestByQuery(docObject, query, null, 0, cb);
};

/**
 * Report the number of harvested files and the harvest location.
 *
 * @param {Object[]} docObjects - unused.
 * @param {Function} cb - called with `(null, { processLogs, errLogs })`.
 */
business.finalJob = function (docObjects, cb) {
  const finalJobLogs = {
    processLogs: [],
    errLogs: []
  };
  finalJobLogs.processLogs.push(`Harvested TEI-XML file count : ${writtenFilesCount}`);
  finalJobLogs.processLogs.push(`Harvest path : ${harvestPath}`);
  return cb(null, finalJobLogs);
};

/**
 * Record an error on docObject and hand it to the callback.
 *
 * @param {Object} docObject - receives the error in its `error` property.
 * @param {String} code - error code.
 * @param {String} message - error message, stored as `_errMsg`.
 * @param {Function} cb - the callback, called with the error object.
 * @returns {*} whatever `cb` returns.
 */
function failHarvest (docObject, code, message, cb) {
  const err = {
    code,
    _errMsg: message
  };
  docObject.error = err;
  return cb(err);
}

/**
 * Fetch one page of results and validate its shape.
 *
 * @param {URL} query - the base query.
 * @param {String|null} cursorMark - cursor of the page to fetch; a falsy value
 *   requests `query.href` as is.
 * @returns {Promise<{docs: Object[], totalResult: Number, nextCursorMark: String}>}
 * @throws {Error} on network failure, non-2xx status, invalid JSON or a payload
 *   without `response.docs` / a numeric `response.numFound`.
 */
async function fetchPage (query, cursorMark) {
  const target = cursorMark ? utils.getScrollUrl(query, cursorMark) : query.href;
  const fetchOpts = {
    headers: { 'Content-Type': 'application/json' }
  };
  const agent = utils.getProxyAgent();
  if (agent) {
    fetchOpts.agent = agent;
  }

  const res = await fetch(target, fetchOpts);
  if (!res.ok) {
    throw new Error(`Unexpected HTTP status ${res.status} ${res.statusText}`);
  }

  const body = await res.json();
  const response = body && body.response;
  if (!response || !Array.isArray(response.docs)) {
    throw new Error('Malformed search response: missing response.docs');
  }

  const totalResult = Number.parseInt(response.numFound, 10);
  if (Number.isNaN(totalResult)) {
    throw new Error('Malformed search response: response.numFound is not a number');
  }

  return {
    docs: response.docs,
    totalResult,
    nextCursorMark: body.nextCursorMark
  };
}

/**
 * Append content to a file and wait until the stream has flushed it.
 *
 * @param {String} filePath - destination file.
 * @param {String} content - text to write.
 * @returns {Promise<void>} resolved on the stream 'finish' event.
 * @throws {Error} the stream error, or whatever utils.writeFile rejects with.
 */
async function writeDocument (filePath, content) {
  const writeStream = fs.createWriteStream(filePath, { flags: 'a' });
  // Listen from the start: an 'error' event without listener would otherwise
  // be thrown as an uncaught exception.
  const closed = new Promise((resolve, reject) => {
    writeStream.once('error', reject);
    writeStream.once('finish', resolve);
  });

  try {
    // Racing against `closed` stops the wait as soon as the stream fails.
    // NOTE(review): utils.writeFile is assumed to settle once the content has
    // been handed to the stream — confirm in lib/utils.
    await Promise.race([utils.writeFile(writeStream, content), closed]);
    writeStream.end();
    await closed;
  } catch (err) {
    writeStream.destroy();
    throw err;
  }
}

/**
 * Write every document of a page, moving to a new leaf folder each time the
 * current one holds MAX_FILES_PER_FOLDER files.
 *
 * @param {Object[]} docs - documents exposing `halId_s` and `label_xml`.
 * @returns {Promise<void>}
 */
async function writeDocs (docs) {
  for (const doc of docs) {
    if (filesInCurrentFolder === MAX_FILES_PER_FOLDER) {
      filesInCurrentFolder = 0;
      output.level2++;
      if (output.level2 === LEVEL2_ROLLOVER) {
        output.level1++;
        output.level2 = 1;
      }
      outputPath = path.join(harvestPath, utils.padFolderName(output));
      fs.mkdirSync(outputPath, { recursive: true });
    }

    const xmlContent = xmlFormatter(doc.label_xml);
    await writeDocument(path.join(outputPath, `${doc.halId_s}.xml`), xmlContent);

    processedFileCount++;
    writtenFilesCount++;
    filesInCurrentFolder++;
  }
}

/**
 * requestByQuery : search XML-TEI files and harvest them, page after page.
 *
 * Only the fetch stage is retried. A failure while writing is reported
 * immediately, because replaying the page would append a second time to the
 * files already written and inflate the counters.
 *
 * @param {Object} docObject - the docObject; receives `error` or `corpusRoot`.
 * @param {URL} query - the search query.
 * @param {String|null} cursorMark - cursor used for scrolling, null for the first page.
 * @param {Number} retryCount - retries already done for this page; a page is
 *   retried while it is below `config.max_retry`.
 * @param {Function} cb - called once, with an error object or without argument.
 * @returns {Promise<*>}
 */
async function requestByQuery (docObject, query, cursorMark, retryCount, cb) {
  let page;
  try {
    page = await fetchPage(query, cursorMark);
  } catch (err) {
    if (retryCount < config.max_retry) {
      return requestByQuery(docObject, query, cursorMark, retryCount + 1, cb);
    }
    return failHarvest(docObject, 'FetchError', err.message, cb);
  }

  const { docs, totalResult, nextCursorMark } = page;

  if (totalResult === 0) {
    return failHarvest(docObject, 'NoResultFound', 'no result found to harvest', cb);
  }

  try {
    if (processedFileCount === 0) {
      outputPath = path.join(harvestPath, '001', '001');
      fs.mkdirSync(outputPath, { recursive: true });
    }
    await writeDocs(docs);
  } catch (err) {
    // Reported under the historical 'FetchError' code so that consumers
    // matching on it keep working.
    return failHarvest(docObject, 'FetchError', err.message, cb);
  }

  // The total may change while scrolling, so the cursor is also checked: an
  // empty page or a missing/unchanged cursor means there is nothing left.
  const cursorExhausted = docs.length === 0 || !nextCursorMark || nextCursorMark === cursorMark;
  if (processedFileCount < totalResult && !cursorExhausted) {
    return requestByQuery(docObject, query, nextCursorMark, 0, cb);
  }

  // Every write has been awaited, so the harvest is complete at this point.
  docObject.corpusRoot = harvestPath;
  return cb();
}

module.exports = business;