diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7e231d0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +node_modules/ +*.gz diff --git a/README.md b/README.md index 87d0705..9f5abc8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,20 @@ -get-corhal -=============== +# get-corhal -Get all Conditor corpus using corhal-api.inist.fr \ No newline at end of file +Get all Conditor corpus using corhal-api.inist.fr + +## Requirements + +Node 14.15+ + +## Installation + +```bash +npm install +``` + +## Usage + +```bash +$ ./index.mjs | gzip - > data.json.gz + ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 0% | ETA: 17698s | 10000/1943464 +``` diff --git a/index.mjs b/index.mjs new file mode 100755 index 0000000..163ce0d --- /dev/null +++ b/index.mjs @@ -0,0 +1,35 @@ +#!/usr/bin/env node +import fetch from "node-fetch"; +import "cli-progress"; +import cliProgress from "cli-progress"; + +const bar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_grey); +let notices = []; +let afterKeyToken; + +const response = await fetch("https://corhal-api.inist.fr/mergedDocuments?size=50"); +const data = /** @type {object[]} */(await response.json()); +notices = [...notices, ...data]; +const totalCount = Number(response.headers.get("x-total-count")); +let resultCount = notices.length; +bar.start(totalCount, resultCount); +afterKeyToken = response.headers.get("after-key-token"); + +const max = 10_000_000; +// const max = 300; +// const max = 10_000; + +while (afterKeyToken && resultCount < max) { + const response = await fetch("https://corhal-api.inist.fr/mergedDocuments?size=50"); + const data = /** @type {object[]} */(await response.json()); + notices = [...notices, ...data]; + const totalCount = response.headers.get("x-total-count"); + resultCount = notices.length; + bar.update(resultCount); + afterKeyToken = response.headers.get("after-key-token"); + // console.error({ resultCount, totalCount, afterKeyToken }); +} + +bar.stop(); + +console.log(JSON.stringify(notices) + "\n"); diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..b92afb3 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,241 @@ +{ + "name": "get-corhal", + "version": "1.0.0", + "lockfileVersion": 2, + "requires": true, + "packages": { + "": { + "version": "1.0.0", + "license": "MIT", + "dependencies": { + "cli-progress": "^3.11.1", + "node-fetch": "^3.2.6" + } + }, + "node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "engines": { + "node": ">=8" + } + }, + "node_modules/cli-progress": { + "version": "3.11.1", + "resolved": "https://registry.npmjs.org/cli-progress/-/cli-progress-3.11.1.tgz", + "integrity": "sha512-TTMA2LHrYaZeNMcgZGO10oYqj9hvd03pltNtVbu4ddeyDTHlYV7gWxsFiuvaQlgwMBFCv1TukcjiODWFlb16tQ==", + "dependencies": { + "string-width": "^4.2.3" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/data-uri-to-buffer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.0.tgz", + "integrity": "sha512-Vr3mLBA8qWmcuschSLAOogKgQ/Jwxulv3RNE4FXnYWRGujzrRWQI4m12fQqRkwX06C0KanhLr4hK+GydchZsaA==", + "engines": { + "node": ">= 12" + } + }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==" + }, + "node_modules/fetch-blob": { + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.1.5.tgz", + "integrity": "sha512-N64ZpKqoLejlrwkIAnb9iLSA3Vx/kjgzpcDhygcqJ2KKjky8nCgUQ+dzXtbrLaWZGZNmNfQTsiQ0weZ1svglHg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "paypal", + "url": "https://paypal.me/jimmywarting" + } + ], + "dependencies": { + "node-domexception": "^1.0.0", + "web-streams-polyfill": "^3.0.3" + }, + "engines": { + "node": "^12.20 || >= 14.13" + } + }, + "node_modules/formdata-polyfill": { + "version": "4.0.10", + "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz", + "integrity": "sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==", + "dependencies": { + "fetch-blob": "^3.1.2" + }, + "engines": { + "node": ">=12.20.0" + } + }, + "node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "engines": { + "node": ">=8" + } + }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.2.6.tgz", + "integrity": "sha512-LAy/HZnLADOVkVPubaxHDft29booGglPFDr2Hw0J1AercRh01UiVFm++KMDnJeH9sHgNB4hsXPii7Sgym/sTbw==", + "dependencies": { + "data-uri-to-buffer": "^4.0.0", + "fetch-blob": "^3.1.4", + "formdata-polyfill": "^4.0.10" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/node-fetch" + } + }, + "node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/web-streams-polyfill": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.2.1.tgz", + "integrity": "sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==", + "engines": { + "node": ">= 8" + } + } + }, + "dependencies": { + "ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==" + }, + "cli-progress": { + "version": "3.11.1", + "resolved": "https://registry.npmjs.org/cli-progress/-/cli-progress-3.11.1.tgz", + "integrity": "sha512-TTMA2LHrYaZeNMcgZGO10oYqj9hvd03pltNtVbu4ddeyDTHlYV7gWxsFiuvaQlgwMBFCv1TukcjiODWFlb16tQ==", + "requires": { + "string-width": "^4.2.3" + } + }, + "data-uri-to-buffer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.0.tgz", + "integrity": "sha512-Vr3mLBA8qWmcuschSLAOogKgQ/Jwxulv3RNE4FXnYWRGujzrRWQI4m12fQqRkwX06C0KanhLr4hK+GydchZsaA==" + }, + "emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==" + }, + "fetch-blob": { + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.1.5.tgz", + "integrity": "sha512-N64ZpKqoLejlrwkIAnb9iLSA3Vx/kjgzpcDhygcqJ2KKjky8nCgUQ+dzXtbrLaWZGZNmNfQTsiQ0weZ1svglHg==", + "requires": { + "node-domexception": "^1.0.0", + "web-streams-polyfill": "^3.0.3" + } + }, + "formdata-polyfill": { + "version": "4.0.10", + "resolved": "https://registry.npmjs.org/formdata-polyfill/-/formdata-polyfill-4.0.10.tgz", + "integrity": "sha512-buewHzMvYL29jdeQTVILecSaZKnt/RJWjoZCF5OW60Z67/GmSLBkOFM7qh1PI3zFNtJbaZL5eQu1vLfazOwj4g==", + "requires": { + "fetch-blob": "^3.1.2" + } + }, + "is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==" + }, + "node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==" + }, + "node-fetch": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-3.2.6.tgz", + "integrity": "sha512-LAy/HZnLADOVkVPubaxHDft29booGglPFDr2Hw0J1AercRh01UiVFm++KMDnJeH9sHgNB4hsXPii7Sgym/sTbw==", + "requires": { + "data-uri-to-buffer": "^4.0.0", + "fetch-blob": "^3.1.4", + "formdata-polyfill": "^4.0.10" + } + }, + "string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "requires": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + } + }, + "strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "requires": { + "ansi-regex": "^5.0.1" + } + }, + "web-streams-polyfill": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.2.1.tgz", + "integrity": "sha512-e0MO3wdXWKrLbL0DgGnUV7WHVuw9OUvL4hjgnPkIeEvESk74gAITi5G606JtZPp39cd8HA9VQzCIvA49LpPN5Q==" + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..1e20471 --- /dev/null +++ b/package.json @@ -0,0 +1,22 @@ +{ + "name": "get-corhal", + "version": "1.0.0", + "description": "Get all Conditor corpus using corhal-api.inist.fr", + "main": "index.mjs", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "repository": { + "type": "git", + "url": "ssh://git@gitbucket.inist.fr:22222/parmentf/get-corhal.git" + }, + "keywords": [ + "conditor" + ], + "author": "François Parmentier", + "license": "MIT", + "dependencies": { + "cli-progress": "^3.11.1", + "node-fetch": "^3.2.6" + } +}