diff --git a/doiwos-dumps-config.json b/doiwos-dumps-config.json new file mode 100644 index 0000000..3fc9e72 --- /dev/null +++ b/doiwos-dumps-config.json @@ -0,0 +1,23 @@ +{ + "files": { + "zip": "https://gitbucket.inist.fr/parmentf/web-dumps/archive/doiswos-dumps/doswos-dumps@1.0.0.zip" + }, + "environnement": { + "CRON_VERBOSE": true, + "EZS_VERBOSE": false, + "NODE_OPTIONS": "--max_old_space_size=1024", + "WOS_API_KEY": "PUT HERE A VALID WOS API KEY" + }, + "packages": [ + "@ezs/core@2.1.0", + "@ezs/basics@1.22.3", + "@ezs/analytics@2.0.2" + ], + "tasks": [ + { + "CronRule": "0 1 * * *", + "Target": "data/09-corpus-simple-cnrs.json", + "RunOnStartup": true + } + ] +} \ No newline at end of file diff --git a/doiwos-dumps/00-dois.txt b/doiwos-dumps/00-dois.txt new file mode 100644 index 0000000..7f1b201 --- /dev/null +++ b/doiwos-dumps/00-dois.txt @@ -0,0 +1,100 @@ +10.1332/0305573012501251 +10.1111/j.1540-6210.2011.02488.x +https://doi.org/10.1016/j.gloenvcha.2015.01.004 +10.1038/ngeo1391 +10.1007/s10980-016-0365-y +10.5194/gmd-10-3979-2017 +https://doi.org/10.1007/s11205-017-1650-0 +10.1111/j.1466- 8238.2010.00540.x +10.1038/s41558-018-0156-3 +https://doi.org/10.3390/su4040656 +10.1257/jel.38.3.595 +10.1126/science.1234485 +10.1038/534320a +https://doi.org/10.1016/j.envsci.2015.11.011 +10.1080/09640568.2016.1168288 +https://doi.org/10.1002/wcc.485 +10.1016/j.gloenvcha.2016.05.009 +10.1088/1748- 9326/9/6/064030 +10.1029/2011GL050087 +10.1175/BAMS-D-17-0157.1 +10.5194/gmd-10-4035-2017 +https://doi.org/10.1038/s41893-018-0021-4 +10.1007/s10712-019-09525-z +10.1029/2007GC001784 +10.1093/ajcn/84.1.289 +https://doi.org/10.1016/j.cosust.2014.11.002 +10.1111/area.12189 +10.1007/s10113-018-1288-8 +10.1111/rec.12359 +10.1126/science.aap8826 +10.1007/s10584-013-0905-2 +http://dx.doi.org/10.1146/annurev.psych.52.1.141 +10.1210/jc.2008-1595 +https://doi.org/10.1186/s13021-018-0095-3 +10.1007/s10584-018-2317-9 +10.1038/nature08823 +10.1890/070062 +10.1016/j. 
landurbplan.2014.10.018 +10.1111/tgis.12265 +10.1111/psj.12212 +10.1016/j.ecolecon.2006.09.026 +doi:10.1073/pnas.0906974107 +10.1016/0967-0637(95)00012-U +doi:10.1093/biosci/biu225 +10.1016/j.gloenvcha.2011.10.003 +10.1111/gcb.13988 +10.1111/j.1745- 6584.1998.tb02825.x. +10.1023/A:1005582929198 +http://www.jstor.org/stable/27800489 +10.1007/s00035-011-0094-4 +10.5194/cp-12-663-2016 +10.17645/pag.v6i1.1169 +10.1007/s10584-011-0148-z +10.1659/MRD-JOURNAL-D-17-00107.1 +10.1111/1365-2664.12643 +10.1371/journal.pone.0068037 +papers3://publication/doi/10.1016/j.foreco.2017.08.016 +10.5194/esd-9-1085-2018 +10.1016/j.palaeo.2011.04.005 +10.1007/s40808-018-0517-y +10.1038/s41598-019-42792-9 +10.1126/science.aac7125 +10.1038/s41598-019-57280-3 +10.1111/ddi.12556 +10.1038/nature17155 http://www.nature.com/nature/journal/v531/n7594/abs/nature17155.html#supplementary-information +http://dx.doi.org/10.1016/j.palaeo.2006.11.050 +10.15242/IICBE.C0415015 +10.1126/science.aac8083 +10.1007/s10531-016-1071-4 +10.1007/s10531-019-01827-3 +10.1038/s41467-017-00923-8 +10.1126/science.1081056 +10.1007/s00338-021-02170-2 +10.1098/rspb.2015.1211 +papers3://publication/doi/10.3390/f9100596 +papers3://publication/doi/10.1007/s40641-015-0002-x +10.3390/sci2030067 +10.1111/gcb.12754 +10.1073/pnas.1621517114 +10.1098/rspb.2011.1897 +papers3://publication/doi/10.1002/eco.1997 +10.1371/journal.pone.0188142 +10.1146/annurev-environ-110615-085610 +10.1073/pnas.0804478105 +https://doi.org/10.1016/j.cub.2012.09.036 +http://dx.doi.org/10.1016/j.palaeo.2016.01.033 +doi.org/10.1111/2041-210X.13675 +10.3389/fmars.2019.00499 +10.1016/j.tree.2006.09.010 +doi:10.1111/j.1365-2699.2006.01482.x +doi:10.1111/j.1466-822X.2005.00182.x +https://doi.org/10.1016/j.palaeo.2009.10.001 +10.1038/ngeo905 https://www.nature.com/articles/ngeo905#supplementary-information +10.1671/1952-4 +10.1029/2008gl035028 +http://dx.doi.org/10.1016/j.chemgeo.2013.10.012 +10.1002/ece3.2196 +10.1016/j.biocon.2016.10.007 
+10.1016/j.biocon.2018.03.037 +10.1038/srep26513 diff --git a/doiwos-dumps/01-harvest-from-doi.ini b/doiwos-dumps/01-harvest-from-doi.ini new file mode 100644 index 0000000..b3049d9 --- /dev/null +++ b/doiwos-dumps/01-harvest-from-doi.ini @@ -0,0 +1,85 @@ +; Usage: npx ezs 01-harvest-from-doi.ini < 00-dois.txt > data/01-corpus.json + +; Inspired by https://gitbucket.inist.fr/tdm/web-services/blob/master/biblio-tools/v1/wos/works/expand.ini + +[use] +plugin = @ezs/basics +plugin = @ezs/analytics + +# Flow configuration +[TXTParse] + +[env] +; At most 90 (theoretically 100, but a single DOI may return several records). +; Sometimes a "Request Header Fields Too Large" error occurs, probably +; caused by a URL that is too long (too many DOIs that are too long). +path = size +value = 50 + +path = indent +value = true + +path = token +value = env('WOS_API_KEY') + +# Clean up DOIs +# - lower case +# - remove BOM +# - from CRLF to LF +# - remove - at the end of a DOI +# - replace middle point with normal point +# - replace en dash with normal hyphen-minus +# - remove soft hyphen (discretionary hyphen) +# - remove "http" part of a DOI +# - remove "doi:" part of a DOI +# - remove URL following a DOI +# - remove duplicates of a DOI +# - remove PMID following DOI +# - remove part preceding a DOI +# - remove blanks +# - remove ;subjmeta part of a DOI +# - remove double quotes from DOIs +[replace] +path = value +value = toLower().replace(/^\uFEFF/, "").replace(/\r/,"").replace(/-$/, "").replace(/\u00B7/g, ".").replace(/\u2013/g, "-").replace(/\u00AD/g, "").replace(/^.*doi(\.org)?\/(.+)/g, "$2").replace(/^.*doi[: ]+10\./g, "10.").replace(/^(10\.\d+\/.+\S) https?:\/\/.+/, "$1").replace(/^(10\.\d+\/\S+) \1.*/, "$1").replace(/^(10\.\d+\/\S+) pmid.+/, "$1").replace(/^.* (10\.\d+\/.+)/, "$1").replace(/ /g, "").replace(/;subjmeta=\d+$/,"").replace(/"+/g, "") + +[group] +size = env('size') + +[replace] +path = dois +value = self().map('value').filter(Boolean).map(x => JSON.stringify(x)).join(' OR ') + +[replace] +path = 
usrQuery +value = fix('DO=(', self.dois ,')').join('') + +path = databaseId +value = WOK + +path = count +value = env('size') + +path = firstRecord +value = 1 + +; FR = Full Record +path = optionView +value = FR + +# At most 1 request per second +[throttle] +bySecond = 1 + +; [debug] + +[URLStream] +url = https://wos-api.clarivate.com/api/wos/ +header = env('token').prepend('X-ApiKey:') +path = Data.Records.records.REC.* +timeout = 50000 +noerror = true +retries = 1 + +[dump] +indent = env('indent') diff --git a/doiwos-dumps/02-extract-fields.ini b/doiwos-dumps/02-extract-fields.ini new file mode 100644 index 0000000..f9f5087 --- /dev/null +++ b/doiwos-dumps/02-extract-fields.ini @@ -0,0 +1,55 @@ +# Extract a few fields from a WoS file. +# Usage: +# npx ezs 02-extract-fields.ini < data/01-corpus.json | jq + +[use] +plugin = basics + +[JSONParse] + +[assign] +path = tmp.reprint_addresses +value = get('static_data.fullrecord_metadata.reprint_addresses.address_name', []).castArray().map(name => name.address_spec.full_address) + +path = tmp.addresses +value = get('static_data.fullrecord_metadata.addresses.address_name', []).castArray().map(name => name.address_spec.full_address) + +[replace] +path = uri +value = get("UID") + +path = doi +value = get("dynamic_data.cluster_related.identifiers.identifier").filter(i => i.type === "doi").map("value").uniq() + +path = title +value = get('static_data.summary.titles.title').filter(title => title.type === "item").map(title => title.content).join('|') + +path = abstract +value = get('static_data.fullrecord_metadata.abstracts.abstract.abstract_text.p', []).castArray().map(s => String(s).replace(/<[^>]*>/g, "").replace("Key Points", "")).join(" ; ") + +path = publication_year +value = get('static_data.summary.pub_info.pubyear') + +path = source +value = get('static_data.summary.titles.title').filter(title => title.type === "source").map(title => title.content).join('|') + +path = affiliations +value = get('tmp.addresses', 
[]).concat(_.get(self, 'tmp.reprint_addresses')).remove(null).uniq() + +path = countries +value = get('static_data.fullrecord_metadata.addresses.address_name', []).castArray().map("address_spec.country", []).uniq() + +path = keywords +value = get('static_data.item.keywords_plus.keyword') + +path = subjects +value = get('static_data.fullrecord_metadata.category_info.subjects.subject').map('content').uniq() + +path = subheadings +value = get('static_data.fullrecord_metadata.category_info.subheadings.subheading').castArray().uniq() + +path = headings +value = get('static_data.fullrecord_metadata.category_info.headings.heading').castArray().uniq() + +[dump] +indent = true diff --git a/doiwos-dumps/03-enrich-rnsr.ini b/doiwos-dumps/03-enrich-rnsr.ini new file mode 100644 index 0000000..90a896d --- /dev/null +++ b/doiwos-dumps/03-enrich-rnsr.ini @@ -0,0 +1,33 @@ +# Usage: npx ezs 03-enrich-rnsr.ini < data/02-corpus-simple.json > data/03-corpus-simple-rnsr.json + +# To see the enriched records: +# $ fx < data/03-corpus-simple-rnsr.json '.filter(o => o.ws.rnsr[0]?.length)' + +[use] +# JSONParse URLConnect +plugin = basics +# aggregate exploding +plugin = analytics + +[JSONParse] + +[assign] +path = ws.rnsr +value = get('affiliations').map(address => ({ year: _.get(self, 'publication_year'), address})) + +[expand] +path = ws.rnsr +size = 100 + +[expand/exploding] + +[expand/URLConnect] +url = https://affiliations-tools.services.inist.fr/v1/rnsr/info +retries = 3 +timeout = 6000000 +noerror = true + +[expand/aggregate] + +[dump] +indent = true diff --git a/doiwos-dumps/04-enrich-etab.ini b/doiwos-dumps/04-enrich-etab.ini new file mode 100644 index 0000000..be0062a --- /dev/null +++ b/doiwos-dumps/04-enrich-etab.ini @@ -0,0 +1,19 @@ +# Usage: npx ezs 04-enrich-etab.ini < data/03-corpus-simple-rnsr.json > data/04-corpus-simple-etab.json + +# To see the enriched records: +# $ fx < data/04-corpus-simple-etab.json '.filter(o => o.ws.rnsr[0]?.length)' + + +[use] +# JSONParse +plugin = basics + 
+[JSONParse] + +# Concatenate the acronym (sigle) and label (intitule) of each associated institution +[assign] +path = ws.etab +value = get('ws.rnsr', []).map(structs => Array.isArray(structs) ? structs.map(struct => `${struct.sigle}: ${struct.intitule}`) : []).flatten().uniq() + +[dump] +indent = true diff --git a/doiwos-dumps/05-enrich-institutes.ini b/doiwos-dumps/05-enrich-institutes.ini new file mode 100644 index 0000000..dd04c73 --- /dev/null +++ b/doiwos-dumps/05-enrich-institutes.ini @@ -0,0 +1,48 @@ +# Usage: npx ezs 05-enrich-institutes.ini < data/04-corpus-simple-etab.json > data/05-corpus-simple-instituts.json + +# To see the enriched records: +# $ fx < data/05-corpus-simple-instituts.json '.filter(o => o.ws.rnsr[0]?.length)' + + +[use] +# JSONParse URLConnect +plugin = basics +# expand aggregate exploding +plugin = analytics + +[JSONParse] + +# - CNRS institutes + +[assign] +path = ws.rnsr_id +value = get("ws.rnsr").map(structs => Array.isArray(structs) ? structs.map(struct => struct.num_nat_struct) : "n/a").filter((id) => id !== "n/a").flatten().uniq() + +[assign] +path = ws.instituts +value = get("ws.rnsr_id") + +[expand] +path = ws.instituts +size = 1 + +[expand/exploding] + +[expand/expand] +path = value +size = 100 + +[expand/expand/URLConnect] +url = https://mapping-tools.services.inist.fr/v1/rnsr/instituts-cnrs/json +retries = 3 +timeout = 6000000 +noerror = true + +[expand/aggregate] + +[assign] +path = ws.instituts +value = get('ws.instituts').uniq() + +[dump] +indent = true diff --git a/doiwos-dumps/06-enrich-teeft-en.ini b/doiwos-dumps/06-enrich-teeft-en.ini new file mode 100644 index 0000000..4e88690 --- /dev/null +++ b/doiwos-dumps/06-enrich-teeft-en.ini @@ -0,0 +1,33 @@ +# Usage: npx ezs 06-enrich-teeft-en.ini < data/05-corpus-simple-instituts.json > data/06-corpus-simple-teeft-en.json + +# To see the enriched records: +# $ fx < data/06-corpus-simple-teeft-en.json '.filter(o => o.ws.rnsr[0]?.length)' + + +[use] +# JSONParse URLConnect +plugin = basics +# expand +plugin = 
analytics + +[JSONParse] + +# - Teeft + + +[assign] +path = ws.teeft +value = fix(_.get(self, "abstract", ""), _.get(self, "title")).join(" ; ") + +[expand] +path = ws.teeft +size = 100 + +[expand/URLConnect] +url = https://terms-extraction.services.inist.fr/v1/teeft/en?nb=10 +retries = 3 +timeout = 6000000 +noerror = true + +[dump] +indent = true diff --git a/doiwos-dumps/07-enrich-pascal.ini b/doiwos-dumps/07-enrich-pascal.ini new file mode 100644 index 0000000..1fa07ff --- /dev/null +++ b/doiwos-dumps/07-enrich-pascal.ini @@ -0,0 +1,34 @@ +# Usage: npx ezs 07-enrich-pascal.ini < data/06-corpus-simple-teeft-en.json > data/07-corpus-simple-pascal.json + + +# To see the enriched records: +# $ fx < data/07-corpus-simple-pascal.json '.filter(o => o.ws.rnsr[0]?.length)' + + +[use] +# JSONParse URLConnect +plugin = basics +# expand +plugin = analytics + +[JSONParse] + +# - Pascal + + +[assign] +path = ws.pascal +value = fix(_.get(self, "abstract", ""), _.get(self, "title")).join(" ; ") + +[expand] +path = ws.pascal +size = 100 + +[expand/URLConnect] +url = https://domains-classifier.services.inist.fr/v1/en/classify +retries = 3 +timeout = 6000000 +noerror = true + +[dump] +indent = true diff --git a/doiwos-dumps/08-extract-pascal-label.ini b/doiwos-dumps/08-extract-pascal-label.ini new file mode 100644 index 0000000..d33160b --- /dev/null +++ b/doiwos-dumps/08-extract-pascal-label.ini @@ -0,0 +1,22 @@ +# Usage: npx ezs 08-extract-pascal-label.ini < data/07-corpus-simple-pascal.json > data/08-corpus-simple-label-pascal.json + + +# To see the enriched records: +# $ fx < data/08-corpus-simple-label-pascal.json '.filter(o => o.ws.rnsr[0]?.length)' + + +[use] +# JSONParse +plugin = basics + +[JSONParse] + +# Action! 
+ +[assign] +path = ws.pascal +value = fix(_.get(self, "ws.pascal.1.code.value", "").replace("BGI / Prodig", "Géographie"), _.get(self, "ws.pascal.2.code.value")).join(" - ") + + +[dump] +indent = true diff --git a/doiwos-dumps/09-detect-cnrs.ini b/doiwos-dumps/09-detect-cnrs.ini new file mode 100644 index 0000000..8a1252b --- /dev/null +++ b/doiwos-dumps/09-detect-cnrs.ini @@ -0,0 +1,18 @@ +# Usage: npx ezs 09-detect-cnrs.ini < data/08-corpus-simple-label-pascal.json > data/09-corpus-simple-cnrs.json + +# To see the enriched records: +# $ fx < data/09-corpus-simple-cnrs.json '.filter(o => o.ws.rnsr[0]?.length)' + + +[use] +# JSONParse +plugin = basics + +[JSONParse] + +[assign] +path = ws.cnrs +value = get('affiliations').filter(a => a.includes('CNRS') || a.includes('UMR')).thru(affiliations => affiliations.length ? "oui" : "non") + +[dump] +indent = true diff --git a/doiwos-dumps/Makefile b/doiwos-dumps/Makefile new file mode 100644 index 0000000..61cefae --- /dev/null +++ b/doiwos-dumps/Makefile @@ -0,0 +1,31 @@ +# This ensures the next time you run Make, it’ll properly re-run the failed +# rule, and guards against broken files. 
+# See https://tech.davis-hansson.com/p/make/#change-some-make-defaults +.DELETE_ON_ERROR: + +data/09-corpus-simple-cnrs.json: data/08-corpus-simple-label-pascal.json + npx ezs 09-detect-cnrs.ini < $< > $@ + +data/08-corpus-simple-label-pascal.json: data/07-corpus-simple-pascal.json + npx ezs 08-extract-pascal-label.ini < $< > $@ + +data/07-corpus-simple-pascal.json: data/06-corpus-simple-teeft-en.json + npx ezs 07-enrich-pascal.ini < $< > $@ + +data/06-corpus-simple-teeft-en.json: data/05-corpus-simple-instituts.json + npx ezs 06-enrich-teeft-en.ini < $< > $@ + +data/05-corpus-simple-instituts.json: data/04-corpus-simple-etab.json + npx ezs 05-enrich-institutes.ini < $< > $@ + +data/04-corpus-simple-etab.json: data/03-corpus-simple-rnsr.json + npx ezs 04-enrich-etab.ini < $< > $@ + +data/03-corpus-simple-rnsr.json: data/02-corpus-simple.json + npx ezs 03-enrich-rnsr.ini < $< > $@ + +data/02-corpus-simple.json: data/01-corpus.json + npx ezs 02-extract-fields.ini < $< > $@ + +data/01-corpus.json: 00-dois.txt + npx ezs 01-harvest-from-doi.ini < $< > $@ diff --git a/doiwos-dumps/data/.gitkeep b/doiwos-dumps/data/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/doiwos-dumps/data/.gitkeep