diff --git a/conditor-dumps/02-download.ini b/conditor-dumps/02-download.ini index 2cb6f21..770cb16 100644 --- a/conditor-dumps/02-download.ini +++ b/conditor-dumps/02-download.ini @@ -15,3 +15,42 @@ url = https://corhal-api.inist.fr retries = 3 timeout = 60000 + +[assign] +# Récupère electronicPublicationDate et publicationDate +# Prend la plus ancienne (= la plus petite) +# Ne garde que l'année +path = ApilPublicationDate +value = get("host.electronicPublicationDate", "9999") \ + .castArray() \ + .concat(_.get(self, "host.publicationDate", "9999")) \ + .min().toString() \ + .thru(str => str.substring(0,4)) + +[assign] +path = ApilFinancement +value = get('funders').castArray().filter(Boolean).thru(arr => Boolean(arr.length)) + +# Quand les RNSR ne sont pas fournis dans authors.*.affiliations.*.rnsr +# on utilise les enrichissements et on les met au même niveau dans ApilRnsr +[map] +path = authors + +[map/map] +path = affiliations + +[map/map/assign] +path = ApilRnsr +value = get("rnsr") + +[map/map/swing] +test = get("ApilRnsr").isEmpty() + +[map/map/swing/assign] +path = ApilRnsr +value = get("enrichments.rnsr", []) \ + .filter(rnsr => !["200612821P", "200018571R", "199812965F", "201523784S"].includes(rnsr)) + +# Supprime les champs inutiles pour les études bibliométriques +[exchange] +value = omit(['business','origins','technical']) diff --git a/conditor-dumps/03-create-fields.ini b/conditor-dumps/03-create-fields.ini deleted file mode 100644 index 827bdd2..0000000 --- a/conditor-dumps/03-create-fields.ini +++ /dev/null @@ -1,43 +0,0 @@ -# npx ezs 03-create-fields.ini -append = pack - -[unpack] - -[assign] -# Récupère electronicPublicationDate et publicationDate -# Prend la plus ancienne (= la plus petite) -# Ne garde que l'année -path = ApilPublicationDate -value = get("host.electronicPublicationDate", "9999") \ - .castArray() \ - .concat(_.get(self, "host.publicationDate", "9999")) \ - .min().toString() \ - .thru(str => str.substring(0,4)) - -[assign] -path = ApilFinancement -value = get('funders').castArray().filter(Boolean).thru(arr => Boolean(arr.length)) - -# Quand les RNSR ne sont pas fournis dans authors.*.affiliations.*.rnsr -# on utilise les enrichissements et on les met au même niveau dans ApilRnsr -[map] -path = authors - -[map/map] -path = affiliations - -[map/map/assign] -path = ApilRnsr -value = get("rnsr") - -[map/map/swing] -test = get("ApilRnsr").isEmpty() - -[map/map/swing/assign] -path = ApilRnsr -value = get("enrichments.rnsr", []) \ - .filter(rnsr => !["200612821P", "200018571R", "199812965F", "201523784S"].includes(rnsr)) - -# Supprime les champs inutiles pour les études bibliométriques -[exchange] -value = omit(['business','origins','technical']) diff --git a/conditor-dumps/03-create-fields/.gitkeep b/conditor-dumps/03-create-fields/.gitkeep deleted file mode 100644 index e69de29..0000000 --- a/conditor-dumps/03-create-fields/.gitkeep +++ /dev/null diff --git a/conditor-dumps/Makefile b/conditor-dumps/Makefile index b4e7c5c..4c6a8cc 100644 --- a/conditor-dumps/Makefile +++ b/conditor-dumps/Makefile @@ -6,7 +6,7 @@ .DELETE_ON_ERROR: # To prevent deleting intermediate files (for controls) -.PRECIOUS: 02-download/%.jsonl 03-create-fields/%.jsonl 04-enrich/%.jsonl +.PRECIOUS: 02-download/%.jsonl 04-enrich/%.jsonl SOURCE_FILES := $(wildcard 01-query/*.txt) TARGET_FILES := $(patsubst 01-query/%.txt, 05-report/%.log, $(SOURCE_FILES)) @@ -26,16 +26,11 @@ mkdir -p $(@D) wc -l $< > $@ -04-enrich/%.jsonl: 03-create-fields/%.jsonl +04-enrich/%.jsonl: 02-download/%.jsonl mkdir -p $(@D) time npx ezs 04-enrich.ini < $< > $@.crdownload mv $@.crdownload $@ -03-create-fields/%.jsonl: 02-download/%.jsonl - mkdir -p $(@D) - time npx ezs 03-create-fields.ini < $< > $@.crdownload - mv $@.crdownload $@ - 02-download/%.jsonl: 01-query/%.txt mkdir -p $(@D) time npx ezs 02-download.ini < $< > $@.crdownload