diff --git a/apil-dumps/01-query/corhal-test.txt b/apil-dumps/01-query/corhal-test.txt deleted file mode 100644 index bce25a0..0000000 --- a/apil-dumps/01-query/corhal-test.txt +++ /dev/null @@ -1 +0,0 @@ -(halId:hal-02068169) diff --git a/apil-dumps/02-download.ini b/apil-dumps/02-download.ini deleted file mode 100644 index b9a0147..0000000 --- a/apil-dumps/02-download.ini +++ /dev/null @@ -1,31 +0,0 @@ -append = pack - -[use] -plugin = basics -plugin = conditor - -[TXTConcat] - -[replace] -path = q -value = self() - -[CORHALFetch] -url = https://corhal-api.inist.fr -retries = 5 -timeout = 30000 - -[replace] -path = url -value = get('business.sourceUidChain').prepend('https://corhal-api.inist.fr/mergedDocuments/') - -[URLFetch] -url = get('url') -json = true -target = result -retries = 10 -timeout = 30000 -noerror = true - -[exchange] -value = get('result') diff --git a/apil-dumps/03-enrichment-address.ini b/apil-dumps/03-enrichment-address.ini deleted file mode 100644 index 1685b35..0000000 --- a/apil-dumps/03-enrichment-address.ini +++ /dev/null @@ -1,49 +0,0 @@ -prepend = unpack -append = pack - -[use] -plugin = basics - -[expand] -path = authors -size = env('size') - -# Ensure to process an array -[expand/assign] -path = value -value = get('value',[]).concat(null).filter(Boolean) -[expand/exploding] - -;[expand/debug] -;text = authors - -[expand/expand] -path = value.affiliations -size = env('size') - -# Expand array : -[expand/expand/assign] -path = value -value = get('value',[]).concat(null).filter(Boolean) -[expand/expand/exploding] -;[expand/expand/debug] -;text = affiliations -; - -[expand/expand/expand] -path = value.address -size = env('size') -cacheName = env('cache').thru(x => x ? 'affiliations-tools-v1-expand' : null) - -;[expand/expand/expand/debug] -;text = address - - -[expand/expand/expand/URLConnect] -url = env('url', 'https://affiliations-tools.services.inist.fr/v1/expand') -retries = env('retries') -timeout = env('timeout') -noerror = true - -[expand/expand/aggregate] -[expand/aggregate] diff --git a/apil-dumps/03-enrichment-documentType.ini b/apil-dumps/03-enrichment-documentType.ini deleted file mode 100644 index d6eb6e0..0000000 --- a/apil-dumps/03-enrichment-documentType.ini +++ /dev/null @@ -1,21 +0,0 @@ -prepend = unpack -append = pack - -[use] -plugin = basics - -[assign] -path = ws.originalGenre -value = get('originalGenre') - -[expand] -path = ws.originalGenre -size = env('size') -cacheName = env('cache').thru(x => x ? 'mapping-tools-v1-homogenize-documentType-json' : null) - -[expand/URLConnect] -url = env('url', 'https://mapping-tools.services.inist.fr/v1/homogenize/documentType/json') -retries = env('retries') -timeout = env('timeout') -noerror = true - diff --git a/apil-dumps/03-enrichment-host.ini b/apil-dumps/03-enrichment-host.ini deleted file mode 100644 index 5da871a..0000000 --- a/apil-dumps/03-enrichment-host.ini +++ /dev/null @@ -1,19 +0,0 @@ -prepend = unpack -append = pack - -[use] -plugin = basics - -[assign] -path = ws.host.title -value = get('host.title') - -[expand] -path = ws.host.title -size = env('size') -cacheName = env('cache').thru(x => x ? 'terms-extraction-v1-tools-normalize' : null) - -[expand/URLConnect] -url = env('url', 'https://terms-extraction.services.inist.fr/v1/tools/normalize') -timeout = env('timeout') -noerror = true diff --git a/apil-dumps/03-enrichment-publisher.ini b/apil-dumps/03-enrichment-publisher.ini deleted file mode 100644 index a57c140..0000000 --- a/apil-dumps/03-enrichment-publisher.ini +++ /dev/null @@ -1,20 +0,0 @@ -prepend = unpack -append = pack - -[use] -plugin = basics -[assign] -path = ws.host.publisher -value = get('host.publisher') - -[expand] -path = ws.host.publisher -size = env('size') -cacheName = env('cache').thru(x => x ? 'mapping-tools-v1-homogenize-publisher-json' : null) - -[expand/URLConnect] -url = env('url', 'https://mapping-tools.services.inist.fr/v1/homogenize/publisher/json') -retries = env('retries') -timeout = env('timeout') -noerror = true - diff --git a/apil-dumps/03-enrichment-rnsr.ini b/apil-dumps/03-enrichment-rnsr.ini deleted file mode 100644 index cb4f33f..0000000 --- a/apil-dumps/03-enrichment-rnsr.ini +++ /dev/null @@ -1,43 +0,0 @@ -prepend = unpack -append = pack - -[use] -plugin = basics - -[expand] -path = authors -size = env('size') - -# Ensure to process an array -[expand/assign] -path = value -value = get('value',[]).concat(null).filter(Boolean) -[expand/exploding] -;[expand/debug] -;text = authors - -[expand/expand] -path = value.rnsr -size = env('size') - -# Expand array : -[expand/expand/assign] -path = value -value = get('value',[]).concat(null).filter(Boolean) -[expand/expand/exploding] -;[expand/expand/debug] -;text = rnsr - -[expand/expand/expand] -path = value -size = env('size') -cacheName = env('cache').thru(x => x ? 'mapping-tools-tools-v1-rnsr-2022-json' : null) - -[expand/expand/expand/URLConnect] -url = env('url', 'https://mapping-tools.services.inist.fr/v1/rnsr/2022/json') -retries = env('retries') -timeout = env('timeout') -noerror = false - -[expand/expand/aggregate] -[expand/aggregate] diff --git a/apil-dumps/03-enrichment.ini b/apil-dumps/03-enrichment.ini deleted file mode 100644 index cd8c936..0000000 --- a/apil-dumps/03-enrichment.ini +++ /dev/null @@ -1,43 +0,0 @@ -prepend = unpack -append = pack - -[use] -plugin = basics -plugin = analytics - -[env] -path = size -value = 100 - -path = timeout -value = 9000 - -path = retries -value = 3 - -path = cache -value = true - -;path = url -;value = https://base-line.services.inist.fr/v1/echo/json - -; #1 -[delegate] -file = ./03-enrichment-documentType.ini - -; #2 -[delegate] -file = ./03-enrichment-publisher.ini - -; #3 -[delegate] -file = ./03-enrichment-rnsr.ini - -; #4 -[delegate] -file = ./03-enrichment-host.ini - -; #5 -[delegate] -file = ./03-enrichment-address.ini - diff --git a/apil-dumps/04-refine.ini b/apil-dumps/04-refine.ini deleted file mode 100644 index f892009..0000000 --- a/apil-dumps/04-refine.ini +++ /dev/null @@ -1,318 +0,0 @@ -prepend = unpack -append = pack - -[use] -plugin = basics - -[replace] - -path = ws -value = get('ws') - -path = abstract en -value = get('abstract.en') - -path = abstract fr -value = get('abstract.fr') - -path = articleNumber -value = get('articleNumber') - -path = arxiv -value = get('arxiv') - -path = authors -value = get('authors') - -path = bibCode -value = get('bibCode') - -path = cern -value = get('cern') - -path = classification dewey -value = get('classification.dewey') - -path = classification hal -value = get('classification.hal') - -path = classification tef -value = get('classification.tef') - -path = classification thesisDomain -value = get('classification.thesisDomain') - -path = creationDate -value = get('creationDate') - -path = defenseOrganisms associatedLaboratory -value = get('defenseOrganisms', []).map('associatedLaboratory') - -path = defenseOrganisms associatedLaboratoryIdRef -value = get('defenseOrganisms', []).map('associatedLaboratoryIdRef') - -path = defenseOrganisms degreeGrantor -value = get('defenseOrganisms', []).map('degreeGrantor') - -path = defenseOrganisms degreeGrantorIdRef -value = get('defenseOrganisms', []).map('degreeGrantorIdRef') - -path = defenseOrganisms doctoralSchool -value = get('defenseOrganisms', []).map('doctoralSchool') - -path = defenseOrganisms doctoralSchoolIdRef -value = get('defenseOrganisms', []).map('doctoralSchoolIdRef') - -path = documentType -value = get('documentType') - -path = doi -value = get('doi') - -path = provider -value = get('doi').split('/').head() - -path = duplicateRules -value = get('duplicateRules') - -path = duplicates sourceUid -value = get('duplicates', []).map('sourceUid') - -path = duplicates source -value = get('duplicates', []).map('source') - -path = eisbn -value = get('eisbn') - -path = eissn -value = get('eissn') - -path = electronicPublicationDate -value = get('electronicPublicationDate') - -path = enrichments classifications bso -value = get('enrichments.classifications.bso') - -path = enrichments classifications scienceMetrix -value = get('enrichments.classifications.scienceMetrix') - -path = enrichments classifications scopus -value = get('enrichments.classifications.scopus') - -path = enrichments oa core -value = get('enrichments.oa.core') - -path = enrichments oa core -value = get('enrichments.oa.core') - -path = enrichments oa unpaywall -value = get('enrichments.oa.unpaywall') - -path = enrichments oa unpaywall -value = get('enrichments.oa.unpaywall') - -path = ensam -value = get('ensam') - -path = fulltextPath -value = get('fulltextPath') - -path = funders name -value = get('funders').map('name') - -path = funders grantNumber -value = get('funders').map('grantNumber') - -path = halId -value = get('halId') - -path = hasDoi -value = get('hasDoi') - -path = hasFulltext -value = get('hasFulltext') - -path = hasTransDuplicate -value = get('hasTransDuplicate') - -path = idChain -value = get('idChain') - -path = idConditor -value = get('idConditor') - -path = idProdinra -value = get('idProdinra') - -path = ineris -value = get('ineris') - -path = inspire -value = get('inspire') - -path = ird -value = get('ird') - -path = irstea -value = get('irstea') - -path = isbn -value = get('isbn') - -path = isDeduplicable -value = get('isDeduplicable') - -path = isDuplicate -value = get('isDuplicate') - -path = isNearDuplicate -value = get('isNearDuplicate') - -path = issn -value = get('issn') - -path = issue -value = get('issue') - -path = keywords en author -value = get('keywords.en.author') - -path = keywords en mesh -value = get('keywords.en.mesh') - -path = keywords fr author -value = get('keywords.fr.author') - -path = keywords fr mesh -value = get('keywords.fr.mesh') - -path = keywords fr rameau -value = get('keywords.fr.rameau') - -path = keywords undetermined author -value = get('keywords.undetermined.author') - -path = language -value = get('language') - -path = localRef -value = get('localRef') - -path = meetingAbstractNumber -value = get('meetingAbstractNumber') - -path = nearDuplicates sourceUid -value = get('nearDuplicates').map('sourceUid') - -path = nearDuplicates source -value = get('nearDuplicates').map('source') - -path = nearDuplicatesDetectedBySimilarity sourceUid -value = get('nearDuplicatesDetectedBySimilarity', []).map('sourceUid') - -path = nearDuplicatesDetectedBySimilarity source -value = get('nearDuplicatesDetectedBySimilarity', []).map('source') - -path = nnt -value = get('nnt') - -path = oatao -value = get('oatao') - -path = okina -value = get('okina') - -path = otherNumber -value = get('otherNumber') - -path = pageRange -value = get('pageRange') - -path = part -value = get('part') - -path = patentNumber -value = get('patentNumber') - -path = path -value = get('path') - -path = pii -value = get('pii') - -path = pmc -value = get('pmc') - -path = pmId -value = get('pmId') - -path = ppn -value = get('ppn') - -path = publicationDate -value = get('publicationDate') - -path = publisher -value = get('publisher') - -path = reportNumber -value = get('reportNumber') - -path = sciencespo -value = get('sciencespo') - -path = sessionName -value = get('sessionName') - -path = source -value = get('source') - -path = sourceId -value = get('sourceId') - -path = sourceUid -value = get('sourceUid') - -path = specialIssue -value = get('specialIssue') - -path = supplement -value = get('supplement') - -path = thesisAdvisor forename -value = get('thesisAdvisor', []).map('forename') - -path = thesisAdvisor idRef -value = get('thesisAdvisor', []).map('idRef') - -path = thesisAdvisor surname -value = get('thesisAdvisor', []).map('surname') - -path = title default -value = get('title.default') - -path = title en -value = get('title.en') - -path = title fr -value = get('title.fr') - -path = title journal -value = get('title.journal') - -path = title meeting -value = get('title.meeting') - -path = title monography -value = get('title.monography') - -path = typeConditor -value = get('typeConditor') - -path = utKey -value = get('utKey') - -path = volume -value = get('volume') - -[OBJFlatten] diff --git a/apil-dumps/Makefile b/apil-dumps/Makefile deleted file mode 100644 index f5c4ba0..0000000 --- a/apil-dumps/Makefile +++ /dev/null @@ -1,82 +0,0 @@ -# To set specific directory for each crontab run -ifeq ($(strip $(Startup)),) -# use the the day of the week -#VERSION_DIR := $(shell date +%A) -# use the month -VERSION_DIR := $(shell date +%B) -else -# use no version -VERSION_DIR := . -endif - -# To set ezs parameters -EZSFLAGS := --param VERSION_DIR="$(VERSION_DIR)" -# To set global parameters -ROOT_DIR := $(shell pwd) -# To set the location and the extension of sources files (queries) -INPUT_DIR := 01-query -INPUT_EXT := txt - -# To set the location and the extension of results files -OUTPUT_DIR := 05-result/$(VERSION_DIR) -OUTPUT_EXT := jsonl - -# To generate all files from source directory to target directory -SOURCE_FILES := $(wildcard $(INPUT_DIR)/*.$(INPUT_EXT)) -TARGET_FILES := $(patsubst $(INPUT_DIR)/%.$(INPUT_EXT), $(OUTPUT_DIR)/%.$(OUTPUT_EXT), $(SOURCE_FILES)) - -# Phony Rules -help: - @grep -P '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' - -list: ## list all target files - @ls -1 $(TARGET_FILES) - -view: ## To view current generated files - find $(OUTPUT_DIR) - -clean-cache: ## To delete enrichment cache - @rm -v -f $(TMPDIR)/memory - -clean: ## To delete current generated files - @rm -v -f $(TARGET_FILES) - -drain: ## To delete all directories (except the source directory) - @rm -v -Rf 02-download 03-enrichment 04-refine 05-result - -watch: ## Automatically build files when they change - while true; do \ - inotifywait -qr -e modify -e create -e delete -e move --exclude '/\.' $(INPUT_DIR); \ - make clean-cache; \ - make all; \ - done - -# Rules -all: $(TARGET_FILES) ## Build all files - -%: $(OUTPUT_DIR)/%.$(OUTPUT_EXT) - @echo "$<" - -02-download/$(VERSION_DIR)/%.jsonl: 01-query/%.txt - mkdir -p $(@D) - time ezs $(EZSFLAGS) 02-download.ini < $< > $@.crdownload - mv $@.crdownload $@ - -03-enrichment/$(VERSION_DIR)/%.jsonl: 02-download/$(VERSION_DIR)/%.jsonl - mkdir -p $(@D) - time ezs $(EZSFLAGS) 03-enrichment.ini < $< > $@ - -04-refine/$(VERSION_DIR)/%.jsonl: 03-enrichment/$(VERSION_DIR)/%.jsonl - mkdir -p $(@D) - time ezs $(EZSFLAGS) 04-refine.ini < $< > $@ - -05-result/$(VERSION_DIR)/%.jsonl: 04-refine/$(VERSION_DIR)/%.jsonl - mkdir -p $(@D) - mv $< $@ - ls -lhag $@ |sed -re 's/^[^ ]* //' >> "$(subst .jsonl,.log,$@)" - -# To prevent deleting intermediate files (for controls) -#.PRECIOUS: 02-download/$(VERSION_DIR)/%.jsonl 03-enrichment/$(VERSION_DIR)/%.jsonl - -.DEFAULT_GOAL := help -.PHONY: clean drain view help list all clean-cache watch diff --git a/conditor-dumps/02-download.ini b/conditor-dumps/02-download.ini index 2cb6f21..770cb16 100644 --- a/conditor-dumps/02-download.ini +++ b/conditor-dumps/02-download.ini @@ -15,3 +15,42 @@ url = https://corhal-api.inist.fr retries = 3 timeout = 60000 + +[assign] +# Récupère electronicPublicationDate et publicationDate +# Prend la plus ancienne (= la plus petite) +# Ne garde que l'année +path = ApilPublicationDate +value = get("host.electronicPublicationDate", "9999") \ + .castArray() \ + .concat(_.get(self, "host.publicationDate", "9999")) \ + .min().toString() \ + .thru(str => str.substring(0,4)) + +[assign] +path = ApilFinancement +value = get('funders').castArray().filter(Boolean).thru(arr => Boolean(arr.length)) + +# Quand les RNSR ne sont pas fournis dans authors.*.affiliations.*.rnsr +# on utilise les enrichissements et on les met au même niveau dans ApilRnsr +[map] +path = authors + +[map/map] +path = affiliations + +[map/map/assign] +path = ApilRnsr +value = get("rnsr") + +[map/map/swing] +test = get("ApilRnsr").isEmpty() + +[map/map/swing/assign] +path = ApilRnsr +value = get("enrichments.rnsr", []) \ + .filter(rnsr => !["200612821P", "200018571R", "199812965F", "201523784S"].includes(rnsr)) + +# Supprime les champs inutiles pour les études bibliométriques +[exchange] +value = omit(['business','origins','technical']) diff --git a/conditor-dumps/03-create-fields.ini b/conditor-dumps/03-create-fields.ini deleted file mode 100644 index 827bdd2..0000000 --- a/conditor-dumps/03-create-fields.ini +++ /dev/null @@ -1,43 +0,0 @@ -# npx ezs 03-create-fields.ini -append = pack - -[unpack] - -[assign] -# Récupère electronicPublicationDate et publicationDate -# Prend la plus ancienne (= la plus petite) -# Ne garde que l'année -path = ApilPublicationDate -value = get("host.electronicPublicationDate", "9999") \ - .castArray() \ - .concat(_.get(self, "host.publicationDate", "9999")) \ - .min().toString() \ - .thru(str => str.substring(0,4)) - -[assign] -path = ApilFinancement -value = get('funders').castArray().filter(Boolean).thru(arr => Boolean(arr.length)) - -# Quand les RNSR ne sont pas fournis dans authors.*.affiliations.*.rnsr -# on utilise les enrichissements et on les met au même niveau dans ApilRnsr -[map] -path = authors - -[map/map] -path = affiliations - -[map/map/assign] -path = ApilRnsr -value = get("rnsr") - -[map/map/swing] -test = get("ApilRnsr").isEmpty() - -[map/map/swing/assign] -path = ApilRnsr -value = get("enrichments.rnsr", []) \ - .filter(rnsr => !["200612821P", "200018571R", "199812965F", "201523784S"].includes(rnsr)) - -# Supprime les champs inutiles pour les études bibliométriques -[exchange] -value = omit(['business','origins','technical']) diff --git a/conditor-dumps/03-create-fields/.gitkeep b/conditor-dumps/03-create-fields/.gitkeep deleted file mode 100644 index e69de29..0000000 --- a/conditor-dumps/03-create-fields/.gitkeep +++ /dev/null diff --git a/conditor-dumps/Makefile b/conditor-dumps/Makefile index b4e7c5c..4c6a8cc 100644 --- a/conditor-dumps/Makefile +++ b/conditor-dumps/Makefile @@ -6,7 +6,7 @@ .DELETE_ON_ERROR: # To prevent deleting intermediate files (for controls) -.PRECIOUS: 02-download/%.jsonl 03-create-fields/%.jsonl 04-enrich/%.jsonl +.PRECIOUS: 02-download/%.jsonl 04-enrich/%.jsonl SOURCE_FILES := $(wildcard 01-query/*.txt) TARGET_FILES := $(patsubst 01-query/%.txt, 05-report/%.log, $(SOURCE_FILES)) @@ -26,16 +26,11 @@ mkdir -p $(@D) wc -l $< > $@ -04-enrich/%.jsonl: 03-create-fields/%.jsonl +04-enrich/%.jsonl: 02-download/%.jsonl mkdir -p $(@D) time npx ezs 04-enrich.ini < $< > $@.crdownload mv $@.crdownload $@ -03-create-fields/%.jsonl: 02-download/%.jsonl - mkdir -p $(@D) - time npx ezs 03-create-fields.ini < $< > $@.crdownload - mv $@.crdownload $@ - 02-download/%.jsonl: 01-query/%.txt mkdir -p $(@D) time npx ezs 02-download.ini < $< > $@.crdownload