diff --git a/halinria-dumps/03-refine.ini b/halinria-dumps/03-refine.ini deleted file mode 100644 index b18c3f1..0000000 --- a/halinria-dumps/03-refine.ini +++ /dev/null @@ -1,26 +0,0 @@ -prepend = unpack -append = pack - -[use] -plugin = basics -plugin = conditor -plugin = lodex - -[assign] -path = tei -value = get('business.sourceUidChain').prepend('https://corhal-api.inist.fr/mergedDocuments/').append('.tei') - -path = title -value = get('title.default') - -path = host -value = get('host').omit('pages', 'specialIssue', 'range', 'language', 'part', 'conference', 'editors') - -[assign] -path = links -value = get('sourceUids').filter(i => (i.indexOf('hal') === 0)).map(i => i.replace(/^hal\$/, 'https://hal.archives-ouvertes.fr/')).join(' ; ') - - -[exchange] -value = self().omit(['authors', 'origins', 'business', 'files', 'keywords', 'enrichments', 'classifications', 'funders', 'sourceUids', 'abstract', 'technical', 'title', 'pmcId', 'articleNumber', 'pii', 'pmId' ]) - diff --git a/halinria-dumps/04-export.ini b/halinria-dumps/04-export.ini deleted file mode 100644 index c4eaae8..0000000 --- a/halinria-dumps/04-export.ini +++ /dev/null @@ -1,16 +0,0 @@ -prepend = unpack - -[use] -plugin = basics -plugin = conditor -plugin = lodex - -[OBJFlatten] -separator = / - -[objects2columns] -[OBJStandardize] - -[CSVString] -format = strict -separator = fix('\t') diff --git a/halinria-dumps/04-refine.ini b/halinria-dumps/04-refine.ini new file mode 100644 index 0000000..b18c3f1 --- /dev/null +++ b/halinria-dumps/04-refine.ini @@ -0,0 +1,26 @@ +prepend = unpack +append = pack + +[use] +plugin = basics +plugin = conditor +plugin = lodex + +[assign] +path = tei +value = get('business.sourceUidChain').prepend('https://corhal-api.inist.fr/mergedDocuments/').append('.tei') + +path = title +value = get('title.default') + +path = host +value = get('host').omit('pages', 'specialIssue', 'range', 'language', 'part', 'conference', 'editors') + +[assign] +path = links +value = get('sourceUids').filter(i => (i.indexOf('hal') === 0)).map(i => i.replace(/^hal\$/, 'https://hal.archives-ouvertes.fr/')).join(' ; ') + + +[exchange] +value = self().omit(['authors', 'origins', 'business', 'files', 'keywords', 'enrichments', 'classifications', 'funders', 'sourceUids', 'abstract', 'technical', 'title', 'pmcId', 'articleNumber', 'pii', 'pmId' ]) + diff --git a/halinria-dumps/05-export.ini b/halinria-dumps/05-export.ini new file mode 100644 index 0000000..c4eaae8 --- /dev/null +++ b/halinria-dumps/05-export.ini @@ -0,0 +1,16 @@ +prepend = unpack + +[use] +plugin = basics +plugin = conditor +plugin = lodex + +[OBJFlatten] +separator = / + +[objects2columns] +[OBJStandardize] + +[CSVString] +format = strict +separator = fix('\t') diff --git a/halinria-dumps/Makefile b/halinria-dumps/Makefile index a507152..fe903ff 100644 --- a/halinria-dumps/Makefile +++ b/halinria-dumps/Makefile @@ -1,7 +1,12 @@ +# Aliases +corhal-inria: corhal-inria-2014-candidats corhal-inria-2014-doublons corhal-inria-2015-candidats corhal-inria-2015-doublons corhal-inria-2016-candidats corhal-inria-2016-doublons corhal-inria-2017-candidats corhal-inria-2017-doublons corhal-inria-2018-candidats corhal-inria-2018-doublons corhal-inria-2019-candidats corhal-inria-2019-doublons corhal-inria-2020-candidats corhal-inria-2020-doublons corhal-inria-2021-candidats corhal-inria-2022-doublons + # To set specific directory for each version ifeq ($(strip $(Startup)),) # use the the day of the week -VERSION_DIR := $(shell date +%A) +#VERSION_DIR := $(shell date +%A) +# use the month +VERSION_DIR := $(shell date +%B) else # use no version VERSION_DIR := . @@ -9,62 +14,63 @@ # To set ezs parameters EZSFLAGS := --param VERSION_DIR="$(VERSION_DIR)" - +# To set global parameters +ROOT_DIR := $(shell pwd) # To set the location and the extension of sources files (queries) INPUT_DIR := 01-query INPUT_EXT := txt # To set the location and the extension of results files -OUTPUT_DIR := 04-export/$(VERSION_DIR) +OUTPUT_DIR := 05-export/$(VERSION_DIR) OUTPUT_EXT := tsv -# To set custom aliases -corhal-inria-2014-candidats: $(OUTPUT_DIR)/corhal-inria-2014-candidats.tsv -corhal-inria-2014-doublons: $(OUTPUT_DIR)/corhal-inria-2014-doublons.tsv -corhal-inria-2015-candidats: $(OUTPUT_DIR)/corhal-inria-2015-candidats.tsv -corhal-inria-2015-doublons: $(OUTPUT_DIR)/corhal-inria-2015-doublons.tsv -corhal-inria-2016-candidats: $(OUTPUT_DIR)/corhal-inria-2016-candidats.tsv -corhal-inria-2016-doublons: $(OUTPUT_DIR)/corhal-inria-2016-doublons.tsv -corhal-inria-2017-candidats: $(OUTPUT_DIR)/corhal-inria-2017-candidats.tsv -corhal-inria-2017-doublons: $(OUTPUT_DIR)/corhal-inria-2017-doublons.tsv -corhal-inria-2018-candidats: $(OUTPUT_DIR)/corhal-inria-2018-candidats.tsv -corhal-inria-2018-doublons: $(OUTPUT_DIR)/corhal-inria-2018-doublons.tsv -corhal-inria-2019-candidats: $(OUTPUT_DIR)/corhal-inria-2019-candidats.tsv -corhal-inria-2019-doublons: $(OUTPUT_DIR)/corhal-inria-2019-doublons.tsv -corhal-inria-2020-candidats: $(OUTPUT_DIR)/corhal-inria-2020-candidats.tsv -corhal-inria-2020-doublons: $(OUTPUT_DIR)/corhal-inria-2020-doublons.tsv -corhal-inria-2021-candidats: $(OUTPUT_DIR)/corhal-inria-2021-candidats.tsv -corhal-inria-2022-doublons: $(OUTPUT_DIR)/corhal-inria-2022-doublons.tsv - - # To generate all files from source directory to target directory SOURCE_FILES := $(wildcard $(INPUT_DIR)/*.$(INPUT_EXT)) TARGET_FILES := $(patsubst $(INPUT_DIR)/%.$(INPUT_EXT), $(OUTPUT_DIR)/%.$(OUTPUT_EXT), $(SOURCE_FILES)) -all: $(TARGET_FILES) -# Step 2: to download all result -02-download/$(VERSION_DIR)/%.jsonl: $(INPUT_DIR)/%.txt - @[ -d 02-download/$(VERSION_DIR)/ ] || mkdir -p 02-download/$(VERSION_DIR)/ - ezs $(EZSFLAGS) 02-download.ini < $< > $@ +# Phony Rules +help: + @grep -P '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' -# Step 3: to refine and select only chosen fields -03-refine/$(VERSION_DIR)/%.jsonl: 02-download/$(VERSION_DIR)/%.jsonl - @[ -d 03-refine/$(VERSION_DIR)/ ] || mkdir -p 03-refine/$(VERSION_DIR)/ - ezs $(EZSFLAGS) 03-refine.ini < $< > $@ +list: ## list all target files + @ls -1 $(TARGET_FILES) -# Step 4 : to create a export file -04-export/$(VERSION_DIR)/%.tsv: 03-refine/$(VERSION_DIR)/%.jsonl - @[ -d 04-export/$(VERSION_DIR)/ ] || mkdir -p 04-export/$(VERSION_DIR)/ - ezs $(EZSFLAGS) 04-export.ini < $< > $@ +view: ## To view current generated files + find $(OUTPUT_DIR) -# To delete all generated files -clean: - @rm -f $(TARGET_FILES) - @echo "Files deleted!" +clean-cache: ## To delete enrichment cache + @rm -v -f $(TMPDIR)/memory -# To prevent deleting intermediate files (useful for local dev and debug cases) -#.PRECIOUS: 02-download/$(VERSION_DIR)/%.jsonl +clean: ## To delete current generated files + @rm -v -f $(TARGET_FILES) -# To ignore non-file targets -.PHONY: clean +drain: ## To delete all directories + @rm -v -Rf 02-download 04-refine 05-export +# Rules +all: $(TARGET_FILES) ## Build all files + +%: 05-export/$(VERSION_DIR)/%.tsv + [ -f $< ] && rm $< + $(MAKE) $< + +02-download/$(VERSION_DIR)/%.jsonl: 01-query/%.txt + mkdir -p $(@D) + time ezs $(EZSFLAGS) 02-download.ini < $< > $@ + + +04-refine/$(VERSION_DIR)/%.jsonl: 02-download/$(VERSION_DIR)/%.jsonl + mkdir -p $(@D) + time ezs $(EZSFLAGS) 04-refine.ini < $< > $@ + + +05-export/$(VERSION_DIR)/%.tsv: 04-refine/$(VERSION_DIR)/%.jsonl + mkdir -p $(@D) + time ezs $(EZSFLAGS) 05-export.ini < $< > $@ + +# To prevent deleting intermediate files (for controls) +.PRECIOUS: 02-download/$(VERSION_DIR)/%.jsonl +#03-enrichment/$(VERSION_DIR)/%.jsonl + +.DEFAULT_GOAL := help +.PHONY: clean drain view help list all clean-cache