diff --git a/apil-dumps/01-query/corhal-ins2i-2022.txt b/apil-dumps/01-query/corhal-ins2i-2022.txt new file mode 100644 index 0000000..3d4006d --- /dev/null +++ b/apil-dumps/01-query/corhal-ins2i-2022.txt @@ -0,0 +1,17 @@ +(NOT business.duplicateGenre:Thèse AND ((business.authorsAddresses:ins2i) OR +(business.authorsRnsr:(201822755K OR 201220494D OR 201722314K OR 201019867F OR +201420776B OR 201420882S OR 201622160X OR 201622332J OR 200119896B OR +200619962B OR 200619963C OR 200619964D OR 200719974K OR 200819983R OR +199819875S OR 199819876T OR 199819877U OR 199819879W OR 201222358E OR +201722366S OR 201922944L OR 202123656B OR 200619753Z OR 201220443Y OR +201119573G OR 199320519N OR 199511789R OR 200311834E OR 200511875R OR +200711885T OR 200711886U OR 200717526Z OR 200919221H OR 199511949P OR +199111950H OR 199511969L OR 200317503S OR 199511665F OR 201722241F OR +201722568L OR 200012161Y OR 200012163A OR 200212221E OR 201220091R OR +201822714R OR 200112433P OR 200112440X OR 200519331V OR 200918463J OR +201120462Y OR 201220263C OR 201220432L OR 201320497C OR 198912571S OR +199712651U OR 200212717U OR 200212719W OR 200812835W OR 201220427F OR +201621976X OR 201622400H OR 199812842X OR 199812876J OR 202123711L OR +202123712M OR 201521249L OR 198319352N OR 199517454Y OR 201120535C))) AND +(host.publicationDate.normalized:2022 OR +host.electronicPublicationDate.normalized:2022)) diff --git a/apil-dumps/01-query/corhal-insu-2022.txt b/apil-dumps/01-query/corhal-insu-2022.txt new file mode 100644 index 0000000..d36a9a7 --- /dev/null +++ b/apil-dumps/01-query/corhal-insu-2022.txt @@ -0,0 +1 @@ +(((sourceUid:*insu* AND source:hal) OR (business.authorsAddresses:insu) OR (business.authorsRnsr:(202023499K OR 200119846X OR 200510659U OR 201522729V OR 201622338R OR 200610636P OR 202124035N OR 201019744X OR 201119743S OR 201119742R OR 202023501M OR 199921733G OR 202123708H OR 200420526Y OR 200521739M OR 200721741F OR 201120530X OR 200919527R OR 201220716V OR 201521823K OR 200719584L OR 200310841A OR 200810842E OR 199521753W OR 201320566C OR 201722374A OR 199911795E OR 200317531X OR 199512007C OR 200711908T OR 201119432D OR 201119454C OR 201119400U OR 201119477C OR 201622384R OR 199512000V OR 201119478D OR 199512003Y OR 200317685P OR 199112005T OR 199512012H OR 199712084C OR 199512085M OR 200012116Z OR 200012185Z OR 200012191F OR 200012210B OR 200712259Z OR 200816947R OR 199612315H OR 200817617U OR 199512316N OR 199612326V OR 200816948S OR 199612327W OR 202123702B OR 200112481S OR 200112483U OR 200512539M OR 200919203N OR 200512541P OR 200918490N OR 200512542R OR 200918450V OR 201220407J OR 201220431K OR 201220349W OR 201220274P OR 201220213Y OR 201220223J OR 200816914E OR 201220322S OR 201320574L OR 201320575M OR 201320576N OR 199712602R OR 199412629H OR 199712664H OR 199712666K OR 200212701B OR 200212760R OR 200212762T OR 200212766X OR 200212768Z OR 200212769A OR 200412804E OR 200812834V OR 200918434C OR 200611689J OR 199812847C OR 199812866Y OR 199812867Z OR 199812942F OR 199812955V OR 201922947P OR 200610854B OR 200410855L OR 200310864A OR 200620527R OR 200810872M OR 200810873N OR 201420664E OR 201020686W OR 201020888R OR 201220893M OR 201220920S OR 201121707B OR 201220900V OR 201220901W OR 201320582V OR 201320583W OR 199320515J OR 199519805D))) AND (host.publicationDate.normalized:2022 OR host.electronicPublicationDate.normalized:2022)) diff --git a/apil-dumps/Makefile b/apil-dumps/Makefile index a8601bf..48f3ecf 100644 --- a/apil-dumps/Makefile +++ b/apil-dumps/Makefile @@ -1,3 +1,7 @@ +# Aliases +corhal-ins2i: corhal-ins2i-2016 corhal-ins2i-2017 corhal-ins2i-2018 corhal-ins2i-2019 corhal-ins2i-2020 corhal-ins2i-2021 corhal-ins2i-2022 +corhal-insu: corhal-insu-2016 corhal-insu-2017 corhal-insu-2018 corhal-insu-2019 corhal-insu-2020 corhal-insu-2021 corhal-insu-2022 + # To set specific directory for each version ifeq ($(strip $(Startup)),) # use the the day of the week @@ -6,56 +10,67 @@ # use no version VERSION_DIR := . endif + # To set ezs parameters EZSFLAGS := --param VERSION_DIR="$(VERSION_DIR)" # To set the location and the extension of sources files (queries) +ROOT_DIR := $(shell pwd) INPUT_DIR := 01-query INPUT_EXT := txt + # To set the location and the extension of results files OUTPUT_DIR := 05-result/$(VERSION_DIR) -OUTPUT_EXT := jsonl +OUTPUT_EXT := txt # To generate all files from source directory to target directory SOURCE_FILES := $(wildcard $(INPUT_DIR)/*.$(INPUT_EXT)) TARGET_FILES := $(patsubst $(INPUT_DIR)/%.$(INPUT_EXT), $(OUTPUT_DIR)/%.$(OUTPUT_EXT), $(SOURCE_FILES)) -all: $(TARGET_FILES) -# To delete all generated files -clean: - @rm -f $(TARGET_FILES) - @echo "Files deleted!" +# Phony Rules +help: + @grep -P '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' -corhal-ins2i-2016: 05-result/$(VERSION_DIR)/corhal-ins2i-2016.jsonl -corhal-ins2i-2017: 05-result/$(VERSION_DIR)/corhal-ins2i-2017.jsonl -corhal-ins2i-2018: 05-result/$(VERSION_DIR)/corhal-ins2i-2018.jsonl -corhal-ins2i-2019: 05-result/$(VERSION_DIR)/corhal-ins2i-2019.jsonl -corhal-ins2i-2020: 05-result/$(VERSION_DIR)/corhal-ins2i-2020.jsonl -corhal-ins2i-2021: 05-result/$(VERSION_DIR)/corhal-ins2i-2021.jsonl -corhal-insu-2016: 05-result/$(VERSION_DIR)/corhal-insu-2016.jsonl -corhal-insu-2017: 05-result/$(VERSION_DIR)/corhal-insu-2017.jsonl -corhal-insu-2018: 05-result/$(VERSION_DIR)/corhal-insu-2018.jsonl -corhal-insu-2019: 05-result/$(VERSION_DIR)/corhal-insu-2019.jsonl -corhal-insu-2020: 05-result/$(VERSION_DIR)/corhal-insu-2020.jsonl -corhal-insu-2021: 05-result/$(VERSION_DIR)/corhal-insu-2021.jsonl -corhal-test: 05-result/$(VERSION_DIR)/corhal-test.jsonl +list: ## list all target files + @ls -1 $(TARGET_FILES) + +view: ## To view current generated files + find $(OUTPUT_DIR) + +clean: ## To delete current generated files + @rm -v -f $(TARGET_FILES) + +drain: ## To delete all directories + @rm -v -Rf 02-download 03-enrichment 04-refine 05-result + +# Rules +all: $(TARGET_FILES) ## Build all files + +%: 05-result/$(VERSION_DIR)/%.txt + [ -f $< ] && rm $< + $(MAKE) $< 02-download/$(VERSION_DIR)/%.jsonl: 01-query/%.txt - @[ -d 02-download/$(VERSION_DIR)/ ] || mkdir -p 02-download/$(VERSION_DIR)/ + mkdir -p $(@D) ezs $(EZSFLAGS) 02-download.ini < $< > $@ 03-enrichment/$(VERSION_DIR)/%.jsonl: 02-download/$(VERSION_DIR)/%.jsonl - @[ -d 03-enrichment/$(VERSION_DIR)/ ] || mkdir -p 03-enrichment/$(VERSION_DIR)/ + mkdir -p $(@D) ezs $(EZSFLAGS) 03-enrichment.ini < $< > $@ 04-refine/$(VERSION_DIR)/%.jsonl: 03-enrichment/$(VERSION_DIR)/%.jsonl - @[ -d 04-refine/$(VERSION_DIR)/ ] || mkdir -p 04-refine/$(VERSION_DIR)/ + mkdir -p $(@D) ezs $(EZSFLAGS) 04-refine.ini < $< > $@ -05-result/$(VERSION_DIR)/%.jsonl: 04-refine/$(VERSION_DIR)/%.jsonl - @[ -d 05-result/$(VERSION_DIR)/ ] || mkdir -p 05-result/$(VERSION_DIR)/ - cat $< > $@ +05-result/$(VERSION_DIR)/%.txt: 04-refine/$(VERSION_DIR)/%.jsonl + $(eval TMP1 = $(realpath $<)) + $(eval TMP2 = $(basename $(notdir $<))) + $(eval TMP3 = $(@D)/$(TMP2)) + mkdir -p $(TMP3) + cd $(TMP3) && cat $(TMP1) | split --additional-suffix=.jsonl -l 10000 - "$(TMP2)_" + find $(TMP3) -name "$(TMP2)_*" -exec readlink -f {} \;|xargs wc -l > $@ -# To prevent deleting intermediate files (useful for local dev and debug cases) -.PRECIOUS: 02-download/$(VERSION_DIR)/%.jsonl +# To prevent deleting intermediate files (useful for local tests) +# .PRECIOUS: 02-download/$(VERSION_DIR)/%.jsonl -.PHONY: clean +.DEFAULT_GOAL := help +.PHONY: clean drain view help list all