diff --git a/wos-dumps/01-query/wos-test.txt b/wos-dumps/01-query/wos-test.txt new file mode 100644 index 0000000..1bbb73a --- /dev/null +++ b/wos-dumps/01-query/wos-test.txt @@ -0,0 +1 @@ +CU=France AND PY=2018 AND DO=(10.1063/1.5066074) diff --git a/wos-dumps/03-enrichment.ini b/wos-dumps/03-enrichment.ini new file mode 100644 index 0000000..137f233 --- /dev/null +++ b/wos-dumps/03-enrichment.ini @@ -0,0 +1 @@ +[transit] diff --git a/wos-dumps/03-refine.ini b/wos-dumps/03-refine.ini deleted file mode 100644 index 1262ef5..0000000 --- a/wos-dumps/03-refine.ini +++ /dev/null @@ -1,52 +0,0 @@ -prepend = unpack -append = pack - -[use] -plugin = basics -plugin = conditor -plugin = lodex - -[assign] -path = tmp.reprint_addresses -value = get('static_data.fullrecord_metadata.reprint_addresses.address_name', []).castArray().map(name => name.address_spec.full_address) - -path = tmp.addresses -value = get('static_data.fullrecord_metadata.addresses.address_name', []).castArray().map(name => name.address_spec.full_address) - -[replace] -path = uri -value = get("UID") - -path = doi -value = get('dynamic_data.cluster_related.identifiers.identifier').find({ type: 'doi'}).get('value').toLower() - -path = title -value = get('static_data.summary.titles.title').filter(title => title.type === "item").map(title => title.content).join('|') - -path = abstract -value = get('static_data.fullrecord_metadata.abstracts.abstract.abstract_text.p', []).castArray().map(s => s.replace(/<[^>]*>/g, "").replace("Key Points", "")).join(" ; ") - -path = publication_year -value = get('static_data.summary.pub_info.pubyear') - -path = source -value = get('static_data.summary.titles.title').filter(title => title.type === "source").map(title => title.content).join('|') - -path = affiliations -value = get('tmp.addresses', []).concat(_.get(self, 'tmp.reprint_addresses')).remove(null).uniq() - -path = countries -value = get('static_data.fullrecord_metadata.addresses.address_name', 
[]).castArray().map("address_spec.country", []).uniq() - -path = keywords -value = get('static_data.item.keywords_plus.keyword') - -path = subjects -value = get('static_data.fullrecord_metadata.category_info.subjects.subject').map('content').uniq() - -path = subheadings -value = get('static_data.fullrecord_metadata.category_info.subheadings.subheading').castArray().uniq() - -path = headings -value = get('static_data.fullrecord_metadata.category_info.headings.heading').castArray().uniq() - diff --git a/wos-dumps/04-export.ini b/wos-dumps/04-export.ini deleted file mode 100644 index 137f233..0000000 --- a/wos-dumps/04-export.ini +++ /dev/null @@ -1 +0,0 @@ -[transit] diff --git a/wos-dumps/04-refine.ini b/wos-dumps/04-refine.ini new file mode 100644 index 0000000..4462769 --- /dev/null +++ b/wos-dumps/04-refine.ini @@ -0,0 +1,52 @@ +prepend = unpack +append = pack + +[use] +[debug] +text = AV +[assign] +path = tmp.reprint_addresses +value = get('static_data.fullrecord_metadata.reprint_addresses.address_name', []).castArray().map(name => name.address_spec.full_address) + +path = tmp.addresses +value = get('static_data.fullrecord_metadata.addresses.address_name', []).castArray().map(name => name.address_spec.full_address) + +[replace] +path = uri +value = get("UID") + +path = doi +value = get('dynamic_data.cluster_related.identifiers.identifier').find({ type: 'doi'}).get('value').toLower() + +path = title +value = get('static_data.summary.titles.title').filter(title => title.type === "item").map(title => title.content).join('|') + +path = abstract +value = get('static_data.fullrecord_metadata.abstracts.abstract.abstract_text.p', []).castArray().map(s => s.replace(/<[^>]*>/g, "").replace("Key Points", "")).join(" ; ") + +path = publication_year +value = get('static_data.summary.pub_info.pubyear') + +path = source +value = get('static_data.summary.titles.title').filter(title => title.type === "source").map(title => title.content).join('|') + +path = affiliations +value = 
get('tmp.addresses', []).concat(_.get(self, 'tmp.reprint_addresses')).remove(null).uniq() + +path = countries +value = get('static_data.fullrecord_metadata.addresses.address_name', []).castArray().map("address_spec.country", []).uniq() + +path = keywords +value = get('static_data.item.keywords_plus.keyword') + +path = subjects +value = get('static_data.fullrecord_metadata.category_info.subjects.subject').map('content').uniq() + +path = subheadings +value = get('static_data.fullrecord_metadata.category_info.subheadings.subheading').castArray().uniq() + +path = headings +value = get('static_data.fullrecord_metadata.category_info.headings.heading').castArray().uniq() + +[debug] +text = AP diff --git a/wos-dumps/05-export.ini b/wos-dumps/05-export.ini new file mode 100644 index 0000000..137f233 --- /dev/null +++ b/wos-dumps/05-export.ini @@ -0,0 +1 @@ +[transit] diff --git a/wos-dumps/Makefile b/wos-dumps/Makefile index ffd3e93..1ef86d3 100644 --- a/wos-dumps/Makefile +++ b/wos-dumps/Makefile @@ -1,5 +1,10 @@ +# Aliases +wos-france: wos-2018-france wos-2019-france wos-2020-france wos-2021-france wos-2022-france + # To set specific directory for each version ifeq ($(strip $(Startup)),) +# use the day of the week +#VERSION_DIR := $(shell date +%A) # use the month VERSION_DIR := $(shell date +%B) else @@ -8,52 +13,75 @@ endif # To set ezs parameters -EZSFLAGS := -v --param VERSION_DIR="$(VERSION_DIR)" - +EZSFLAGS := --param VERSION_DIR="$(VERSION_DIR)" +# To set global parameters +ROOT_DIR := $(shell pwd) # To set the location and the extension of sources files (queries) INPUT_DIR := 01-query INPUT_EXT := txt # To set the location and the extension of results files -OUTPUT_DIR := 04-export/$(VERSION_DIR) -OUTPUT_EXT := jsonl - -# To set custom aliases -wos-2018-france: $(OUTPUT_DIR)/wos-2018-france.jsonl -wos-2019-france: $(OUTPUT_DIR)/wos-2019-france.jsonl -wos-2020-france: $(OUTPUT_DIR)/wos-2020-france.jsonl -wos-2021-france: $(OUTPUT_DIR)/wos-2021-france.jsonl 
-wos-2022-france: $(OUTPUT_DIR)/wos-2022-france.jsonl - +OUTPUT_DIR := 06-result/$(VERSION_DIR) +OUTPUT_EXT := txt # To generate all files from source directory to target directory SOURCE_FILES := $(wildcard $(INPUT_DIR)/*.$(INPUT_EXT)) TARGET_FILES := $(patsubst $(INPUT_DIR)/%.$(INPUT_EXT), $(OUTPUT_DIR)/%.$(OUTPUT_EXT), $(SOURCE_FILES)) -all: $(TARGET_FILES) -# Step 2: to download all result -02-download/$(VERSION_DIR)/%.jsonl: $(INPUT_DIR)/%.txt - @[ -d 02-download/$(VERSION_DIR)/ ] || mkdir -p 02-download/$(VERSION_DIR)/ - ezs $(EZSFLAGS) 02-download.ini < $< > $@ +# Phony Rules +help: + @grep -P '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' -# Step 3: to refine and select only chosen fields -03-refine/$(VERSION_DIR)/%.jsonl: 02-download/$(VERSION_DIR)/%.jsonl - @[ -d 03-refine/$(VERSION_DIR)/ ] || mkdir -p 03-refine/$(VERSION_DIR)/ - ezs $(EZSFLAGS) 03-refine.ini < $< > $@ +list: ## list all target files + @ls -1 $(TARGET_FILES) -# Step 4 : to create a export file -04-export/$(VERSION_DIR)/%.jsonl: 03-refine/$(VERSION_DIR)/%.jsonl - @[ -d 04-export/$(VERSION_DIR)/ ] || mkdir -p 04-export/$(VERSION_DIR)/ - ezs $(EZSFLAGS) 04-export.ini < $< > $@ +view: ## To view current generated files + find $(OUTPUT_DIR) -# To delete all generated files -clean: - @rm -f $(TARGET_FILES) - @echo "Files deleted!" 
+clean-cache: ## To delete enrichment cache + @rm -v -f $(TMPDIR)/memory -# To prevent deleting intermediate files +clean: ## To delete current generated files + @rm -v -f $(TARGET_FILES) + +drain: ## To delete all directories + @rm -v -Rf 02-download 03-enrichment 04-refine 05-export 06-result + +# Rules +all: $(TARGET_FILES) ## Build all files + +%: 06-result/$(VERSION_DIR)/%.txt + [ -f $< ] && rm $< + $(MAKE) $< + +02-download/$(VERSION_DIR)/%.jsonl: 01-query/%.txt + mkdir -p $(@D) + time ezs $(EZSFLAGS) 02-download.ini < $< > $@ + +03-enrichment/$(VERSION_DIR)/%.jsonl: 02-download/$(VERSION_DIR)/%.jsonl + mkdir -p $(@D) + time ezs $(EZSFLAGS) 03-enrichment.ini < $< > $@ + +04-refine/$(VERSION_DIR)/%.jsonl: 03-enrichment/$(VERSION_DIR)/%.jsonl + mkdir -p $(@D) + time ezs $(EZSFLAGS) 04-refine.ini < $< > $@ + +05-export/$(VERSION_DIR)/%.jsonl: 04-refine/$(VERSION_DIR)/%.jsonl + mkdir -p $(@D) + time ezs $(EZSFLAGS) 05-export.ini < $< > $@ + +# WARNING: split --additional-suffix is unavailable in busybox, so we should use find command +06-result/$(VERSION_DIR)/%.txt: 05-export/$(VERSION_DIR)/%.jsonl + $(eval TMP1 = $(realpath $<)) + $(eval TMP2 = $(basename $(notdir $<))) + $(eval TMP3 = $(@D)/$(TMP2)) + mkdir -p $(TMP3) + cd $(TMP3) && cat $(TMP1) | split -l 10000 - "$(TMP2)_" + find $(TMP3) -type f -name "$(TMP2)_??" -exec mv {} {}.jsonl \; + find $(TMP3) -type f -name "$(TMP2)_??.jsonl" -exec readlink -f {} \;|xargs wc -l > $@ + +# To prevent deleting intermediate files (for controls) .PRECIOUS: 02-download/$(VERSION_DIR)/%.jsonl -# To ignore non-file targets -.PHONY: clean - +.DEFAULT_GOAL := help +.PHONY: clean drain view help list all clean-cache