diff --git a/wos-dumps/02-download.ini b/wos-dumps/02-download.ini
index 6d38e86..284ad4b 100644
--- a/wos-dumps/02-download.ini
+++ b/wos-dumps/02-download.ini
@@ -3,6 +3,7 @@
 [use]
 plugin = basics
 plugin = conditor
+plugin = analytics
 
 [TXTConcat]
 
@@ -19,6 +20,9 @@
 path = sortField
 value = LD+D
 
+[throttle]
+bySecond = 1
+
 [WOSFetch]
 url = https://wos-api.clarivate.com/api/wos
 retries = 5
@@ -26,3 +30,143 @@
 token = env('WOS_API_KEY')
 
 [ungroup]
+
+[replace]
+
+path = AccessionNumber
+value = get("UID","")
+
+path = Identifiers
+value = get("dynamic_data.cluster_related.identifiers.identifier", []) \
+  .thru(identifiers => ({ \
+    DOI: identifiers.find(i => i.type === "doi")?.value || "", \
+    ISSN: identifiers.find(i => i.type === "issn")?.value || "", \
+    eISSN: identifiers.find(i => i.type === "eissn")?.value || "", \
+    eISBN: identifiers.find(i => i.type === "eisbn")?.value || "", \
+    ArticleNumber: identifiers.find(i => i.type === "art_no")?.value || "", \
+    PMID: identifiers.find(i => i.type === "pmid")?.value || "" }))
+
+path = DocumentType
+value = get("static_data.summary.doctypes") \
+  .thru(obj => [].concat(obj || []).map(i => i?.doctype || null))
+
+path = Titles
+value = get("static_data.summary.titles.title") \
+  .thru(titles => ({ DocumentTitle: titles.find(i => i.type === "item")?.content || "", \
+    Source: titles.find(i => i.type === "source")?.content || "" }))
+
+path = Publisher
+value = get("static_data.summary.publishers.publisher.names.name") \
+  .thru(pub => ({ Publisher: pub?.full_name || "", \
+    PublisherUnifiedName: (pub?.unified_name || "").toUpperCase() }))
+
+path = PubInfo
+value = get("static_data.summary.pub_info") \
+  .thru(pub => ({ Volume: pub?.vol || "", Issue: pub?.issue || "", \
+    BeginningPage: pub?.page?.begin || "", EndingPage: pub?.page?.end || "", \
+    YearPublished: pub?.pubyear || "", EarlyAccessYear: pub?.early_access_year || "" }))
+
+path = Conference
+value = get("static_data.summary.conferences.conference") \
+  .thru(conf => ({ ConferenceDate: conf?.conf_dates?.conf_date?.content || "", \
+    ConferenceTitle: conf?.conf_titles?.conf_title || "", \
+    ConferenceCity: conf?.conf_locations?.conf_location?.conf_city || "", \
+    ConferenceState: conf?.conf_locations?.conf_location?.conf_state || "" }))
+
+path = Authors
+value = get("static_data.summary.names.name") \
+  .thru(authors => [].concat(authors || []).filter(i => i.role === "author") \
+    .map(i => ({ wos_standard: i.wos_standard, \
+      full_name_deburred: _.deburr(i.preferred_name?.full_name || ""), \
+      full_name: i.preferred_name?.full_name || "" })))
+
+path = AuthorsWithAddress
+value = get("static_data.fullrecord_metadata.addresses.address_name", []) \
+  .thru(addresses => { \
+    let authorsData = {}; \
+    [].concat(addresses || []).forEach(addr => { \
+      [].concat(addr.names?.name || []).filter(a => a.role === "author").forEach(author => { \
+        let key = author.wos_standard || author.full_name || "Unknown"; \
+        if (!authorsData[key]) { \
+          authorsData[key] = { \
+            wos_standard: author.wos_standard || "Unknown", \
+            full_name: author.preferred_name?.full_name || author.full_name || "Unknown", \
+            countries: new Set(), \
+            cities: new Set(), \
+            full_addresses: new Set() \
+          }; \
+        } \
+        authorsData[key].countries.add(addr.address_spec?.country || ""); \
+        authorsData[key].cities.add(addr.address_spec?.city || ""); \
+        authorsData[key].full_addresses.add(addr.address_spec?.full_address || ""); \
+      }); \
+    }); \
+    return Object.values(authorsData).map(a => ({ \
+      wos_standard: a.wos_standard, \
+      full_name: a.full_name, \
+      country: [...a.countries], \
+      city: [...a.cities], \
+      full_address: [...a.full_addresses] \
+    })); \
+  })
+
+path = ReprintAddresses
+value = get("static_data.fullrecord_metadata.reprint_addresses.address_name", []) \
+  .thru(reprints => [].concat(reprints || []).map(addr => { \
+    let authors = [].concat(addr.names?.name || []).filter(a => a.role === "author"); \
+    let wos_standard = authors.length > 0 ? authors.map(a => a.wos_standard || "Unknown") : ["Unknown"]; \
+    let full_name = authors.length > 0 ? authors.map(a => a.full_name || "Unknown") : ["Unknown"]; \
+    return { \
+      wos_standard: wos_standard, \
+      full_name: full_name, \
+      country: addr.address_spec?.country || "", \
+      city: addr.address_spec?.city || "", \
+      full_address: addr.address_spec?.full_address || "" \
+    }; \
+  }))
+
+path = KeywordsPlus
+value = get("static_data.item.keywords_plus.keyword", []) \
+  .thru(kw => [].concat(kw || []))
+
+path = AuthorKeywords
+value = get("static_data.fullrecord_metadata.keywords.keyword", []) \
+  .thru(kw => [].concat(kw || []))
+
+path = CitationCountWOS
+value = get("dynamic_data.citation_related.tc_list.silo_tc", []) \
+  .thru(silos => [].concat(silos || []).find(i => i.coll_id === "WOS")?.local_count || 0)
+
+path = CitationTopics
+value = get("dynamic_data.citation_related.citation_topics.subj-group.subject", []) \
+  .thru(subjects => { \
+    let subjectArray = [].concat(subjects || []); \
+    return { \
+      Macro: subjectArray.find(i => i["content-type"] === "macro") ? \
+        `${subjectArray.find(i => i["content-type"] === "macro")["content-id"]} - ${subjectArray.find(i => i["content-type"] === "macro").content}` : "", \
+      Meso: subjectArray.find(i => i["content-type"] === "meso") ? \
+        `${subjectArray.find(i => i["content-type"] === "meso")["content-id"]} - ${subjectArray.find(i => i["content-type"] === "meso").content}` : "", \
+      Micro: subjectArray.find(i => i["content-type"] === "micro") ? \
+        `${subjectArray.find(i => i["content-type"] === "micro")["content-id"]} - ${subjectArray.find(i => i["content-type"] === "micro").content}` : "" \
+    }; \
+  })
+
+path = Categories
+value = get("static_data.fullrecord_metadata.category_info.subjects.subject", []) \
+  .thru(subjects => ({ ResearchAreas: subjects.find(i => i.ascatype === "extended")?.content || "", \
+    WebofScienceCategories: subjects.find(i => i.ascatype === "traditional")?.content || "" }))
+
+path = NormalizedLanguage
+value = get("static_data.fullrecord_metadata.normalized_languages.language", []) \
+  .thru(langs => [].concat(langs || []).map(i => i.content || ""))
+
+path = NormalizedDocumentType
+value = get("static_data.fullrecord_metadata.normalized_doctypes.doctype", []) \
+  .thru(docs => [].concat(docs || []).map(i => i || ""))
+
+path = Abstract
+value = get("static_data.fullrecord_metadata.abstracts.abstract.abstract_text.p","")
+
+path = SDG
+value = get("dynamic_data.citation_related.SDG.sdg_category", []) \
+  .castArray().map("content")
\ No newline at end of file
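For orientation (not part of the patch): given the `[replace]` paths above, each line of the resulting `.jsonl` file should be a record shaped roughly like the sketch below. The field names come from the configuration; all values are invented placeholders, and several fields are omitted. A real record would sit on a single JSON Lines row; it is pretty-printed here for readability.

```json
{
  "AccessionNumber": "WOS:000000000000001",
  "Identifiers": { "DOI": "10.1234/example.2021.001", "ISSN": "1234-5678", "eISSN": "", "eISBN": "", "ArticleNumber": "", "PMID": "" },
  "DocumentType": ["Article"],
  "Titles": { "DocumentTitle": "An invented document title", "Source": "JOURNAL OF EXAMPLES" },
  "PubInfo": { "Volume": "12", "Issue": "3", "BeginningPage": "101", "EndingPage": "110", "YearPublished": "2021", "EarlyAccessYear": "" },
  "Authors": [{ "wos_standard": "Doe, J", "full_name_deburred": "Doe, Jane", "full_name": "Doe, Jane" }],
  "CitationCountWOS": 4
}
```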
diff --git a/wos-dumps/03-enrich.ini b/wos-dumps/03-enrich.ini
index 3daddc0..5be34be 100644
--- a/wos-dumps/03-enrich.ini
+++ b/wos-dumps/03-enrich.ini
@@ -1,2 +1,41 @@
-[transit]
+append = pack
+
+[use]
+; URLConnect
+plugin = basics
+plugin = analytics
+[unpack]
+[assign]
+path = RevueVolume
+value = fix(`${self.Titles.Source}${self.PubInfo.Volume ? ' Volume:'+self.PubInfo.Volume :''}${self.PubInfo.Issue ? ' Issue:'+self.PubInfo.Issue :''}${self.Identifiers.ArticleNumber ? ' Article Number:'+self.Identifiers.ArticleNumber :''}${self.PubInfo.BeginningPage && self.PubInfo.EndingPage ? ' Pages:'+self.PubInfo.BeginningPage+'-'+self.PubInfo.EndingPage :''} Published:${self.PubInfo.YearPublished}`)
+
+[assign]
+path = DisciplinesESI
+value = get("Categories.WebofScienceCategories")
+[expand]
+path = DisciplinesESI
+size = 100
+file = ./03.1-enrich-ESI.ini
+
+[assign]
+path = JCRSubjectCategory
+value = get("Titles.Source")
+[expand]
+path = JCRSubjectCategory
+size = 100
+file = ./03.2-enrich-JCR.ini
+
+[assign]
+path = Unpaywall
+value = get("Identifiers.DOI")
+[swing]
+test = get("Unpaywall").isEmpty()
+reverse = true
+[swing/expand]
+path = Unpaywall
+size = 100
+[swing/expand/URLConnect]
+url = https://biblio-tools.services.istex.fr/v2/unpaywall/works/expand
+timeout = 3600000
+noerror = false
+retries = 5
\ No newline at end of file
diff --git a/wos-dumps/03.1-enrich-ESI.ini b/wos-dumps/03.1-enrich-ESI.ini
new file mode 100644
index 0000000..84dd77e
--- /dev/null
+++ b/wos-dumps/03.1-enrich-ESI.ini
@@ -0,0 +1,39 @@
+# Create an environment variable that stores the date in ISO format
+[env]
+path = date
+value = thru(d => (new Date()).toISOString().split("T")[0])
+
+# Processes each value of the list individually (each code, here)
+[map]
+path = value
+# Create a temporary key to store the current value
+
+[map/replace]
+path = tmpkey
+value = self()
+
+[map/combine]
+path = tmpkey
+# URL of a TSV file reachable over the Internet
+primer = http://mapping-tables.daf.intra.inist.fr/SC-DiscESI-Correspondance.tsv
+
+# Local temporary cache file named with the date defined in [env] (optional; avoids downloading the file several times)
+cacheName = ESI
+default = n/a
+
+[map/combine/URLStream]
+path = false
+
+[map/combine/CSVParse]
+separator = fix("\t")
+
+[map/combine/CSVObject]
+# Once the TSV has been parsed into JSON objects ({"From": "code1", "To": "Intitulé1"}), the value of "From" becomes the key and the value of "To" the value. The object is then reshaped with "intitule" as the key.
+[map/combine/replace]
+path = id
+value = get('From')
+path = value
+value = get('To').split().zipObject(["intitule"]).invert()
+# Finally, only the value of "intitule" is extracted after the mapping
+[map/exchange]
+value = get('tmpkey.value.intitule')
\ No newline at end of file
diff --git a/wos-dumps/03.2-enrich-JCR.ini b/wos-dumps/03.2-enrich-JCR.ini
new file mode 100644
index 0000000..0e7910c
--- /dev/null
+++ b/wos-dumps/03.2-enrich-JCR.ini
@@ -0,0 +1,27 @@
+[assign]
+path = value
+value = get("value")
+
+[combine]
+path = value
+primer = http://mapping-tables.daf.intra.inist.fr/JCR_AvecNotoriete.tsv
+; name of the local temporary cache file (optional; avoids downloading the file several times)
+cacheName = jCRCitation
+default = 0 - Sans notoriété
+
+[combine/URLStream]
+path = false
+
+[combine/CSVParse]
+separator = fix("\t")
+
+[combine/CSVObject]
+
+[combine/replace]
+path = id
+value = get('From')
+path = value
+value = get('To')
+[assign]
+path = value
+value = get("value.value")
\ No newline at end of file
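For orientation (not part of the patch): both enrichment sub-flows above join records against two-column `From`/`To` mapping tables, as the `[combine/replace]` blocks (`get('From')` / `get('To')`) indicate. Assuming the TSV files served by `mapping-tables.daf.intra.inist.fr` follow that layout, they would look like the sketch below (tab-separated; the rows are invented for illustration):

```tsv
From	To
ACOUSTICS	Engineering
AGRICULTURAL ECONOMICS & POLICY	Economics & Business
```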
diff --git a/wos-dumps/README.md b/wos-dumps/README.md
new file mode 100644
index 0000000..a2d84df
--- /dev/null
+++ b/wos-dumps/README.md
@@ -0,0 +1,86 @@
+# WoS-dumps
+
+Collection, restructuring, and enrichment of Web of Science data.
+
+The files produced are in JSON Lines format (`.jsonl`).
+
+> 📗 As long as a step has not finished, the resulting file is suffixed with `.crdownload`.
+
+To query the API, you must provide your API key. Since it cannot be disclosed here, it is declared as an environment variable in the configuration of the EZMaster instance. See below.
+
+## Steps
+
+### 01-query
+
+Drop a `NOM_DU_FICHIER.txt` file (the query) into the `01-query` directory.
+
+### 02-download
+
+Downloads data from the WoS API using the query.
+
+Given the structure of the retrieved JSON, the code has been factored to reduce redundancy and, above all, the number of data lookups. The extracted data are therefore grouped according to their nesting:
+
+- `AccessionNumber` => the WoS UTs (accession numbers)
+- `Identifiers` => `DOI`, `ISSN`, `eISSN`, `eISBN`, `ArticleNumber` & `PMID`
+- `DocumentType` => the document types
+- `Titles` => `DocumentTitle` & `Source`
+- `PubInfo` => `Volume`, `Issue`, `BeginningPage`, `EndingPage`, `YearPublished` & `EarlyAccessYear`
+- `Conference` => `ConferenceDate`, `ConferenceTitle`, `ConferenceCity` & `ConferenceState`
+- `Authors` => for each author: `wos_standard`, `full_name_deburred` & `full_name`
+- `AuthorsWithAddress` => for each author: `wos_standard`, `full_name`, `country`, `city` & `full_address`
+- `ReprintAddresses` => for each author: `wos_standard`, `full_name`, `country`, `city` & `full_address`
+- `KeywordsPlus` => the WoS keywords
+- `AuthorKeywords` => the author keywords
+- `CitationCountWOS` => the citation count
+- `CitationTopics` => `Macro`, `Meso` & `Micro`
+- `Categories` => `ResearchAreas` & `WebofScienceCategories`
+- `NormalizedLanguage` => the language(s) of the document
+- `NormalizedDocumentType` => the normalized document types
+- `Abstract` => the document abstract
+- `SDG` => the Sustainable Development Goals
+
+### 03-enrich
+
+- `RevueVolume` => template string that concatenates, when the data exist, the following elements into a string of the form: `Source` `Volume:` `Issue:` `Article Number:` `Pages:`(`BeginningPage`-`EndingPage`) `Published:` (see the worked example at the end of this file)
+- `DisciplinesESI` => enrichment via a sub-flow. The `03.1-enrich-ESI.ini` file retrieves the ESI disciplines from `WebofScienceCategories`
+- `JCRSubjectCategory` => enrichment via a sub-flow. The `03.2-enrich-JCR.ini` file retrieves the reputation data of a journal from `Source`
+- `Unpaywall` => queries the Unpaywall web service only if the document has a DOI (avoids pointless queries)
+
+### 04-report
+
+This step generates one file per query, containing the number of lines in the file (and therefore the number of records) and the name of the corresponding file in the `03-enrich` directory.
+
+## Configuration
+
+> ⚠️ **Warning**: use at least version 1.0.4 of `lodex-makefile` and version 14 of `node`.
+
+Make sure you have the following versions:
+
+```json
+"packages": [
+  "@ezs/conditor@2.11.1",
+  "@ezs/analytics@2.3.3",
+  "@ezs/basics@2.8.0",
+  "@ezs/core@3.10.7"
+]
+```
+
+Rest of the [configuration](../wos-dumps-config.json):
+
+```json
+{
+  "environnement": {
+    "EZS_VERBOSE": false,
+    "EZS_PIPELINE_DELAY": 3600,
+    "WOS_API_KEY": "********************************"
+  },
+  "files": {
+    "zip": "https://gitbucket.inist.fr/tdm/web-dumps/archive/wos-dumps/wos-dumps@4.0.0.zip"
+  }
+}
```
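For orientation (not part of the patch): applying the `RevueVolume` template from `03-enrich.ini` to a record with the invented values `Source = "JOURNAL OF EXAMPLES"`, `Volume = "12"`, `Issue = "3"`, `BeginningPage = "101"`, `EndingPage = "110"`, `YearPublished = "2021"` and no `ArticleNumber` would yield:

```text
JOURNAL OF EXAMPLES Volume:12 Issue:3 Pages:101-110 Published:2021
```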