diff --git a/wos-dumps/02-download.ini b/wos-dumps/02-download.ini index 7cd5932..8a54374 100644 --- a/wos-dumps/02-download.ini +++ b/wos-dumps/02-download.ini @@ -28,7 +28,7 @@ retries = 5 timeout = 120000 token = env('WOS_API_KEY') -step = 20 +step = 100 [ungroup] @@ -50,7 +50,7 @@ return acc}, { DOI: "", ISSN: "", eISSN: "", eISBN: "", ArticleNumber: "", PMID: "" })) path = DocumentType -value = get("static_data.summary.doctypes").thru(obj => [].concat(obj || []).map(i => i?.doctype || null)) +value = get("static_data.summary.doctypes").thru(obj => [].concat(obj || []).map(i => i?.doctype || null)).flatten() path = Titles value = get("static_data.summary.titles.title") \ @@ -59,8 +59,7 @@ path = Publisher value = get("static_data.summary.publishers.publisher.names.name") \ - .thru(pub => ({ Publisher: pub?.full_name || "", \ - PublisherUnifiedName: (pub?.unified_name || "").toUpperCase() })) + .thru(pub => _.toUpper(pub?.unified_name || pub?.full_name || "")) path = PubInfo value = get("static_data.summary.pub_info") \ @@ -93,6 +92,7 @@ authorsData[key] = { \ wos_standard: author.wos_standard || "Unknown", \ full_name: author.preferred_name?.full_name || author.full_name || "Unknown", \ + full_name_deburred: _.deburr(author.preferred_name?.full_name || author.full_name || "Unknown"), \ addresses: [] \ }; \ } \ @@ -105,28 +105,36 @@ authorsData[key].addresses.push({ full_address, country, city, organizations }); \ }); \ }); \ - return Object.values(authorsData).map(a => ({ \ - wos_standard: a.wos_standard, \ - full_name: a.full_name, \ - addresses: a.addresses \ - })); \ + return Object.values(authorsData); \ }) path = ReprintAddresses value = get("static_data.fullrecord_metadata.reprint_addresses.address_name", []) \ - .thru(reprints => [].concat(reprints || []).map(addr => { \ - let authors = [].concat(addr.names?.name || []).filter(a => a.role === "author"); \ - let wos_standard = authors.length > 0 ? 
authors.map(a => a.wos_standard || "Unknown") : ["Unknown"]; \ - let full_name = authors.length > 0 ? authors.map(a => a.full_name || "Unknown") : ["Unknown"]; \ - return { \ - wos_standard: wos_standard, \ - full_name: full_name, \ - country: addr.address_spec?.country || "", \ - city: addr.address_spec?.city || "", \ - full_address: addr.address_spec?.full_address || "" \ - }; \ - })) + .thru(reprints => { \ + let authorsData = {}; \ + [].concat(reprints || []).forEach(addr => { \ + let author = addr.names?.name || {}; \ + let key = author.wos_standard || author.full_name || "Unknown"; \ + if (!authorsData[key]) { \ + authorsData[key] = { \ + wos_standard: author.wos_standard || "Unknown", \ + full_name: author.full_name || "Unknown", \ + full_name_deburred: _.deburr(author.full_name || "Unknown"), \ + addresses: [] \ + }; \ + } \ + authorsData[key].addresses.push({ \ + full_address: addr.address_spec?.full_address || "Unknown address", \ + country: addr.address_spec?.country || "Unknown country", \ + city: addr.address_spec?.city || "Unknown city", \ + organizations: _.castArray(addr.address_spec?.organizations?.organization || []) \ + .filter(org => org.pref === "Y") \ + .map(org => org.content) \ + }); \ + }); \ + return Object.values(authorsData); \ + }) path = KeywordsPlus value = get("static_data.item.keywords_plus.keyword", []) \ @@ -156,16 +164,14 @@ path = Categories value = get("static_data.fullrecord_metadata.category_info.subjects.subject", []) \ - .thru(subjects => ({ ResearchAreas: subjects.find(i => i.ascatype === "extended")?.content || "", \ - WebofScienceCategories: subjects.find(i => i.ascatype === "traditional")?.content || "" })) + .thru(subjects => ({ \ + ResearchAreas: _.castArray(subjects).filter(i => i.ascatype === "extended").map(i => i.content), \ + WebofScienceCategories: _.castArray(subjects).filter(i => i.ascatype === "traditional").map(i => i.content)})) path = NormalizedLanguage value = 
get("static_data.fullrecord_metadata.normalized_languages.language", []) \ .thru(langs => [].concat(langs || []).map(i => i.content || "")) -path = NormalizedDocumentType -value = get("static_data.fullrecord_metadata.normalized_doctypes.doctype", []).thru(docs => [].concat(docs || []).map(i => i || "")) - path = Abstract value = get("static_data.fullrecord_metadata.abstracts.abstract.abstract_text.p","") diff --git a/wos-dumps/03-enrich.ini b/wos-dumps/03-enrich.ini index a77ab43..a4dbcf7 100644 --- a/wos-dumps/03-enrich.ini +++ b/wos-dumps/03-enrich.ini @@ -5,6 +5,11 @@ plugin = analytics [unpack] + +[env] +path = dictionary +value = fix({"PEOPLES R CHINA": "CHINA","USA": "UNITED STATES","ENGLAND": "UNITED KINGDOM","WALES": "UNITED KINGDOM","SCOTLAND": "UNITED KINGDOM","BOSNIA & HERCEG": "BOSNIA & HERZEGOVINA","COTE IVOIRE": "COTE D’IVOIRE","NORTH IRELAND": "UNITED KINGDOM","DEM REP CONGO": "CONGO - KINSHASA","REP CONGO": "CONGO - BRAZZAVILLE","CZECH REPUBLIC": "CZECHIA","DOMINICAN REP": "DOMINICAN REPUBLIC","MYANMAR": "MYANMAR (BURMA)","CENT AFR REPUBL": "CENTRAL AFRICAN REPUBLIC","EQUAT GUINEA": "EQUATORIAL GUINEA","TRINIDAD TOBAGO": "TRINIDAD & TOBAGO","BRITISH VIRGIN ISL": "BRITISH VIRGIN ISLANDS","PAPUA N GUINEA": "PAPUA NEW GUINEA","U ARAB EMIRATES": "UNITED ARAB EMIRATES","MACEDONIA": "NORTH MACEDONIA","VATICAN": "VATICAN CITY","TURKEY": "TURKIYE","SWAZILAND": "ESWATINI","ST KITTS & NEVI": "ST. KITTS & NEVIS","ST HELENA": "ST. HELENA","SAO TOME & PRIN": "SAO TOME & PRINCIPE","PALESTINE": "PALESTINIAN TERRITORIES","GUINEA BISSAU": "GUINEA-BISSAU","FALKLAND ISLAND": "FALKLAND ISLANDS","ANTIGUA & BARBU": "ANTIGUA & BARBUDA"}) + [assign] path = RevueVolume value = fix(`${self.Titles.Source}${self.PubInfo.Volume ? ' Volume:'+self.PubInfo.Volume :''}${self.PubInfo.Issue ? ' Issue:'+self.PubInfo.Issue :''}${self.Identifiers.ArticleNumber ? ' Article Number:'+self.Identifiers.ArticleNumber :''}${self.PubInfo.BeginningPage && self.PubInfo.EndingPage ? 
' Pages:'+self.PubInfo.BeginningPage+'-'+self.PubInfo.EndingPage :''} Published:${self.PubInfo.YearPublished}`) @@ -47,5 +52,98 @@ oa_status: data.oa_status || "inconnu", \ host_types: _.map(data.oa_locations, 'host_type') || []})) +[assign] +path = AuthorsWithAddress +value = get("AuthorsWithAddress").map(author => ({ \ + ...author, \ + addresses: author.addresses.map(addr => ({ \ + ...addr, \ + country: _.get(env("dictionary"), _.toUpper(_.deburr(addr.country || "")), _.toUpper(_.deburr(addr.country || ""))) \ + })) \ +})) + +[assign] +path = ReprintAddresses +value = get("ReprintAddresses").map(author => ({ \ + ...author, \ + addresses: author.addresses.map(addr => ({ \ + ...addr, \ + country: _.get(env("dictionary"), _.toUpper(_.deburr(addr.country || "")), _.toUpper(_.deburr(addr.country || ""))) \ + })) \ +})) + + +[assign] +path = CountriesToEnrich +value = pick(["AuthorsWithAddress", "ReprintAddresses"]).values().flatten().map("addresses").flatten().map("country").compact().uniq() +[expand] +path = CountriesToEnrich +size = 100 +file = ./03.3-enrich-Countries.ini + +[assign] +path = AuthorsWithAddress +value = get("AuthorsWithAddress").map(author => ({ \ + ...author, \ + addresses: author.addresses.map(addr => ({ \ + ...addr, \ + ...(_.find(self.CountriesToEnrich, e => e.pays === addr.country) || { \ + iso2: "N/A", \ + iso3: "N/A" \ + }) \ + })) \ +})) + +[assign] +path = ReprintAddresses +value = get("ReprintAddresses").map(author => ({ \ + ...author, \ + addresses: author.addresses.map(addr => ({ \ + ...addr, \ + ...(_.find(self.CountriesToEnrich, e => e.pays === addr.country) || { \ + iso2: "N/A", \ + iso3: "N/A" \ + }) \ + })) \ +})) + +[assign] +path = OrganizationISO3Pairs +value = pick(["AuthorsWithAddress", "ReprintAddresses"]) \ + .thru(data => _.flatMap(data, entries => \ + _.flatMap(entries || [], entry => \ + _.flatMap(entry.addresses || [], addr => \ + (addr.organizations || ["Unknown"]).map(org => `${addr.iso3} / ${org}`))))).uniq() + 
+[assign] +path = AuthorsNamesOnly +value = get("Authors").map("full_name_deburred") + +[assign] +path = AuthorsHasAddress +value = get("AuthorsWithAddress").map("full_name_deburred") + +[assign] +path = AuthorsToPut +value = get("AuthorsNamesOnly").xor(self.AuthorsHasAddress).map(auth =>`${auth}: Unknown` ) + +[assign] +path = AuthorsWithAddressesHtml +value = get("AuthorsWithAddress") \ + .groupBy("full_name") \ + .map((entries, author) => ({ \ + author: `${author}:`, \ + addresses: entries.flatMap(e => e.addresses.map(a => _.get(a, 'full_address', 'Unknown address'))).join("
- ")})) \ + .map(e => `${e.author} ${e.addresses}`).concat(self.AuthorsToPut) + +[assign] +path = ReprintAddressesHtml +value = get("ReprintAddresses") \ + .map(entry => ({ \ + author: `${entry.full_name}:`, \ + addresses: entry.addresses.map(a => _.get(a, 'full_address', 'Unknown address')).join("
- ") \ + })) \ + .map(e => `${e.author} ${e.addresses}`) + [exchange] -value = omit(["Unpaywall"]) +value = omit(["Unpaywall","AuthorsNamesOnly","AuthorsHasAddress","AuthorsToPut"]) diff --git a/wos-dumps/03.3-enrich-Countries.ini b/wos-dumps/03.3-enrich-Countries.ini new file mode 100644 index 0000000..2ee2937 --- /dev/null +++ b/wos-dumps/03.3-enrich-Countries.ini @@ -0,0 +1,37 @@ +# Création d'une variable d'environnement qui stocke la date au format ISO +[env] +path = date +value = thru(d => (new Date()).toISOString().split("T")[0]) +# Sert à traiter individuellement chaque valeur de la liste (chaque code ici) +[map] +path = value +# Création d'une clé temporaire pour stocker la valeur actuelle +[map/replace] +path = tmpkey +value = self() + +[map/combine] +path = tmpkey +# URL vers un fichier CSV accessible via internet +primer = http://mapping-tables.daf.intra.inist.fr/tablePaysIso2&3.tsv + +# Création d'un fichier temporaire local avec la date définie dans [env] (facultatif, évite de télécharger le fichier plusieurs fois) +cacheName = env("date").prepend("WosCountries") +default = "n/a" + +[map/combine/URLStream] +path = false + +[map/combine/CSVParse] +separator = fix("\t") + +[map/combine/CSVObject] +# Une fois le TSV parsé en objets JSON, la valeur de "From" est transformé en clé et celle de "To" en valeur ({"From": "code1", "To": "Intitulé1"}). Puis on transforme l'objet avec "intitule" comme clé. 
+[map/combine/assign] +path = id +value = get('FROM') +path = value +value = get('TO').split().zipObject(["intitule"]).invert() +# Enfin on extrait uniquement la valeur de "intitule" après le mapping +[map/exchange] +value = get('tmpkey.value.intitule').split(";").thru(arr => ({ pays: arr[0] || "Unknown", iso2: arr[1] || "Unknown", iso3: arr[2] || "Unknown" })) \ No newline at end of file diff --git a/wos-dumps/README.md index 316f0c2..3e1da3f 100644 --- a/wos-dumps/README.md +++ b/wos-dumps/README.md @@ -25,18 +25,18 @@ - `Identifiers` => `DOI`, `ISSN`, `eISSN`, `eISBN`, `ArticleNumber` & `PMID` - `DocumentType` => Les types de document - `Titles` => `DocumentTitle` & `Source` +- `Publisher` => Récupère le nom normalisé (`unified_name`) s'il existe, sinon `full_name` - `PubInfo` => `Volume`, `Issue`, `BeginningPage`, `EndingPage`, `YearPublished` & `EarlyAccessYear` - `Conference` => `ConferenceDate`, `ConferenceTitle`, `ConferenceCity` & `ConferenceState` - `Authors` => Pour chaque auteur `wos_standard`, `full_name_deburred` & `full_name` -- `AuthorsWithAddress` => Pour chaque auteur `wos_standard`, `full_name`, `country`, `city` & `full_address` -- `ReprintAddresses` => Pour chaque auteur `wos_standard`, `full_name`, `country`, `city` & `full_address` +- `AuthorsWithAddress` => Pour chaque auteur `wos_standard`, `full_name`, `full_name_deburred` & `addresses`. Cette clé contient `full_address`, `country`, `city` & `organizations`. +- `ReprintAddresses` => Pour chaque auteur `wos_standard`, `full_name`, `full_name_deburred` & `addresses`. Cette clé contient `full_address`, `country`, `city` & `organizations`. 
- `KeywordsPlus` => Les mots-clés WoS - `AuthorKeywords` => Les mots-clés d'auteurs - `CitationCountWOS` => Le nombre de citations - `CitationTopics` => `Macro`, `Meso` & `Micro` - `Categories` => `ResearchAreas` & `WebofScienceCategories` - `NormalizedLanguage` => La/les langue(s) du document -- `NormalizedDocumentType` => Les types de document normalisés - `Abstract` => Le résumé du document - `SDG` => Les Sustainable Development Goals @@ -46,7 +46,11 @@ - `RevueVolume` => Template string qui concatène, si les données existent, les éléments suivants dans une chaîne du type : `Source` `Volume:` `Issue:` `Article Number:` `Pages:`(`BeginningPage EndingPage`) `Published:` - `DisciplinesESI` => enrichissement par sous-flux. Le fichier `03.1-enrich-ESI.ini` récupère les disciplines ESI à partir de `WebofScienceCategories` - `JCRSubjectCategory` => enrichissement par sous-flux. Le fichier `03.2-enrich-JCR.ini` récupère les données relatives à la notoriété d'une revue à partir de `Source` -- `Unpaywall` interroge le web-service Unpaywall uniquement si le document possède un DOI (évite les interrogations inutiles) +- `OAInfo` interroge le web-service Unpaywall uniquement si le document possède un DOI (évite les interrogations inutiles). Récupère `is_oa`, `oa_status` & `host_types` +- `AuthorsWithAddress` & `ReprintAddresses` sont enrichis avec `pays`, `iso2` & `iso3`. Dans un 1er temps `country` est soumis à un dictionnaire afin de convertir les quelques pays pour lesquels le Wos donne une verbalisation "non standard". Les pays sont ensuite enrichis par sous-flux via le fichier `03.3-enrich-Countries.ini` qui récupère `pays`, `iso2` & `iso3`. On utilise ensuite .find() qui cherche les objets où `pays` correspond à `country` et lorsque c'est le cas fusionne les données. Ce qui permet d'ajouter correctement `iso2` et `iso3` à chaque auteur dans `AuthorsWithAddress` & `ReprintAddresses`. +- `OrganizationISO3Pairs` => associe les organismes avec leur(s) pays. 
+- `AuthorsWithAddressesHtml` & `ReprintAddressesHtml` => Pour un affichage clair dans Lodex en format liste à puces on met chaque nom d'auteur entre balises `b`, puis on ajoute son adresse. S'il en a plusieurs, elles sont séparées par une balise `br` et un `-` pour créer un effet de sous-puce dans la liste. + ### 04-report