web-dumps/openalex-dumps/03-enrich.ini at 1cae9440e151feffd5cf823652a7da54a8b01a65

Fork: 0
tdm / web-dumps
Find file
Newer
Older
web-dumps / openalex-dumps / 03-enrich.ini
Nicolas Thouvenin on 21 Feb 9 KB add open-alex dumps
Raw Blame History
; NOTE(review): presumably ezs directives that run the `unpack` statement before the
; sections below and the `pack` statement after them — confirm against the ezs documentation.
prepend = unpack
append = pack
; On retire les notices non pertinentes en raison d'erreurs d'affiliations d'OpenAlex.
; Exemple pour le laboratoire GANIL
; [A] First collect the raw affiliation strings of every authorship that OpenAlex
; attached to the queried institution (its id is built from self.lodexStamp.queryIdentifier).
[assign]
path = filtered_raw_affiliation_strings
value = get("authorships").filter(authorship => authorship.institutions.some(institution => institution.id === `https://openalex.org/${self.lodexStamp.queryIdentifier}`)).flatMap(authorship => authorship.raw_affiliation_strings)

;[B] Dans un second temps on teste des regex pour vérifier que les adresses originales matchent bien avec le laboratoire requêté.
;[assign]
;path = filtered_and_tested_raw_affiliation_strings
;value = get("filtered_raw_affiliation_strings").some(item => /ganil/i.test(item) || /grand acc.*national.*ions.*lour/i.test(item) || /Large heavy ion Nat.*acc.*/i.test(item))

;[C] Enfin on supprime les notices non pertinentes.
;[remove]
;test = get("filtered_and_tested_raw_affiliation_strings").isEqual(false)
; }}}
[assign]
; [D] Derive flat, display-ready fields from the OpenAlex work record.
; No value in this section reads a field written by this same section.

; OpenAlex identifier without its URL prefix.
path = uri
value = get('id').replace('https://openalex.org/', '')

; DOI without the resolver prefix, lower-cased.
path = doi
value = get('doi').replace("https://doi.org/","").toLower()

; "Non" only when open_access.is_oa is strictly false, "Oui" for any other value.
path = is_oa
value = get("open_access.is_oa").thru(bool=>bool === false ?"Non" : "Oui")

; Open-access status with its first letter upper-cased.
path = oa_status
value = get("open_access.oa_status").capitalize()

; Display names of the keywords.
path = keywords
value = get('keywords').map('display_name')

; Display names of all authors.
path = author_name
value = get('authorships').map("author.display_name")

; [E] Keep only the authors whose authorship lists the queried institution.
path = filtered_author_name
value = get("authorships").filter(authorship => authorship.institutions.some(institution => institution.id === `https://openalex.org/${self.lodexStamp.queryIdentifier}`)).flatMap(authorship => authorship.author.display_name)

; Distinct display names of every institution found in the authorships.
path = institutions
value = get('authorships').flatMap("institutions").map("display_name").uniq()

; Display name of the primary location's source.
path = source
value = get("primary_location.source.display_name")

; "Non" only when is_retracted is strictly false, "Oui" for any other value.
path = is_retracted
value = get("is_retracted").thru(bool=>bool === false ?"Non" : "Oui")

; First and last page joined with " to ". Missing bounds (null, undefined, "") are dropped
; before joining, so a work with a single known page yields "12" instead of "12 to " or " to 12".
; Identical bounds collapse to one value. The last page is read through _.get so that a record
; without a biblio object does not raise.
path = pages
value = get("biblio.first_page").concat(_.get(self, "biblio.last_page")).compact().uniq().join(" to ")

path = volume
value = get("biblio.volume")

path = issue
value = get("biblio.issue")

; The four classification levels of each topic, as one array per topic. Each value is prefixed
; with the number of its hierarchical level, which is needed to build a hierarchical tree.
; The array is built directly (no join/split round trip), so a display name containing "@"
; can no longer be cut into extra levels.
path = classification
value = get("topics").map(item => [`1 - ${item.domain.display_name}`, `2 - ${item.field.display_name}`, `3 - ${item.subfield.display_name}`, `4 - ${item.display_name}`])

path = domains
value = get("topics").map("domain.display_name")

path = fields
value = get("topics").map("field.display_name")

path = subfields
value = get("topics").map("subfield.display_name")

path = topics
value = get("topics").map("display_name")

; Distinct country codes found across all authorships.
path = codes_iso2
value = get("authorships").flatMap("countries").uniq()

; The language code is passed to the JavaScript Intl.DisplayNames constructor, which spells it
; out in French: "fr" becomes "français".
path = language
value = get("language").thru(d => new Intl.DisplayNames(['FR'], { type: 'language'}).of(d))

[assign]
; Take the array of ISO2 country codes (codes_iso2, written by an earlier section) without the
; code of France. If the remaining array is not empty, other countries are involved ("Oui"),
; otherwise "Non".
; `without` is used instead of `pull` because `pull` mutates its array in place: it removed "FR"
; from the stored codes_iso2 field itself, and therefore also from `countries` below.
path = collaborations_internationales
value = get("codes_iso2").without("FR").thru(arr=>!_.isEmpty(arr)).thru(bool=>bool === false ?"Non" : "Oui")

; Each ISO2 code is passed to the JavaScript Intl.DisplayNames constructor, which spells it out
; in French: "FR" becomes "France".
path = countries
value = get("codes_iso2").map(d => new Intl.DisplayNames(['FR'], { type: 'region'}).of(d))

; Turn each sustainable development goal object into a string with a template string.
; Each string starts with "Goal", followed by the digits found after "/sdg/" in the "id" field
; (the goal number), then the "display_name" field (the goal name).
; An empty result is replaced by ["No sustainable development goal"].
path = sustainable_development_goals
value = get("sustainable_development_goals").map( item => `Goal ${item.id.match(/\/sdg\/(\d+)/)[1]} : ${item.display_name}`).thru(arr => _.isEmpty(arr) ? ["No sustainable development goal"] : arr)

; Distinct funder names.
path = funder_display_name
value = get("grants").map("funder_display_name").uniq()

; Build one [funder name, award id] pair per grant. _.compact drops missing values from each
; pair so that the join does not produce strings such as "funder : " when the id is absent.
; Each pair is then joined into a single "name : id" string. An empty result becomes ["n/a"].
path = funders_award_id
value = get("grants").map(obj=>[obj.funder_display_name,obj.award_id]).map(subarr=>_.compact(subarr)).map(subarr => subarr.join(" : ")).thru(arr => _.isEmpty(arr) ? ["n/a"] : arr)

; Check whether at least one landing page URL among the work's locations belongs to a HAL portal.
; The regex looks for a case-sensitive "hal" with a non-alphanumeric character on each side.
; The result is "Oui" or "Non" (used as a facet).
[assign]
path = is_hal
value = get("locations").map("landing_page_url").compact().some(url => /[^a-zA-Z0-9]hal[^a-zA-Z0-9]/.test(url)).thru(found => (found === false ? "Non" : "Oui"))

; Start from the "indexed_in" field (the bibliographic databases that index the work),
; append the "is_hal" value, drop every "Non", turn "Oui" into "hal", then sort the result.
[assign]
path = indexed_in_hal_enriched
value = get("indexed_in").concat(self.is_hal).without("Non").map(base => (base === "Oui" ? "hal" : base)).sort()

; Publisher of the journal, read from the primary location's source.
; Falls back to "n/a" when the path does not resolve (needed by the later steps).
[assign]
path = publisher_only
value = get("primary_location.source.host_organization_name","n/a")

; Array holding the publisher and, when it has any, its parent company or companies
; (3 hierarchy levels according to the original author).
; The order is said to be random, so the index cannot be relied upon.
; Falls back to ["n/a"] when the path does not resolve (needed by the later steps).
[assign]
path = linked_publishers
value = get("primary_location.source.host_organization_lineage_names",["n/a"])

; Compare the two fields created above (symmetric difference between [publisher_only] and
; linked_publishers) in order to remove the lowest-level publisher from the array.
; Intended outcome, per the original author:
; - publisher_only at the top level (0): the result is [].
; - publisher_only at level 1: the result is its parent (level 0).
; - publisher_only at the lowest level: the result is its 2 parent companies (levels 0 and 1).
[assign]
path = xor_publisher
value = get("publisher_only").castArray().xor(self.linked_publishers)

; Select the records whose xor_publisher is empty.
; NOTE(review): presumably the [swing/...] statements below only run for records where
; `test` is true — confirm against the ezs documentation.
[swing]
test = get("xor_publisher").isEmpty()

; Give those records their single value back: xor_publisher takes the value of publisher_only.
[swing/assign]
path = xor_publisher
value = get("publisher_only")

; Final publisher value. The only remaining case is the arrays holding levels 0 and 1.
; The listed names (direct parents of level-2 publishers, per the original author) are removed
; so that only top-level publishers remain. The result is then converted to a string.
; NOTE(review): the two trailing replace() calls presumably turn a textual "null" or
; "undefined" produced by the string conversion into "n/a" — TODO confirm.
[assign]
path = publisher
value = get("xor_publisher").pull("Springer Science+Business Media","Vandenhoeck & Ruprecht","University of California, Berkeley","University of California, Los Angeles","Siberian Branch of the Russian Academy of Sciences").toString().replace(null,"n/a").replace(undefined,"n/a")

; NOTE: the four lines of this comment describe the "abstract" field, which is computed further down in this file, not the author checks that follow.
; OpenAlex provides the abstract as an inverted index: an object whose keys are the words and whose values are the position(s) of each word in the abstract ({"the":[0,12,25], ...}).
; The keys can therefore contain "." or other special characters that the MongoDB Node.js driver forbids.
; To rebuild the text, the index is turned into [position, word] pairs, the pairs are sorted by position, only the words are kept, and they are joined into a single string.

; Group the authorships by author display name and keep the names that occur more than once.
; Each kept entry carries the display name, the number of occurrences and the non-empty
; raw author names found for it.
[assign]
path = identify_author_name_error
value = get("authorships").groupBy("author.display_name").map((group, name) => ({author_display_name: name, count: group.length, raw_author_names: _.compact(_.map(group, entry => entry.raw_author_name))})).filter(row => row.count > 1)

; Render each entry of identify_author_name_error as a sentence (in French, shown as is):
; display name, number of occurrences, and the raw names separated by " || ".
[assign]
path = display_author_name_error
value = get("identify_author_name_error").map(({author_display_name, count, raw_author_names}) => `${author_display_name} apparaît ${count} fois sous les noms : ${raw_author_names.join(" || ")}`)

; Detect works where every authorship that has affiliations carries the same affiliation text.
; Steps: join each authorship's raw_affiliation_strings into one string, deduplicate, drop
; empty strings, and count what is left. When exactly one distinct string remains, the value
; is the number of authorships, otherwise it is "". The final replace blanks out the value
; "1", so a work with a single authorship is not reported.
[assign]
path = check_single_affiliation
value = get("authorships").map("raw_affiliation_strings").map(item=>item.join()).uniq().compact().size().thru(v=>v===1? _.size(self.authorships):"").replace(/^1$/,"")

; Keep check_single_affiliation only when every authorship has at least one raw affiliation
; string. As soon as one authorship has none, the value is "".
[assign]
path = identify_same_affiliation_for_all_authors
value=get('authorships').map('raw_affiliation_strings').map(i=>_.size(i)).some(i=>i===0).thru(bool=>bool === true ? "" : self.check_single_affiliation)

; When filtered_and_tested_raw_affiliation_strings is strictly false, expose the raw
; affiliation strings collected in filtered_raw_affiliation_strings. Otherwise the value is [].
; NOTE(review): in this file filtered_and_tested_raw_affiliation_strings is only assigned by
; the commented-out section [B]. While that section stays disabled the field is not set here,
; so this value is always [] — confirm this is intended.
[assign]
path = retrieve_affiliation_normalization_errors
value= get('filtered_and_tested_raw_affiliation_strings').thru(bool=>bool===false ? self.filtered_raw_affiliation_strings : [])

; Rebuild the abstract text from abstract_inverted_index (word -> list of positions):
; emit one [position, word] pair per occurrence, order the pairs by position, keep the words
; and join them with single spaces.
[assign]
path = abstract
value = get("abstract_inverted_index").flatMap((positions, word) => positions.map(position => [position, word])).sort((left, right) => left[0] - right[0]).map(pair => pair[1]).join(' ')

; [F] Replace each record with a copy of itself stripped of the listed fields: raw OpenAlex
; fields that are not kept in the output, plus the intermediate fields created above
; (publisher_only, xor_publisher, check_single_affiliation).
[exchange]
value = omit(["ids","fwci","concepts","grants","datasets","versions","is_authors_truncated","biblio","abstract_inverted_index","indexed_in","display_name","publisher_only","xor_publisher","publication_date","countries_distinct_count","institutions_distinct_count","corresponding_author_ids","corresponding_institution_ids","has_fulltext","fulltext_origin","cited_by_percentile_year","is_paratext","primary_topic","locations_count","referenced_works","related_works","ngrams_url","cited_by_api_url","created_date","updated_date","check_single_affiliation"])
; }}}