diff --git a/affiliations-tools/README.md b/affiliations-tools/README.md index 776ad18..4611527 100644 --- a/affiliations-tools/README.md +++ b/affiliations-tools/README.md @@ -10,7 +10,6 @@ - `rnsr`: déduit de l'adresse d'une affiliation d'auteur et d'une date de publication (l'année suffit) zéro, un ou plusieurs identifiants RNSR (correspondant à une ou plusieurs structures de recherche française(s)). --`corporate` : permet de savoir si une affiliation relève d'un organisme privé Cet appariement suit des [règles certaines](https://github.com/Inist-CNRS/ezs/blob/master/packages/conditor/README.md#r%C3%A8gles-certaines). @@ -49,7 +48,6 @@ - [v1/rnsr/csv](#v1%2frnsr%2fcsv) - [v1/rnsr/json](#v1%2frnsr%2fjson) - [v1/rnsr/conditor](#v1%2frnsr%2fconditor) -- [v1/corporate](#v1%2fcorporate) ### v1/rnsr/csv @@ -249,37 +247,3 @@ ] EOF ``` - -### v1/corporate - -Prend en entrée une adresse d'affiliation (WOS ou Scopus) et indique si l'organisme lié à l'affiliation est public ou privé. - -#### Configuration de l'instance - -Il faudra penser à modifier "EZS_CONCURRENCY" en le passant à 1. - -#### Exemple Info - -Entrée: - -```json -[ -{"id": 1, "value": "EDF DIPNN TEGG, 905 Ave Camp Menthe, F-13097 Aix En Provence, France"}, -{"id": 2, "value": "AiryLab SARL, 34 Rue Jean Baptiste Malon, F-04800 Greoux Les Bains, France"}, -{"id": 3, "value": "4G TECHNOLOGY, F-06370 Mouans Sartoux, France"}, -{"id": 4, "value": "Abeeway, Sophia Antipolis, France"}, -{"id": 5, "value": "Univ Cote dAzur, INRIA, Ansys, Nice, France"} -] -``` - -Sortie: - -```json -[ -{"id": 1, "value": "organisme: edf dipnn tegg, statut: private"}, -{"id": 2, "value": "organisme: airylab sarl, statut: private"}, -{"id": 3, "value": "organisme: 4g technology, statut: private"}, -{"id": 4, "value": "organisme: abeeway, statut: private"}, -{"id": 5, "value": "organisme: univ cote dazur, statut: public"} -] -``` diff --git a/affiliations-tools/requirements.txt b/affiliations-tools/requirements.txt deleted file mode 100644 index f789ca6..0000000 --- a/affiliations-tools/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -ratelimit -backoff -requests \ No newline at end of file diff --git a/affiliations-tools/v1/corporate/abreviations.py b/affiliations-tools/v1/corporate/abreviations.py deleted file mode 100644 index e9addc4..0000000 --- a/affiliations-tools/v1/corporate/abreviations.py +++ /dev/null @@ -1,29 +0,0 @@ -import csv -import json -import pickle - -# fonction : création du dictionnaire depuis un CSV : colonne 1 = key et colonne 2 = value -def transform_to_dict(csv_file): - my_dict = {} - with open(csv_file, 'r') as file: - f = csv.reader(file, delimiter='\t') - - for row in f: - key = row[0].lower().strip() - value = row[1].lower().strip() - my_dict[key] = value - - return my_dict - -# fonction : sauvegarde du dictionnaire dans un json -def save_dict_to_json(my_dict,my_json) : - with open(my_json, 'w') as fichier: - json.dump(my_dict, fichier) - -# appel des fonctions -fichier_csv = 'abreviations.csv' -resultat = transform_to_dict(fichier_csv) - -fichier_json = 'abreviations.json' -save_dict_to_json(resultat, fichier_json) - diff --git a/affiliations-tools/v1/corporate/private_public.ini b/affiliations-tools/v1/corporate/private_public.ini deleted file mode 100644 index e85c735..0000000 --- a/affiliations-tools/v1/corporate/private_public.ini +++ /dev/null @@ -1,35 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -post.responses.default.description = Return all objects with enrich fields -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Enrich one field of each Object with a Python function -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = path -post.parameters.0.schema.type = string -post.parameters.0.description = The path in each object to enrich with an Python script -post.parameters.1.in = query -post.parameters.1.name = indent -post.parameters.1.schema.type = boolean -post.parameters.1.description = Indent or not the JSON Result - -[use] -plugin = @ezs/spawn -plugin = @ezs/basics -plugin = @ezs/analytics - -[JSONParse] -separator = * - -[expand] -path = env('path', 'value') -size = 100 -# in production mode, uncomment the following line -# cache = boost - -[expand/exec] -# command should be executable ! -command = ./v1/corporate/private_public.py - -[dump] -indent = env('indent', false) diff --git a/affiliations-tools/v1/corporate/private_public.py b/affiliations-tools/v1/corporate/private_public.py deleted file mode 100755 index 2c27aa6..0000000 --- a/affiliations-tools/v1/corporate/private_public.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/python3 - -import json -import sys -import re -import urllib.parse -import requests -from ratelimit import limits, RateLimitException -from backoff import on_exception, expo - -# Filtrage par mot clé pour ne garder que l'essentiel -def filter(affiliation) : - affiliation_lower = affiliation.lower() - adress = affiliation_lower.replace(",", "") - words = adress.split(" ") - private = ["sas", "sarl", "sa", "private", "edf", "orange"] - public = ["univ", "hop", "uar", "umr", "cea", "cnrs"] - for word in words : - if word in private : - return "private" - elif word in public : - return "public" - return adress - -# requête pour récupérer le fichier json -def request_abreviation(url): - response = requests.get(url) - return response.json() - -# Normaliser les abréviations -def expandAbbreviations(affiliation,dict): - affiliation = affiliation.lower() - arr = affiliation.split(" ") - res = [] - for word in arr : - short_word = word.replace(",","").replace(".","") - if short_word in dict: - suffix = "" - if "," in word: - suffix = "," - res.append(dict[short_word] + suffix) - else: - res.append(word) - return " ".join(res) - - -# découpage des adresses en plusieurs parties -def name_enterprise(affiliation) : - affiliation = affiliation.lower() - affiliations = affiliation.split(',') - return affiliations[0] - - -# Repérer le département dans l'affiliation -def num_dept(affiliation) : - res = re.findall('f-(\d{2})\d{3}',affiliation) - if len(res)==0 : - return None - return res[0] - -# requêtage de l'API pour les données filtrées -@on_exception(expo, RateLimitException, max_time=1) -@limits(calls=7, period=1) -def request(name, dept) : - url = "https://recherche-entreprises.api.gouv.fr/search?q=" + urllib.parse.quote(name) - if dept: - url += "&departement=" + dept - response = requests.get(url,headers={'Accept':'application/json'}) - return response.json() - -# gérer les réponses de l'API -def is_private_public(information): - if len(information)==0 or 'results' not in information or not information['results']: - return "Informations manquantes" - - # Parcourir chaque objet "results" extraire la valeur de "est_service_public" - est_service_public_list = [] - for result in information['results']: - complements = result.get('complements') - est_service_public = complements.get('est_service_public', None) - if est_service_public is not None: - est_service_public_list.append(est_service_public) - if True in est_service_public_list : - return "public" - return "private" - - # return est_service_public_list - -def publicOrPrivate(affiliation,my_dict): - privatePublicOrAffiliation = filter(affiliation) - if privatePublicOrAffiliation in ["private", "public"]: - adress = affiliation.lower().split(",") - enterprise = adress[0] - return "organisme: "+enterprise+", statut: "+privatePublicOrAffiliation - - expanded_affiliation = expandAbbreviations(affiliation,my_dict) - - name = name_enterprise(expanded_affiliation) - dept = num_dept(expanded_affiliation) - - information = request(name, dept) - nature = is_private_public(information) - - return "organisme: "+name+", statut: "+nature - - -def main(): - my_dict = request_abreviation("http://mapping-tables-1.daf.intra.inist.fr/affiliations-tools-corporate.json") - for line in sys.stdin: - data = json.loads(line) - texte = data["value"] - - data["value"] = publicOrPrivate(texte,my_dict) - - sys.stdout.write(json.dumps(data)) - sys.stdout.write("\n") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/affiliations-tools/v1/corporate/test.jsonl b/affiliations-tools/v1/corporate/test.jsonl deleted file mode 100644 index c8532c5..0000000 --- a/affiliations-tools/v1/corporate/test.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"id": 2, "value": "AiryLab SARL, 34 Rue Jean Baptiste Malon, F-04800 Greoux Les Bains, France"} \ No newline at end of file diff --git a/authors-tools/requirements.txt b/authors-tools/requirements.txt index 1de307f..3341329 100644 --- a/authors-tools/requirements.txt +++ b/authors-tools/requirements.txt @@ -1,4 +1,6 @@ pandas requests plac -unidecode \ No newline at end of file +unidecode +ratelimit +backoff \ No newline at end of file diff --git a/authors-tools/v1/corporate/abreviations.py b/authors-tools/v1/corporate/abreviations.py new file mode 100644 index 0000000..e9addc4 --- /dev/null +++ b/authors-tools/v1/corporate/abreviations.py @@ -0,0 +1,29 @@ +import csv +import json +import pickle + +# fonction : création du dictionnaire depuis un CSV : colonne 1 = key et colonne 2 = value +def transform_to_dict(csv_file): + my_dict = {} + with open(csv_file, 'r') as file: + f = csv.reader(file, delimiter='\t') + + for row in f: + key = row[0].lower().strip() + value = row[1].lower().strip() + my_dict[key] = value + + return my_dict + +# fonction : sauvegarde du dictionnaire dans un json +def save_dict_to_json(my_dict,my_json) : + with open(my_json, 'w') as fichier: + json.dump(my_dict, fichier) + +# appel des fonctions +fichier_csv = 'abreviations.csv' +resultat = transform_to_dict(fichier_csv) + +fichier_json = 'abreviations.json' +save_dict_to_json(resultat, fichier_json) + diff --git a/authors-tools/v1/corporate/private-public.ini b/authors-tools/v1/corporate/private-public.ini new file mode 100644 index 0000000..41ef781 --- /dev/null +++ b/authors-tools/v1/corporate/private-public.ini @@ -0,0 +1,35 @@ +# OpenAPI Documentation - JSON format (dot notation) +post.responses.default.description = Return all objects with enrich fields +post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.summary = Enrich one field of each Object with a Python function +post.requestBody.required = true +post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.parameters.0.in = query +post.parameters.0.name = path +post.parameters.0.schema.type = string +post.parameters.0.description = The path in each object to enrich with an Python script +post.parameters.1.in = query +post.parameters.1.name = indent +post.parameters.1.schema.type = boolean +post.parameters.1.description = Indent or not the JSON Result + +[use] +plugin = @ezs/spawn +plugin = @ezs/basics +plugin = @ezs/analytics + +[JSONParse] +separator = * + +[expand] +path = env('path', 'value') +size = 100 +# in production mode, uncomment the following line +# cache = boost + +[expand/exec] +# command should be executable ! +command = ./v1/corporate/private-public.py + +[dump] +indent = env('indent', false) diff --git a/authors-tools/v1/corporate/private-public.py b/authors-tools/v1/corporate/private-public.py new file mode 100755 index 0000000..36c1469 --- /dev/null +++ b/authors-tools/v1/corporate/private-public.py @@ -0,0 +1,119 @@ +#!/usr/bin/python3 + +import json +import sys +import re +import urllib.parse +import requests +from ratelimit import limits, RateLimitException +from backoff import on_exception, expo + +# Filtrage par mot clé pour ne garder que l'essentiel +def filter(affiliation) : + affiliation_lower = affiliation.lower() + adress = affiliation_lower.replace(",", "") + words = adress.split(" ") + private = ["sas", "sarl", "sa", "private", "edf", "orange"] + public = ["univ", "hop", "uar", "umr", "cea", "cnrs"] + for word in words : + if word in private : + return "private" + elif word in public : + return "public" + return adress + +# requête pour récupérer le fichier json +def request_abreviation(url): + response = requests.get(url) + return response.json() + +# Normaliser les abréviations +def expandAbbreviations(affiliation,dict): + affiliation = affiliation.lower() + arr = affiliation.split(" ") + res = [] + for word in arr : + short_word = word.replace(",","").replace(".","") + if short_word in dict: + suffix = "" + if "," in word: + suffix = "," + res.append(dict[short_word] + suffix) + else: + res.append(word) + return " ".join(res) + + +# découpage des adresses en plusieurs parties +def name_enterprise(affiliation) : + affiliation = affiliation.lower() + affiliations = affiliation.split(',') + return affiliations[0] + + +# Repérer le département dans l'affiliation +def num_dept(affiliation) : + res = re.findall('f-(\d{2})\d{3}',affiliation) + if len(res)==0 : + return None + return res[0] + +# requêtage de l'API pour les données filtrées +@on_exception(expo, RateLimitException, max_time=1) +@limits(calls=7, period=1) +def request(name, dept) : + url = "https://recherche-entreprises.api.gouv.fr/search?q=" + urllib.parse.quote(name) + if dept: + url += "&departement=" + dept + response = requests.get(url,headers={'Accept':'application/json'}) + return response.json() + +# gérer les réponses de l'API +def is_private_public(information): + if len(information)==0 or 'results' not in information or not information['results']: + return "Informations manquantes" + + # Parcourir chaque objet "results" extraire la valeur de "est_service_public" + est_service_public_list = [] + for result in information['results']: + complements = result.get('complements') + est_service_public = complements.get('est_service_public', None) + if est_service_public is not None: + est_service_public_list.append(est_service_public) + if True in est_service_public_list : + return "public" + return "private" + + # return est_service_public_list + +def publicOrPrivate(affiliation,my_dict): + privatePublicOrAffiliation = filter(affiliation) + if privatePublicOrAffiliation in ["private", "public"]: + adress = affiliation.lower().split(",") + enterprise = adress[0] + return "organisme: "+enterprise+", statut: "+privatePublicOrAffiliation + + expanded_affiliation = expandAbbreviations(affiliation,my_dict) + + name = name_enterprise(expanded_affiliation) + dept = num_dept(expanded_affiliation) + + information = request(name, dept) + nature = is_private_public(information) + + return "organisme: "+name+", statut: "+nature + + +def main(): + my_dict = request_abreviation("http://mapping-tables.daf.intra.inist.fr/affiliations-tools-corporate.json") + for line in sys.stdin: + data = json.loads(line) + texte = data["value"] + + data["value"] = publicOrPrivate(texte,my_dict) + + sys.stdout.write(json.dumps(data)) + sys.stdout.write("\n") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/authors-tools/v1/corporate/test.jsonl b/authors-tools/v1/corporate/test.jsonl new file mode 100644 index 0000000..c8532c5 --- /dev/null +++ b/authors-tools/v1/corporate/test.jsonl @@ -0,0 +1 @@ +{"id": 2, "value": "AiryLab SARL, 34 Rue Jean Baptiste Malon, F-04800 Greoux Les Bains, France"} \ No newline at end of file