diff --git a/affiliations-tools/v1/corporate/abreviations.csv b/affiliations-tools/v1/corporate/abreviations.csv new file mode 100644 index 0000000..1b02f4b --- /dev/null +++ b/affiliations-tools/v1/corporate/abreviations.csv @@ -0,0 +1,108 @@ +Acad Academie +Accid Accident +Adm Administrative +Aerosp Aerospaciale +Agr Agricole +AFB Air Force Base +AIDS Acquired Immuno Deficiency Syndrome +Amer American +Anal Analysis +Anat Anatomie +Anim Animal +Arthrit Arthritis +Assoc Association +Ave Avenue +Biochem Biochemistry +Biol Biologie +Bot Botanique +Blvd Boulevard +Bur Bureau +Canc Cancer +Ctr Centre +Chim Chimie +Chirurg Chirurgie +Coll Collège +Com Comite +Commun Communication +Co Compagnie +Conf Conference +Def Defense +Dept Departement +Dev Developpement +Diabet Diabetes +Diag Diagnosis +Div Division +Educ Education +Elect Electrique +Environm Environnement +Etab Etablissement +Etud Etudes +Expt Experimentation +Fac Faculte +Fed Federal +Ft Fort +Fdn Fondation +Gen General +Govt Gouvernement +Grp Groupe +Hist Histoire +Hop Hopital +Hort Horticulture +Hosp Hospital +Hyg Hygiene +Ind Industrie +Infect Infectieuse +Informat Informatique +Inst Institut +Int International +Intro Introduction +Invest Investigation +Jr Junior +Lab Laboratoire +Lect Lecture +Lib Librairie +Malad Maladie +Mat Materiel +Math Mathematique +Mech Mecanique +Med Medicine +Mem Memorial +Metab Metabolique +Met Metallurgie +Mil Militaire +Minist Ministre +Mol Moleculaire +Mt Mont +Nat Nature +Nav Navigation +Nucl Nucleaire +Nutr Nutrition +Observ Observatoire +Obstet Obstetrique +Off Office +Org Organisation +Pediat Pediatrique +Pharm Pharmacie +Phys Physique +Pl Place +Prod Production +Prov Province, Provincial +Psychiat Psychiatrique +Rech Recherche +Rehabil Rehabilitation +Reprod Reproduction +Sanit Sanitaire +Sci Science +Sect Section +Serv Service +Soc Societe +Stn Station +Stat Statistique +Struct Structure +Syst Systeme +Tech Technique +Tel Telephone +Temp Temperature +Terr Territoire +Text 
Textile
+Univ	Universite
diff --git a/affiliations-tools/v1/corporate/abreviations.pickle b/affiliations-tools/v1/corporate/abreviations.pickle
new file mode 100644
index 0000000..36dc6b7
--- /dev/null
+++ b/affiliations-tools/v1/corporate/abreviations.pickle
Binary files differ
diff --git a/affiliations-tools/v1/corporate/abreviations.py b/affiliations-tools/v1/corporate/abreviations.py
new file mode 100644
index 0000000..1441d5d
--- /dev/null
+++ b/affiliations-tools/v1/corporate/abreviations.py
@@ -0,0 +1,40 @@
+"""Build the abbreviation table used by private_public.py.
+
+Reads the tab-separated file abreviations.csv (column 1 = key, column 2 =
+value) and saves the resulting dictionary to abreviations.pickle.
+"""
+import csv
+import pickle
+
+
+def transform_to_dict(csv_file):
+    """Build a dictionary from a tab-separated file.
+
+    Column 1 is the key and column 2 the value; both are lower-cased and
+    stripped. Rows with fewer than two columns (blank lines, for example)
+    are skipped. If a key occurs more than once, the last row wins.
+    """
+    my_dict = {}
+    # Read as UTF-8 explicitly: the table contains accented characters and
+    # the platform default encoding is not always UTF-8. newline='' is the
+    # mode the csv module asks for.
+    with open(csv_file, 'r', encoding='utf-8', newline='') as file:
+        for row in csv.reader(file, delimiter='\t'):
+            if len(row) < 2:
+                continue
+            my_dict[row[0].lower().strip()] = row[1].lower().strip()
+    return my_dict
+
+
+def save_dict_to_pickle(my_dict, my_pickel):
+    """Write my_dict to the file my_pickel using pickle."""
+    with open(my_pickel, 'wb') as file:
+        pickle.dump(my_dict, file)
+
+
+# Script body: rebuild the pickle from the CSV.
+fichier_csv = 'abreviations.csv'
+resultat = transform_to_dict(fichier_csv)
+
+fichier_pickle = 'abreviations.pickle'
+save_dict_to_pickle(resultat, fichier_pickle)
diff --git a/affiliations-tools/v1/corporate/private_public.py b/affiliations-tools/v1/corporate/private_public.py
new file mode 100644
index 0000000..7ac6f68
--- /dev/null
+++ b/affiliations-tools/v1/corporate/private_public.py
@@ -0,0 +1,163 @@
+"""Tell whether the organisation named in an affiliation is private or public.
+
+Reads one JSON object per line on stdin, replaces its "value" (an
+affiliation string) with "organisme: <name>, statut: <status>" and writes
+the object to stdout, one per line.
+"""
+import json
+import sys
+import pickle
+import re
+import urllib.parse
+import requests
+from ratelimit import limits, RateLimitException
+from backoff import on_exception, expo
+
+# Words that decide the status on their own, without any API call.
+PRIVATE_KEYWORDS = frozenset(["sas", "sarl", "sa", "private", "edf", "orange"])
+PUBLIC_KEYWORDS = frozenset(["univ", "hop", "uar", "umr", "cea", "cnrs"])
+
+# "f-" followed by five digits; the first two are captured and used as the
+# department. Lower-case "f" because the text is lower-cased before matching.
+DEPT_PATTERN = re.compile(r'f-(\d{2})\d{3}')
+
+SEARCH_URL = "https://recherche-entreprises.api.gouv.fr/search?q="
+
+# Seconds before an API request is abandoned (without it, requests waits
+# indefinitely on an unresponsive server).
+REQUEST_TIMEOUT = 10
+
+# Status returned when the API gives nothing usable.
+MISSING_INFORMATION = "Informations manquantes"
+
+
+def filter(affiliation):
+    """Classify an affiliation from keywords only.
+
+    Returns "private" or "public" as soon as one space-separated word of
+    the lower-cased, comma-free affiliation is a known keyword. Otherwise
+    returns the lower-cased, comma-free affiliation itself.
+
+    The name shadows the builtin filter(); it is kept for existing callers.
+    """
+    adress = affiliation.lower().replace(",", "")
+    for word in adress.split(" "):
+        if word in PRIVATE_KEYWORDS:
+            return "private"
+        if word in PUBLIC_KEYWORDS:
+            return "public"
+    return adress
+
+
+def expandAbbreviations(affiliation, dict):
+    """Replace the abbreviations found in dict by their expansion.
+
+    The affiliation is lower-cased and split on spaces. Each word is looked
+    up without its commas and periods; when it is expanded, a comma is put
+    back after the expansion (a period is not). The parameter name shadows
+    the builtin dict; it is kept for existing callers.
+    """
+    res = []
+    for word in affiliation.lower().split(" "):
+        short_word = word.replace(",", "").replace(".", "")
+        if short_word in dict:
+            suffix = "," if "," in word else ""
+            res.append(dict[short_word] + suffix)
+        else:
+            res.append(word)
+    return " ".join(res)
+
+
+def name_enterprise(affiliation):
+    """Return the lower-cased text that precedes the first comma."""
+    return affiliation.lower().split(',')[0]
+
+
+def num_dept(affiliation):
+    """Return the department digits of the first DEPT_PATTERN match, or None."""
+    match = DEPT_PATTERN.search(affiliation)
+    return match.group(1) if match else None
+
+
+@on_exception(expo, RateLimitException, max_time=1)
+@limits(calls=7, period=1)
+def request(name, dept):
+    """Search the recherche-entreprises API and return the decoded JSON.
+
+    name is URL-quoted into the q parameter; dept, when truthy, is added as
+    the departement parameter.
+    """
+    url = SEARCH_URL + urllib.parse.quote(name)
+    if dept:
+        url += "&departement=" + dept
+    response = requests.get(
+        url,
+        headers={'Accept': 'application/json'},
+        timeout=REQUEST_TIMEOUT,
+    )
+    return response.json()
+
+
+def is_private_public(information):
+    """Turn an API response into "public", "private" or a missing marker.
+
+    Returns "Informations manquantes" when the response is empty or has no
+    results, "public" when at least one result has
+    complements.est_service_public set to True, and "private" otherwise.
+    """
+    if not information or 'results' not in information or not information['results']:
+        return MISSING_INFORMATION
+
+    flags = []
+    for result in information['results']:
+        # get() gives None when "complements" is missing or null; fall back
+        # to an empty dict so the next lookup cannot raise AttributeError.
+        complements = result.get('complements') or {}
+        flags.append(complements.get('est_service_public'))
+    return "public" if True in flags else "private"
+
+
+def publicOrPrivate(affiliation, my_dict):
+    """Return "organisme: <name>, statut: <status>" for an affiliation.
+
+    Keywords are tried first (see filter). Otherwise abbreviations are
+    expanded with my_dict and the API is searched with the organisation
+    name and, when found, the department. A failed API call is reported on
+    stderr and yields the "Informations manquantes" status instead of
+    stopping the whole run.
+    """
+    status = filter(affiliation)
+    if status in ("private", "public"):
+        return "organisme: " + name_enterprise(affiliation) + ", statut: " + status
+
+    expanded_affiliation = expandAbbreviations(affiliation, my_dict)
+    name = name_enterprise(expanded_affiliation)
+    dept = num_dept(expanded_affiliation)
+
+    try:
+        information = request(name, dept)
+    except (requests.RequestException, RateLimitException, ValueError) as error:
+        # ValueError covers a response body that is not valid JSON.
+        sys.stderr.write("API lookup failed for %r: %s\n" % (name, error))
+        nature = MISSING_INFORMATION
+    else:
+        nature = is_private_public(information)
+
+    return "organisme: " + name + ", statut: " + nature
+
+
+def main():
+    """Load the abbreviation table, then process stdin line by line."""
+    # NOTE(review): pickle.load can run arbitrary code; this file must only
+    # ever be the one produced by abreviations.py.
+    with open('abreviations.pickle', 'rb') as handle:
+        my_dict = pickle.load(handle)
+    for line in sys.stdin:
+        data = json.loads(line)
+        data["value"] = publicOrPrivate(data["value"], my_dict)
+        sys.stdout.write(json.dumps(data))
+        sys.stdout.write("\n")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/affiliations-tools/v1/corporate/requirements.txt b/affiliations-tools/v1/corporate/requirements.txt
new file mode 100644
index 0000000..7061df9
--- /dev/null
+++ b/affiliations-tools/v1/corporate/requirements.txt
@@ -0,0 +1,3 @@
+ratelimit
+backoff
+requests
\ No newline at end of file