"""Map free-text hospital affiliations to a canonical organisation name.

Reads one JSON object per line on stdin (``{"id": ..., "value": <affiliation>}``),
replaces ``value`` with the best matching entry of the ``Orga NonCnrs Acorriger``
column of ``hospital_affiliations.xlsx`` (or ``"N.C"`` when nothing matches) and
writes the object back to stdout, one JSON object per line.

Companion files shipped next to this script (``hospital-affiliations/``):

* ``.gitignore`` -- ignores the ``env/`` virtual environment.
* ``hospital_affiliations.xlsx`` -- the reference table (binary).
* ``input_data.txt`` -- sample input (samples 3 to 7 listed at the end of this file).
* ``requirements.txt`` -- fuzzywuzzy, numpy, pandas, nltk, openpyxl.
"""
import json
import sys
import unicodedata

import pandas as pd
from fuzzywuzzy import fuzz
from nltk.tokenize import word_tokenize  # noqa: F401  (unused; kept from the original imports)
from numpy import column_stack  # noqa: F401  (unused; kept from the original imports)

# Kept from the original script: silences pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

# Value returned when no organisation can be determined.
NOT_FOUND = "N.C"

# Reference workbook, resolved relative to the current working directory.
REFERENCE_WORKBOOK = "hospital_affiliations.xlsx"

# Lower-case acronyms / name fragments that mark an affiliation as a hospital.
# Trailing spaces and commas are significant ("chu " must not match "chue...").
AFFILIATION_TERMS = (
    "chr", "ghu", "chu ", "chu,", "chru", "ap hp", "ap hm", "aphp", "aphm",
    "hosp", "hop ", "hop,", " serv", "clcc", "ctr lutte canc",
    "antoine lacassagne", "eugene marquis", "baclesse",
    "georges françois leclerc", "gf leclerc", "jf leclerc",
    "henri becquerel", "h becquerel", "jean perrin", "leon berard",
    "ctr oscar lambret", "oscar lambret comprehens", "oscar lambret canc",
    "clcc oscar lambret", "ctr lutte canc oscar lambret", "gustave roussy",
    "bergonie", "inst curie", "curie inst", "rene gauducheau",
    "inst cancerol ouest", "inst cancerol lorraine", "alexis vautrin",
    "inst reg canc montpellier", "icans", "paul strauss", "paul str",
    "paoli calmettes", "inst j paoli i calmettes", "claudius regaud",
    "claudius rigaud", "inst univ canc toulouse oncopole",
    "inst univ cancerol oncopole", "iuct oncopole", "ctr lutte contre canc",
    "ico canc", "icm, montpellier canc inst",
    "univ inst canc toulouse oncopole", "rothschild",
)


def terme_aff(texte, liste=None):
    """Return True when *texte* contains, case-insensitively, a term of *liste*.

    *liste* is expected to hold lower-case terms. A missing or empty *liste*,
    or a non-string *texte* (e.g. an empty spreadsheet cell), yields False.
    """
    if not liste or not isinstance(texte, str):
        return False
    lowered = texte.lower()  # computed once instead of once per term
    return any(term in lowered for term in liste)


def correction(texte):
    """Normalise a city label so it can be searched in an ASCII affiliation.

    Keeps only what precedes the first ``" -"``, turns hyphens into spaces,
    abbreviates ``Saint`` to ``St`` and strips every diacritic (the original
    hand-written list covered only a few accented letters). Non-string
    values are returned unchanged. The function is idempotent.
    """
    if not isinstance(texte, str):
        return texte
    label = texte.split(" -", 1)[0]
    label = label.replace("-", " ").replace("Saint", "St")
    # Canonical decomposition separates base letters from their accents;
    # dropping the combining marks leaves the bare letters.
    decomposed = unicodedata.normalize("NFD", label)
    bare = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unicodedata.normalize("NFC", bare)


def aff_ville(ville, texte):
    """Return True when the city *ville* occurs, case-insensitively, in *texte*.

    A blank or non-string city never matches: an empty string is a substring
    of every text and would otherwise select every row.
    """
    if not isinstance(ville, str) or not isinstance(texte, str):
        return False
    if not ville.strip():
        return False
    return ville.lower() in texte.lower()


def fuzzywuzzy(affiliation, text=None):
    """Return the ``fuzz.ratio`` similarity score between *affiliation* and *text*."""
    return fuzz.ratio(affiliation, text)


def _prepare_reference(df):
    """Add, once per frame, the helper columns that ``base`` relies on.

    Works in place, like the original script: ``Ville_canonique_Dpt`` is
    normalised with ``correction`` and a boolean ``Terme Affiliation`` column
    flags the rows whose ``Affiliation`` contains a hospital term. The
    presence of ``Terme Affiliation`` marks the frame as already prepared,
    so pass a fresh frame after editing the reference data.
    """
    if "Terme Affiliation" in df.columns:
        return
    df["Ville_canonique_Dpt"] = df["Ville_canonique_Dpt"].apply(correction)
    # Written last so that a failure above leaves the frame unmarked.
    df["Terme Affiliation"] = df["Affiliation"].apply(terme_aff, liste=AFFILIATION_TERMS)


def base(texte, df):
    """Return the canonical organisation for the affiliation *texte*.

    *df* is the reference table with the columns ``Affiliation``,
    ``Ville_canonique_Dpt`` and ``Orga NonCnrs Acorriger``.

    Steps:
      1. *texte* must contain one of ``AFFILIATION_TERMS``;
      2. keep the reference rows whose own affiliation contains such a term;
      3. keep, among those, the rows whose city occurs in *texte*;
      4. return the organisation of the row whose affiliation is the most
         similar to *texte* (first row wins on ties).

    ``"N.C"`` is returned whenever a step leaves nothing to choose from, or
    when the selected organisation cell is empty.
    """
    if not isinstance(texte, str) or not terme_aff(texte, AFFILIATION_TERMS):
        return NOT_FOUND

    _prepare_reference(df)

    # Boolean masks instead of new columns on filtered copies.
    candidates = df[df["Terme Affiliation"].astype(bool)]
    in_city = candidates["Ville_canonique_Dpt"].apply(aff_ville, texte=texte)
    matches = candidates[in_city.astype(bool)]
    if matches.empty:
        return NOT_FOUND

    scores = matches["Affiliation"].apply(fuzzywuzzy, text=texte)
    orga = matches.loc[scores.idxmax(), "Orga NonCnrs Acorriger"]
    # An empty cell is read as NaN, which json.dumps would emit as invalid JSON.
    if not isinstance(orga, str) or not orga:
        return NOT_FOUND
    return orga


def main():
    """Stream JSON lines from stdin to stdout, rewriting each ``value``."""
    df = pd.read_excel(REFERENCE_WORKBOOK)
    for line in sys.stdin:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline
        data = json.loads(line)
        data["value"] = base(data["value"], df)
        sys.stdout.write(json.dumps(data) + "\n")


if __name__ == "__main__":
    main()


# Sample input lines (one JSON object per line on stdin):
# {"id": "1","value": "CHRU Hautepierre Strasbourg, Lab Anat Pathol, 1 Ave Moliere, F-67098 Strasbourg, France"}
# {"id" :"2","value": "AP HP, Paris Ctr Necker Cochin, Clin Res Unit, Paris, France"}
# {"id" :"3","value": "Hop La Pitie Salpetriere, AP HP, Serv Med Interne, Ctr Natl Reference Histiocytoses, Paris, France"}
# {"id" :"4","value": "Hop Necker Enfants Malad, Gen Pediat Dept, Paris, France"}
# {"id" :"5","value": "Bergonie Inst, Dept Radiat Oncol, Bordeaux, France"}
# {"id" :"6","value": "CHU Besancon, Serv Neurol, 2 Blvd Fleming, F-25030 Besancon, France"}
# {"id" :"7","value": "Univ Bordeaux, INCIA, CNRS UMR 5287, Equipe Neuropsychopharmacol Addict, BP31,146 Rue Leo Saignat, F-33076 Bordeaux, France"}
#
# Additional samples used for manual checks:
# {"id" :"5","value": "Hop Charpennes, 27 Rue G Peri, F-69100 Villeurbanne, France"}
# {"id" :"6","value": "Fdn Ophtalmol Rothschild, ERT TREAT Vis, 25 Rue Manin, F-75019 Paris, France"}
# {"id" :"7","value": "Ctr Hosp, Mt De Marsan, France"}
# {"id": "8", "value":"CHU Pointe A Pitre, Serv Gynecol Obstet, F-97159 Pointe A Pitre, Guadeloupe, France"}