diff --git a/hospital-affiliations/aff_hosp.py b/hospital-affiliations/aff_hosp.py index 525ecff..6fbb8ac 100755 --- a/hospital-affiliations/aff_hosp.py +++ b/hospital-affiliations/aff_hosp.py @@ -8,15 +8,13 @@ import re pd.options.mode.chained_assignment = None -## MODIFICATION DU FICHIER EXCEL -# On regarde si la liste des termes acronymes choisis est présente dans les affiliations -def terme_aff(texte, liste=None) : - for aff in liste : - if aff in texte.lower() : +# Check if an element of the list is in affiliation +def is_hospital_affiliation(affiliation, list) : + affiliation_lower = affiliation.lower() + for aff in list : + if aff in affiliation_lower : return True return False -# Il faut également que l'affiliation qu'on veut homogénéiser ait un terme de la liste - #correction of the cities : remove the department numbers and transformation into acronyms def remove_department_numbers_and_acronyms(city_name) : @@ -47,7 +45,7 @@ return fuzz.ratio(affiliation,text) def base(texte,df): - acro = ["chr","chr ","chu ","chu,","chru","ap hp","ap hm","aphp","aphm","hosp","hop ","hop,"," serv","clcc","ctr lutte canc","antoine lacassagne","eugene marquis","baclesse", + acronyms = ["chr","chr ","chu ","chu,","chru","ap hp","ap hm","aphp","aphm","hosp","hop ","hop,"," serv","clcc","ctr lutte canc","antoine lacassagne","eugene marquis","baclesse", "georges françois leclerc","gf leclerc","jf leclerc","henri becquerel","h becquerel","jean perrin","leon berard","ctr oscar lambret", "oscar lambret comprehens","oscar lambret canc","clcc oscar lambret","ctr lutte canc oscar lambret","gustave roussy","bergonie", "inst curie","curie inst","rene gauducheau","inst cancerol ouest","inst cancerol lorraine","alexis vautrin","inst reg canc montpellier", @@ -55,10 +53,10 @@ "inst univ cancerol oncopole","iuct oncopole","ctr lutte contre canc","ico canc","icm, montpellier canc inst","univ inst canc toulouse oncopole","rothschild"] Orga = "" - for ac in acro: + for ac in acronyms: if ac in texte.lower(): # APPEL DES FONCTIONS - df['Terme Affiliation'] = df["Affiliation"].apply(terme_aff,liste=acro) + df['Terme Affiliation'] = df["Affiliation"].apply(is_hospital_affiliation,list=acronyms) df['Ville_canonique_Dpt'] = df['Ville_canonique_Dpt'].apply(remove_department_numbers_and_acronyms) df['Ville_canonique_Dpt'] = df['Ville_canonique_Dpt'].apply(remove_accents) if len(df[df['Terme Affiliation'] == True]) != 0: