diff --git a/hospital-affiliations/aff_hosp.py b/hospital-affiliations/aff_hosp.py index 5c24ffb..8c6f614 100755 --- a/hospital-affiliations/aff_hosp.py +++ b/hospital-affiliations/aff_hosp.py @@ -16,8 +16,8 @@ return True return False -#correction of the cities : remove the department numbers and transformation into acronyms -def remove_department_numbers_and_acronyms(city_name) : +#correction of the cities : remove the department numbers and convert acronyms +def remove_department_numbers_and_convert_acronyms(city_name) : # department numbers: Grenoble - 38 => Grenoble sep = ' -' stripped = city_name.split(sep, 1)[0] @@ -41,7 +41,8 @@ def affiliations_match_ratio(first_affiliation,second_affiliation): return fuzz.ratio(first_affiliation,second_affiliation) -def base(texte,df): +def get_corresponding_hospital_from_affiliation(affiliation): + affiliations_dataframe = pd.read_csv('hospital_affiliations.csv', sep=";") acronyms = ["chr","chr ","chu ","chu,","chru","ap hp","ap hm","aphp","aphm","hosp","hop ","hop,"," serv","clcc","ctr lutte canc","antoine lacassagne","eugene marquis","baclesse", "georges françois leclerc","gf leclerc","jf leclerc","henri becquerel","h becquerel","jean perrin","leon berard","ctr oscar lambret", "oscar lambret comprehens","oscar lambret canc","clcc oscar lambret","ctr lutte canc oscar lambret","gustave roussy","bergonie", @@ -49,35 +50,34 @@ "icans","paul strauss","paul str","paoli calmettes","inst j paoli i calmettes","claudius regaud","claudius rigaud","inst univ canc toulouse oncopole", "inst univ cancerol oncopole","iuct oncopole","ctr lutte contre canc","ico canc","icm, montpellier canc inst","univ inst canc toulouse oncopole","rothschild"] - Orga = "" - for ac in acronyms: - if ac in texte.lower(): - # APPEL DES FONCTIONS - df['Terme Affiliation'] = df["Affiliation"].apply(is_hospital_affiliation,list=acronyms) - df['Ville_canonique_Dpt'] = df['Ville_canonique_Dpt'].apply(remove_department_numbers_and_acronyms) - 
df['Ville_canonique_Dpt'] = df['Ville_canonique_Dpt'].apply(remove_accents) - if len(df[df['Terme Affiliation'] == True]) != 0: - df2 = df[df['Terme Affiliation'] == True] - df2["Ville_présente"] = df2['Ville_canonique_Dpt'].apply(is_city_in_affiliation,affiliation=texte) - if len(df2[df2["Ville_présente"] == True]) != 0: - df3 = df2[df2["Ville_présente"] == True] - df3["ratio"] = df3["Affiliation"].apply(affiliations_match_ratio,second_affiliation=texte) - Orga = df3["Orga NonCnrs Acorriger"][df3["ratio"].idxmax()] + hospital = "" + affiliation = affiliation.lower() + for acronym in acronyms: + if acronym in affiliation : + # standarize original dataframe + affiliations_dataframe["contains_acronyms"] = affiliations_dataframe["Affiliation"].apply(is_hospital_affiliation,list=acronyms) + affiliations_dataframe["standardized_city"] = affiliations_dataframe["Ville_canonique_Dpt"].apply(remove_department_numbers_and_convert_acronyms).apply(remove_accents) + if len(affiliations_dataframe[affiliations_dataframe['contains_acronyms'] == True]) > 0: + # create new dataframe only with affiliations that contain acronyms + dataframe_with_acronyms = affiliations_dataframe[affiliations_dataframe['contains_acronyms'] == True] + dataframe_with_acronyms["city_in_affiliations"] = dataframe_with_acronyms['standardized_city'].apply(is_city_in_affiliation,affiliation=affiliation) + if len(dataframe_with_acronyms[dataframe_with_acronyms["city_in_affiliations"] == True]) != 0: + # create new dataframe only with affiliations that contain acronyms and cities + dataframe_with_acronyms_and_cities = dataframe_with_acronyms[dataframe_with_acronyms["city_in_affiliations"] == True] + dataframe_with_acronyms_and_cities["ratio"] = dataframe_with_acronyms_and_cities["Affiliation"].apply(affiliations_match_ratio,second_affiliation=affiliation) + hospital = dataframe_with_acronyms_and_cities["Orga NonCnrs Acorriger"][dataframe_with_acronyms_and_cities["ratio"].idxmax()] else: - Orga = "N.C" + 
hospital = "N.C" else: - Orga = "N.C" + hospital = "N.C" break - if Orga == "": - Orga = "N.C" - return Orga - -df = pd.read_csv('hospital_affiliations.csv', sep=";") - + if hospital == "": + hospital = "N.C" + return hospital for line in sys.stdin: data = json.loads(line) texte=data['value'] - data['value']=base(texte,df) + data['value']=get_corresponding_hospital_from_affiliation(texte) sys.stdout.write(json.dumps(data)) sys.stdout.write('\n')