diff --git a/hospital-affiliations/v1/aff_hosp.py b/hospital-affiliations/v1/aff_hosp.py index 47a22f6..3667ece 100755 --- a/hospital-affiliations/v1/aff_hosp.py +++ b/hospital-affiliations/v1/aff_hosp.py @@ -33,8 +33,8 @@ # correction of the cities: convert acronyms def convert_acronyms(city_name): - stripped = city_name.replace("Saint", "St") - stripped = stripped.replace("Mont", "Mt") + stripped = city_name.replace("Saint ", "St ") + stripped = stripped.replace("Mont ", "Mt ") return stripped @@ -55,33 +55,27 @@ def get_corresponding_hospital_from_affiliation(affiliation): - affiliations_dataframe = pd.read_csv("hospital_affiliations.csv", sep=";") + affiliations_dataframe = pd.read_csv("hospital_affiliation_normalize.csv", sep=";") acronyms = config.acronyms - hospital = "N.C" + hospital = "n/a" affiliation = affiliation.lower() + affiliation = remove_accents(affiliation) + for acronym in acronyms: if acronym not in affiliation: continue - # standarize original dataframe - affiliations_dataframe["contains_acronyms"] = affiliations_dataframe["Affiliation"].apply(is_hospital_affiliation) - affiliations_dataframe["standardized_city"] = affiliations_dataframe["Ville_canonique_Dpt"].apply(remove_department_numbers).apply(convert_acronyms).apply(remove_accents) - - acronyms_dataframe = affiliations_dataframe[affiliations_dataframe["contains_acronyms"] == True] # noqa: E712 - if len(acronyms_dataframe) == 0: # noqa: E712 - continue - # create new dataframe only with affiliations that contain acronyms - acronyms_dataframe["city_in_affiliations"] = acronyms_dataframe["standardized_city"].apply(is_city_in_affiliation, affiliation=affiliation) + affiliations_dataframe["city_in_affiliations"] = affiliations_dataframe["standardized_city"].apply(is_city_in_affiliation, affiliation=affiliation) - ancronyms_cities_dataframe = acronyms_dataframe[acronyms_dataframe["city_in_affiliations"] == True] # noqa: E712 - if len(ancronyms_cities_dataframe) == 0: + acronyms_cities_dataframe = affiliations_dataframe[affiliations_dataframe["city_in_affiliations"] == True] # noqa: E712 + if len(acronyms_cities_dataframe) == 0: continue # create new dataframe only with affiliations that contain acronyms and cities - ancronyms_cities_dataframe["ratio"] = ancronyms_cities_dataframe["Affiliation"].apply(affiliations_match_ratio, second_affiliation=affiliation) - hospital = ancronyms_cities_dataframe["Orga NonCnrs Acorriger"][ancronyms_cities_dataframe["ratio"].idxmax()] + acronyms_cities_dataframe["ratio"] = acronyms_cities_dataframe["Affiliation"].apply(affiliations_match_ratio, second_affiliation=affiliation) + hospital = acronyms_cities_dataframe["standardized_hospital"][acronyms_cities_dataframe["ratio"].idxmax()] break return hospital