"""Map free-text affiliations read from stdin to a reference hospital name.

Each stdin line is a JSON object with a "value" key holding an affiliation
string.  The value is replaced by the matching entry of the
"Orga NonCnrs Acorriger" column of ``hospital_affiliations.csv`` (or "N.C"
when nothing matches) and the object is written back to stdout, one JSON
document per line.
"""

import functools
import json
import re
import sys
import unicodedata

import pandas as pd
from fuzzywuzzy import fuzz

import config

pd.options.mode.chained_assignment = None

# Reference table of known hospital affiliations (semicolon-separated).
_REFERENCE_CSV = "hospital_affiliations.csv"

# Value returned when no hospital can be associated with an affiliation.
_NO_MATCH = "N.C"

# Unicode combining diacritical marks; they are what is left of an accent
# once a string has been NFD-normalised.
_COMBINING_MARKS = re.compile("[\u0300-\u036f]")


def is_hospital_affiliation(affiliation):
    """Return True if any entry of ``config.acronyms`` occurs in *affiliation*.

    The affiliation is lower-cased before the substring test; the acronyms
    are used exactly as they are stored in ``config.acronyms``.
    """
    affiliation_lower = affiliation.lower()
    return any(acronym in affiliation_lower for acronym in config.acronyms)


def remove_department_numbers_and_convert_acronyms(city_name):
    """Normalise a city label taken from the reference table.

    Everything from the first " -" onwards is dropped
    ("Grenoble - 38" => "Grenoble"), remaining hyphens become spaces, and
    "Saint" / "Mont" are abbreviated to "St" / "Mt".
    """
    # Department number: keep only what precedes the first " -".
    stripped = city_name.split(" -", 1)[0]
    stripped = stripped.replace("-", " ")
    # NOTE(review): these are plain substring replacements, so they also
    # rewrite longer words ("Montpellier" => "Mtpellier",
    # "Sainte" => "Ste"). Confirm this is intended before changing it.
    stripped = stripped.replace("Saint", "St")
    stripped = stripped.replace("Mont", "Mt")
    return stripped


def remove_accents(city_name):
    """Return *city_name* with its combining accent marks removed.

    The text is decomposed (NFD) so that accents become separate combining
    characters, which are then deleted.
    """
    decomposed = unicodedata.normalize("NFD", city_name)
    return _COMBINING_MARKS.sub("", decomposed)


def is_city_in_affiliation(city, affiliation):
    """Return True if *city* occurs in *affiliation*, ignoring case."""
    return city.lower() in affiliation.lower()


def affiliations_match_ratio(first_affiliation, second_affiliation):
    """Return the ``fuzz.ratio`` similarity score of the two affiliations."""
    return fuzz.ratio(first_affiliation, second_affiliation)


@functools.lru_cache(maxsize=1)
def _load_reference_table():
    """Read the reference CSV once and keep it for the process lifetime.

    The returned frame is shared between calls and must not be mutated.
    """
    return pd.read_csv(_REFERENCE_CSV, sep=";")


@functools.lru_cache(maxsize=1)
def _load_hospital_rows():
    """Return the reference rows whose "Affiliation" contains an acronym.

    The result carries two derived columns: "contains_acronyms" and
    "standardized_city" (the "Ville_canonique_Dpt" column without department
    number, with abbreviations applied and accents removed).  It depends
    only on the CSV and on ``config.acronyms``, so it is computed once.
    The returned frame is shared between calls and must not be mutated.
    """
    table = _load_reference_table()
    # ``assign`` builds a new frame, leaving the cached raw table untouched.
    standardized = table.assign(
        contains_acronyms=table["Affiliation"]
        .apply(is_hospital_affiliation)
        .astype(bool),
        standardized_city=table["Ville_canonique_Dpt"]
        .apply(remove_department_numbers_and_convert_acronyms)
        .apply(remove_accents),
    )
    return standardized[standardized["contains_acronyms"]]


def get_corresponding_hospital_from_affiliation(affiliation):
    """Return the reference hospital that best matches *affiliation*.

    The lower-cased affiliation must contain one of ``config.acronyms``;
    otherwise "N.C" is returned.  Among the reference rows that contain an
    acronym and whose standardised city occurs in the affiliation, the row
    whose "Affiliation" has the highest ``fuzz.ratio`` against the
    lower-cased input wins (the first one on ties), and its
    "Orga NonCnrs Acorriger" value is returned.  "N.C" is returned when no
    row qualifies.

    Args:
        affiliation: free-text affiliation string.

    Returns:
        The matching "Orga NonCnrs Acorriger" cell, or "N.C".

    Raises:
        Whatever ``pandas.read_csv`` raises if the reference file cannot be
        read.
    """
    # Touch the reference file first so that a missing or unreadable CSV is
    # reported on every call, even for inputs that contain no acronym.
    _load_reference_table()

    affiliation = affiliation.lower()
    if not any(acronym in affiliation for acronym in config.acronyms):
        return _NO_MATCH

    hospital_rows = _load_hospital_rows()

    # Keep the rows whose city is mentioned in the affiliation.  Masks are
    # used instead of adding columns so the cached frame is never modified.
    city_mask = hospital_rows["standardized_city"].apply(
        is_city_in_affiliation, affiliation=affiliation
    )
    candidates = hospital_rows[city_mask.astype(bool)]
    if candidates.empty:
        return _NO_MATCH

    # NOTE(review): the reference "Affiliation" keeps its original case while
    # the input was lower-cased above, and fuzz.ratio is case-sensitive.
    # Kept as-is to preserve existing results.
    ratios = candidates["Affiliation"].apply(
        affiliations_match_ratio, second_affiliation=affiliation
    )
    return candidates.loc[ratios.idxmax(), "Orga NonCnrs Acorriger"]


def main():
    """Rewrite the "value" field of every JSON line read from stdin."""
    for line in sys.stdin:
        data = json.loads(line)
        data["value"] = get_corresponding_hospital_from_affiliation(
            data["value"]
        )
        sys.stdout.write(json.dumps(data) + "\n")


if __name__ == "__main__":
    main()