diff --git a/hospital-affiliations/aff_hosp.py b/hospital-affiliations/aff_hosp.py index f280ccc..1d5ed2e 100755 --- a/hospital-affiliations/aff_hosp.py +++ b/hospital-affiliations/aff_hosp.py @@ -19,7 +19,7 @@ #correction of the cities : remove the department numbers and convert acronyms def remove_department_numbers_and_convert_acronyms(city_name) : # department numbers: Grenoble - 38 => Grenoble - sep = ' -' + sep = " -" stripped = city_name.split(sep, 1)[0] stripped = stripped.replace("-"," ") @@ -32,7 +32,7 @@ # correction of the cities : remove accents def remove_accents(city_name) : normalized_text = unicodedata.normalize("NFD", city_name) - text_with_no_accent = re.sub("[\u0300-\u036f]", '', normalized_text) + text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text) return text_with_no_accent def is_city_in_affiliation(city,affiliation): @@ -42,7 +42,7 @@ return fuzz.ratio(first_affiliation,second_affiliation) def get_corresponding_hospital_from_affiliation(affiliation): - affiliations_dataframe = pd.read_csv('hospital_affiliations.csv', sep=";") + affiliations_dataframe = pd.read_csv("hospital_affiliations.csv", sep=";") acronyms = ["chr","chr ","chu ","chu,","chru","ap hp","ap hm","aphp","aphm","hosp","hop ","hop,"," serv","clcc","ctr lutte canc","antoine lacassagne","eugene marquis","baclesse", "georges françois leclerc","gf leclerc","jf leclerc","henri becquerel","h becquerel","jean perrin","leon berard","ctr oscar lambret", "oscar lambret comprehens","oscar lambret canc","clcc oscar lambret","ctr lutte canc oscar lambret","gustave roussy","bergonie", @@ -60,12 +60,12 @@ affiliations_dataframe["contains_acronyms"] = affiliations_dataframe["Affiliation"].apply(is_hospital_affiliation,list=acronyms) affiliations_dataframe["standardized_city"] = affiliations_dataframe["Ville_canonique_Dpt"].apply(remove_department_numbers_and_convert_acronyms).apply(remove_accents) - if len(affiliations_dataframe[affiliations_dataframe['contains_acronyms'] == True]) == 0: + if len(affiliations_dataframe[affiliations_dataframe["contains_acronyms"] == True]) == 0: continue # create new dataframe only with affiliations that contain acronyms - dataframe_with_acronyms = affiliations_dataframe[affiliations_dataframe['contains_acronyms'] == True] - dataframe_with_acronyms["city_in_affiliations"] = dataframe_with_acronyms['standardized_city'].apply(is_city_in_affiliation,affiliation=affiliation) + dataframe_with_acronyms = affiliations_dataframe[affiliations_dataframe["contains_acronyms"] == True] + dataframe_with_acronyms["city_in_affiliations"] = dataframe_with_acronyms["standardized_city"].apply(is_city_in_affiliation,affiliation=affiliation) if len(dataframe_with_acronyms[dataframe_with_acronyms["city_in_affiliations"] == True]) == 0: continue @@ -79,9 +79,13 @@ return hospital -for line in sys.stdin: - data = json.loads(line) - texte=data['value'] - data['value']=get_corresponding_hospital_from_affiliation(texte) - sys.stdout.write(json.dumps(data)) - sys.stdout.write('\n') +def main(): + for line in sys.stdin: + data = json.loads(line) + texte=data["value"] + data["value"]=get_corresponding_hospital_from_affiliation(texte) + sys.stdout.write(json.dumps(data)) + sys.stdout.write("\n") + +if __name__ == "__main__": + main() \ No newline at end of file