diff --git a/.vscode/settings.json b/.vscode/settings.json index 5f3108a..1a12a8a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,5 @@ { - "markdown.extension.toc.updateOnSave": false + "markdown.extension.toc.updateOnSave": false, + "python.linting.enabled": true, + "python.linting.flake8Enabled": true } \ No newline at end of file diff --git a/hospital-affiliations/aff_hosp.py b/hospital-affiliations/aff_hosp.py index c45f3e6..2667c57 100755 --- a/hospital-affiliations/aff_hosp.py +++ b/hospital-affiliations/aff_hosp.py @@ -1,3 +1,5 @@ +# flake8: noqa E501 (line length) + import pandas as pd from fuzzywuzzy import fuzz import sys @@ -5,40 +7,50 @@ import unicodedata import re import config -pd.options.mode.chained_assignment = None +pd.options.mode.chained_assignment = None + # Check if an element of the list is in affiliation -def is_hospital_affiliation(affiliation) : +def is_hospital_affiliation(affiliation): affiliation_lower = affiliation.lower() - for aff in config.acronyms : - if aff in affiliation_lower : + for aff in config.acronyms: + if aff in affiliation_lower: return True return False -#correction of the cities : remove the department numbers and convert acronyms -def remove_department_numbers_and_convert_acronyms(city_name) : - # department numbers: Grenoble - 38 => Grenoble + +# correction of the cities : remove the department numbers +def remove_department_numbers(city_name): + # Grenoble - 38 => Grenoble sep = " -" stripped = city_name.split(sep, 1)[0] - stripped = stripped.replace("-"," ") - - # acronyms - stripped = stripped.replace("Saint","St") - stripped = stripped.replace("Mont","Mt") + stripped = stripped.replace("-", " ") return stripped + +# correction of the cities: convert acronyms +def convert_acronyms(city_name): + stripped = city_name.replace("Saint", "St") + stripped = stripped.replace("Mont", "Mt") + + return stripped + + # correction of the cities : remove accents -def remove_accents(city_name) : +def remove_accents(city_name): normalized_text = unicodedata.normalize("NFD", city_name) text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text) return text_with_no_accent -def is_city_in_affiliation(city,affiliation): + +def is_city_in_affiliation(city, affiliation): return city.lower() in affiliation.lower() -def affiliations_match_ratio(first_affiliation,second_affiliation): - return fuzz.ratio(first_affiliation,second_affiliation) + +def affiliations_match_ratio(first_affiliation, second_affiliation): + return fuzz.ratio(first_affiliation, second_affiliation) + def get_corresponding_hospital_from_affiliation(affiliation): affiliations_dataframe = pd.read_csv("hospital_affiliations.csv", sep=";") @@ -47,39 +59,40 @@ hospital = "N.C" affiliation = affiliation.lower() for acronym in acronyms: - if acronym not in affiliation : + if acronym not in affiliation: continue # standarize original dataframe affiliations_dataframe["contains_acronyms"] = affiliations_dataframe["Affiliation"].apply(is_hospital_affiliation) - affiliations_dataframe["standardized_city"] = affiliations_dataframe["Ville_canonique_Dpt"].apply(remove_department_numbers_and_convert_acronyms).apply(remove_accents) - - if len(affiliations_dataframe[affiliations_dataframe["contains_acronyms"] == True]) == 0: + affiliations_dataframe["standardized_city"] = affiliations_dataframe["Ville_canonique_Dpt"].apply(remove_department_numbers).apply(convert_acronyms).apply(remove_accents) + + acronyms_dataframe = affiliations_dataframe[affiliations_dataframe["contains_acronyms"] == True] # noqa: E712 + if len(acronyms_dataframe) == 0: # noqa: E712 continue # create new dataframe only with affiliations that contain acronyms - dataframe_with_acronyms = affiliations_dataframe[affiliations_dataframe["contains_acronyms"] == True] - dataframe_with_acronyms["city_in_affiliations"] = dataframe_with_acronyms["standardized_city"].apply(is_city_in_affiliation,affiliation=affiliation) - - if len(dataframe_with_acronyms[dataframe_with_acronyms["city_in_affiliations"] == True]) == 0: + acronyms_dataframe["city_in_affiliations"] = acronyms_dataframe["standardized_city"].apply(is_city_in_affiliation, affiliation=affiliation) + + ancronyms_cities_dataframe = acronyms_dataframe[acronyms_dataframe["city_in_affiliations"] == True] # noqa: E712 + if len(ancronyms_cities_dataframe) == 0: continue # create new dataframe only with affiliations that contain acronyms and cities - dataframe_with_acronyms_and_cities = dataframe_with_acronyms[dataframe_with_acronyms["city_in_affiliations"] == True] - dataframe_with_acronyms_and_cities["ratio"] = dataframe_with_acronyms_and_cities["Affiliation"].apply(affiliations_match_ratio,second_affiliation=affiliation) - hospital = dataframe_with_acronyms_and_cities["Orga NonCnrs Acorriger"][dataframe_with_acronyms_and_cities["ratio"].idxmax()] - + ancronyms_cities_dataframe["ratio"] = ancronyms_cities_dataframe["Affiliation"].apply(affiliations_match_ratio, second_affiliation=affiliation) + hospital = ancronyms_cities_dataframe["Orga NonCnrs Acorriger"][ancronyms_cities_dataframe["ratio"].idxmax()] break return hospital + def main(): for line in sys.stdin: data = json.loads(line) - texte=data["value"] - data["value"]=get_corresponding_hospital_from_affiliation(texte) + texte = data["value"] + data["value"] = get_corresponding_hospital_from_affiliation(texte) sys.stdout.write(json.dumps(data)) sys.stdout.write("\n") + if __name__ == "__main__": - main() \ No newline at end of file + main()