#!/usr/bin/env python3
# flake8: noqa E501 (line length)
"""Standardize the city and hospital names of hospital affiliations.

Reads ``hospital_affiliations_1.csv`` (``;``-separated), flags the rows whose
``Affiliation`` text contains one of ``config.acronyms``, derives a
``standardized_city`` column from ``Ville_canonique_Dpt``, keeps only the
flagged rows and writes them to ``hospital_affiliation_normalize.csv``.
"""

import pandas as pd
import config
import unicodedata
import re

pd.options.mode.chained_assignment = None

# Unicode combining diacritical marks (U+0300-U+036F); compiled once because
# remove_accents() is applied to every row.
_COMBINING_MARKS = re.compile("[\u0300-\u036f]")


def is_hospital_affiliation(affiliation):
    """Return True when *affiliation* contains one of ``config.acronyms``.

    Only the affiliation is lower-cased before the substring test, so the
    acronyms in ``config`` are presumably stored in lower case (TODO confirm).
    Non-string values, such as the NaN pandas produces for an empty CSV cell,
    are reported as not being a hospital affiliation instead of raising.
    """
    if not isinstance(affiliation, str):
        return False
    affiliation_lower = affiliation.lower()
    return any(aff in affiliation_lower for aff in config.acronyms)


# correction of the cities : remove accents
def remove_accents(city_name):
    """Return *city_name* with its combining diacritical marks removed.

    The text is decomposed (NFD) so that each accented letter becomes a base
    letter followed by combining marks, which are then stripped.
    """
    normalized_text = unicodedata.normalize("NFD", city_name)
    return _COMBINING_MARKS.sub("", normalized_text)


# correction of the cities : remove the department numbers
def remove_department_numbers(city_name):
    """Drop everything from the first " -" onward and replace "-" by spaces.

    Example: "Grenoble - 38" => "Grenoble".
    """
    stripped = city_name.split(" -", 1)[0]
    return stripped.replace("-", " ")


# correction of the cities: convert acronyms
def convert_acronyms(city_name):
    """Abbreviate "Saint " to "St " and "Mont " to "Mt " in *city_name*."""
    return city_name.replace("Saint ", "St ").replace("Mont ", "Mt ")


def is_city_in_affiliation(city, affiliation):
    """Return True when *city* occurs in *affiliation*, ignoring case."""
    return city.lower() in affiliation.lower()


affiliations_dataframe = pd.read_csv("hospital_affiliations_1.csv", sep=";")
acronyms = config.acronyms

# standardize original dataframe
# Computed once for the whole frame: is_hospital_affiliation() already checks
# every acronym, so iterating over the acronyms here would only repeat the
# same work and would leave normalize_dataframe undefined for an empty list.
# astype(bool) keeps the column usable as a boolean mask even for an empty frame.
affiliations_dataframe["contains_acronyms"] = (
    affiliations_dataframe["Affiliation"].apply(is_hospital_affiliation).astype(bool)
)
affiliations_dataframe["standardized_city"] = (
    affiliations_dataframe["Ville_canonique_Dpt"]
    .apply(remove_department_numbers)
    .apply(convert_acronyms)
    .apply(remove_accents)
)

# Explicit copy: the new column below must land on an independent frame, not
# on a slice of affiliations_dataframe.
normalize_dataframe = affiliations_dataframe[affiliations_dataframe["contains_acronyms"]].copy()

# The hospital column is only added when at least one row matched.
if len(normalize_dataframe) > 0:
    # Index-aligned assignment: each kept row receives its own source value.
    normalize_dataframe["standardized_hospital"] = affiliations_dataframe["Orga NonCnrs Acorriger"]

# Drops the 2nd to 4th columns by position.
# NOTE(review): this depends on the column order of the input CSV - confirm.
normalize_dataframe = normalize_dataframe.drop(normalize_dataframe.columns[[1, 2, 3]], axis=1)
normalize_dataframe.to_csv('hospital_affiliation_normalize.csv', sep=';', encoding='utf-8', index=False)