#!/usr/bin/env python3
# flake8: noqa E501 (line length)
"""Normalize hospital affiliations read from a semicolon-separated CSV file.

The script reads ``hospital_affiliations_1.csv``, flags the rows whose
``Affiliation`` text contains one of ``config.acronyms``, derives a
standardized city name from ``Ville_canonique_Dpt`` and, when at least one
row is flagged, writes the flagged rows to
``hospital_affiliation_normalized.csv``.
"""
import re
import unicodedata

import pandas as pd

import config

# Kept so the process-wide pandas setting stays as it was for any importer;
# the pipeline below takes an explicit copy and no longer relies on it.
pd.options.mode.chained_assignment = None

# Combining diacritical marks left behind by NFD decomposition.
_COMBINING_MARKS = re.compile("[\u0300-\u036f]")


def is_hospital_affiliation(affiliation):
    """Return True if *affiliation* contains one of ``config.acronyms``.

    The affiliation is lower-cased before the substring search; the acronyms
    are used as they are.
    NOTE(review): this presumably expects ``config.acronyms`` to hold
    lower-case entries -- confirm against the config module.

    Non-string values (for example the NaN pandas produces for an empty CSV
    cell) are reported as not being a hospital affiliation instead of raising.
    """
    if not isinstance(affiliation, str):
        return False
    affiliation_lower = affiliation.lower()
    return any(aff in affiliation_lower for aff in config.acronyms)


def remove_accents(city_name):
    """City correction: strip accents from *city_name*.

    The text is decomposed (NFD) so that every accent becomes a separate
    combining mark, then the marks in U+0300..U+036F are removed.
    """
    decomposed = unicodedata.normalize("NFD", city_name)
    return _COMBINING_MARKS.sub("", decomposed)


def remove_department_numbers(city_name):
    """City correction: drop the department number suffix.

    Everything from the first ``" -"`` onwards is discarded and the remaining
    hyphens are replaced by spaces, e.g. ``"Grenoble - 38"`` -> ``"Grenoble"``.
    """
    without_department = city_name.split(" -", 1)[0]
    return without_department.replace("-", " ")


def convert_acronyms(city_name):
    """City correction: abbreviate ``"Saint "`` to ``"St "`` and ``"Mont "`` to ``"Mt "``."""
    return city_name.replace("Saint ", "St ").replace("Mont ", "Mt ")


def is_city_in_affiliation(city, affiliation):
    """Return True if *city* occurs in *affiliation*, ignoring case."""
    return city.lower() in affiliation.lower()


def _standardize_city(city_name):
    """Apply the three city corrections in the order the pipeline needs them."""
    return remove_accents(convert_acronyms(remove_department_numbers(city_name)))


affiliations_dataframe = pd.read_csv("hospital_affiliations_1.csv", sep=";")
acronyms = config.acronyms

# Standardize the original dataframe. This is done once: the result does not
# depend on any individual acronym, so repeating it per acronym only redid the
# same work and rewrote the same output file.
affiliations_dataframe["contains_acronyms"] = affiliations_dataframe["Affiliation"].apply(
    is_hospital_affiliation)
# na_action="ignore" leaves missing cities missing instead of failing on them.
affiliations_dataframe["standardized_city"] = affiliations_dataframe["Ville_canonique_Dpt"].map(
    _standardize_city, na_action="ignore")

# astype(bool) guarantees a boolean mask even when the input file has no rows;
# copy() makes the selection independent of the original frame before it is
# modified.
is_hospital_row = affiliations_dataframe["contains_acronyms"].astype(bool)
normalized_dataframe = affiliations_dataframe.loc[is_hospital_row].copy()

# Nothing is written when no affiliation matched.
if not normalized_dataframe.empty:
    normalized_dataframe["standardized_hospital"] = normalized_dataframe["Hospital"]
    # Columns are dropped by position (second to fourth).
    # NOTE(review): this depends on the column order of the input CSV -- confirm.
    normalized_dataframe = normalized_dataframe.drop(
        normalized_dataframe.columns[[1, 2, 3]], axis=1)
    normalized_dataframe.to_csv(
        'hospital_affiliation_normalized.csv', sep=';', encoding='utf-8',
        index=False)