#!/usr/bin/env python3
# flake8: noqa E501 (line length)
import pandas as pd
import config
import unicodedata
import re
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this script.
pd.options.mode.chained_assignment = None
def is_hospital_affiliation(affiliation):
    """Return True if the affiliation text contains a known hospital acronym.

    The test is a case-insensitive substring search: the affiliation and each
    entry of ``config.acronyms`` are lowercased before being compared.

    Args:
        affiliation: Raw affiliation text. Non-string values (for example the
            NaN that pandas produces for an empty CSV cell) are treated as
            containing no acronym.

    Returns:
        bool: True when at least one acronym occurs in the affiliation,
        False otherwise.
    """
    if not isinstance(affiliation, str):
        return False
    affiliation_lower = affiliation.lower()
    # Lowercase the acronym as well: the affiliation has been lowercased, so
    # an entry written in upper case (e.g. "CHU") could otherwise never match.
    return any(aff.lower() in affiliation_lower for aff in config.acronyms)
def remove_accents(city_name):
    """City clean-up step: strip accents from a city name.

    The name is decomposed with Unicode NFD, then every code point in the
    combining diacritical marks range U+0300-U+036F is dropped.
    """
    decomposed = unicodedata.normalize("NFD", city_name)
    return "".join(
        char for char in decomposed if not ("\u0300" <= char <= "\u036f")
    )
def remove_department_numbers(city_name):
    """City clean-up step: drop the trailing department number.

    Everything from the first " -" onwards is discarded, so
    "Grenoble - 38" becomes "Grenoble". Hyphens remaining in the kept part
    are then replaced by spaces.
    """
    name_only, _, _ = city_name.partition(" -")
    return name_only.replace("-", " ")
def convert_acronyms(city_name):
    """City clean-up step: shorten "Saint " to "St " and "Mont " to "Mt ".

    The replacements are case-sensitive and applied in that order.
    """
    converted = city_name
    for long_form, short_form in (("Saint ", "St "), ("Mont ", "Mt ")):
        converted = converted.replace(long_form, short_form)
    return converted
def is_city_in_affiliation(city, affiliation):
    """Tell whether the city name occurs in the affiliation text, ignoring case."""
    needle = city.lower()
    haystack = affiliation.lower()
    return needle in haystack
# Load the raw affiliations from a ';'-separated CSV file.
affiliations_dataframe = pd.read_csv("hospital_affiliations_1.csv", sep=";")
acronyms = config.acronyms

# Standardize the original dataframe.
# This used to run inside `for acronym in acronyms:`, but the loop variable was
# never used, so every iteration recomputed the same columns and rewrote the
# same output file. A single pass gives the same CSV.
affiliations_dataframe["contains_acronyms"] = affiliations_dataframe["Affiliation"].apply(
    is_hospital_affiliation)
affiliations_dataframe["standardized_city"] = (
    affiliations_dataframe["Ville_canonique_Dpt"]
    .apply(remove_department_numbers)
    .apply(convert_acronyms)
    .apply(remove_accents)
)

# Keep only the rows whose affiliation mentions a hospital acronym.
# astype(bool) guarantees a boolean mask even when the frame has no rows;
# .copy() makes the selection an independent frame so the column added below
# is not a chained assignment on a slice of affiliations_dataframe.
hospital_rows = affiliations_dataframe["contains_acronyms"].astype(bool)
normalized_dataframe = affiliations_dataframe[hospital_rows].copy()

# Nothing is written when no affiliation matched.
if not normalized_dataframe.empty:
    # Assignment aligns on the index, so each kept row gets its own "Hospital" value.
    normalized_dataframe["standardized_hospital"] = affiliations_dataframe["Hospital"]
    # Columns are dropped by position (1, 2 and 3).
    # NOTE(review): this depends on the column order of the input CSV -
    # confirm which columns these are before changing the input layout.
    normalized_dataframe = normalized_dataframe.drop(
        normalized_dataframe.columns[[1, 2, 3]], axis=1)
    normalized_dataframe.to_csv(
        'hospital_affiliation_normalized.csv', sep=';', encoding='utf-8', index=False)