diff --git a/hospital-affiliations/aff_hosp.py b/hospital-affiliations/aff_hosp.py
index c741bad..525ecff 100755
--- a/hospital-affiliations/aff_hosp.py
+++ b/hospital-affiliations/aff_hosp.py
@@ -1,10 +1,12 @@
 from numpy import column_stack
 import pandas as pd
-pd.options.mode.chained_assignment = None
 from nltk.tokenize import word_tokenize
 from fuzzywuzzy import fuzz
 import sys
 import json
+import unicodedata
+import re
+pd.options.mode.chained_assignment = None
 
 ## MODIFICATION DU FICHIER EXCEL
 # On regarde si la liste des termes acronymes choisis est présente dans les affiliations
@@ -16,23 +18,25 @@
 # Il faut également que l'affiliation qu'on veut homogénéiser ait un terme de la liste
 
 
-#Correction des villes
-def correction(texte) :
+#correction of the cities : remove the department numbers and transformation into acronyms
+def remove_department_numbers_and_acronyms(city_name) :
+    # department numbers: Grenoble - 38 => Grenoble
     sep = ' -'
-    stripped = texte.split(sep, 1)[0]
+    stripped = city_name.split(sep, 1)[0]
     stripped = stripped.replace("-"," ")
-    stripped = stripped.replace("ç","c")
+
+    # acronyms
     stripped = stripped.replace("Saint","St")
-    stripped = stripped.replace("é","e")
-    stripped = stripped.replace("è","e")
-    stripped = stripped.replace("É","E")
-    stripped = stripped.replace("â","a")
-    stripped = stripped.replace("ê","e")
-    stripped = stripped.replace("î","i")
-    stripped = stripped.replace("à","a")
+    stripped = stripped.replace("Mont","Mt")
     return stripped
 
 
+# correction of the cities : remove accents
+def remove_accents(city_name) :
+    normalized_text = unicodedata.normalize("NFD", city_name)
+    text_with_no_accent = re.sub("[\u0300-\u036f]", '', normalized_text)
+    return text_with_no_accent
+
 # On repère/marque les lignes dont l'affiliation contient la ville
 def aff_ville(ville,texte):
     if ville.lower() in texte.lower() :
@@ -55,7 +59,8 @@
     if ac in texte.lower():
         # APPEL DES FONCTIONS
         df['Terme Affiliation'] = df["Affiliation"].apply(terme_aff,liste=acro)
-        df['Ville_canonique_Dpt'] = df['Ville_canonique_Dpt'].apply(correction)
+        df['Ville_canonique_Dpt'] = df['Ville_canonique_Dpt'].apply(remove_department_numbers_and_acronyms)
+        df['Ville_canonique_Dpt'] = df['Ville_canonique_Dpt'].apply(remove_accents)
         if len(df[df['Terme Affiliation'] == True]) != 0:
             df2 = df[df['Terme Affiliation'] == True]
             df2["Ville_présente"] = df2['Ville_canonique_Dpt'].apply(aff_ville,texte=texte)
diff --git a/hospital-affiliations/input_data.txt b/hospital-affiliations/input_data.txt
index e375b72..9a74acb 100644
--- a/hospital-affiliations/input_data.txt
+++ b/hospital-affiliations/input_data.txt
@@ -2,4 +2,4 @@
 {"id" :"4","value": "Hop Necker Enfants Malad, Gen Pediat Dept, Paris, France"}
 {"id" :"5","value": "Bergonie Inst, Dept Radiat Oncol, Bordeaux, France"}
 {"id" :"6","value": "CHU Besancon, Serv Neurol, 2 Blvd Fleming, F-25030 Besancon, France"}
-{"id" :"7","value": "Univ Bordeaux, INCIA, CNRS UMR 5287, Equipe Neuropsychopharmacol Addict, BP31,146 Rue Leo Saignat, F-33076 Bordeaux, France"}
\ No newline at end of file
+{"id" :"7","value": "CHU St Etienne, Ave Albert Raimond, F-42055 St Etienne, France"}
\ No newline at end of file