"""Homogenise hospital affiliations received as JSON lines on stdin.

Each input line is a JSON object with a ``value`` key holding a raw
affiliation string.  The string is matched against the reference table
``hospital_affiliations.csv`` and ``value`` is replaced by the corrected
organisation name, or by ``"N.C"`` when no match is found.  One JSON object
is written to stdout per input record.
"""

import json
import sys

import pandas as pd
from fuzzywuzzy import fuzz
from nltk.tokenize import word_tokenize
from numpy import column_stack

# Kept from the original script: silence pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

# Value returned when an affiliation cannot be matched.
NOT_FOUND = "N.C"

# Column names of the reference table used by this script.
_AFFILIATION_COLUMN = "Affiliation"
_CITY_COLUMN = "Ville_canonique_Dpt"
_ORGA_COLUMN = "Orga NonCnrs Acorriger"
# Boolean column added by this script to flag rows containing a known term.
_FLAG_COLUMN = "Terme Affiliation"

# Lower-case terms/acronyms; an affiliation is only processed when it
# contains at least one of them.
_ACRONYMS = (
    "chr", "chr ", "chu ", "chu,", "chru", "ap hp", "ap hm", "aphp", "aphm",
    "hosp", "hop ", "hop,", " serv", "clcc", "ctr lutte canc",
    "antoine lacassagne", "eugene marquis", "baclesse",
    "georges françois leclerc", "gf leclerc", "jf leclerc",
    "henri becquerel", "h becquerel", "jean perrin", "leon berard",
    "ctr oscar lambret", "oscar lambret comprehens", "oscar lambret canc",
    "clcc oscar lambret", "ctr lutte canc oscar lambret", "gustave roussy",
    "bergonie", "inst curie", "curie inst", "rene gauducheau",
    "inst cancerol ouest", "inst cancerol lorraine", "alexis vautrin",
    "inst reg canc montpellier", "icans", "paul strauss", "paul str",
    "paoli calmettes", "inst j paoli i calmettes", "claudius regaud",
    "claudius rigaud", "inst univ canc toulouse oncopole",
    "inst univ cancerol oncopole", "iuct oncopole", "ctr lutte contre canc",
    "ico canc", "icm, montpellier canc inst",
    "univ inst canc toulouse oncopole", "rothschild",
)

# Accented characters stripped from city names (single pass via translate).
_ACCENT_TABLE = str.maketrans({
    "ç": "c",
    "é": "e",
    "è": "e",
    "É": "E",
    "â": "a",
    "ê": "e",
    "î": "i",
    "à": "a",
})


def terme_aff(texte, liste=None):
    """Return True when *texte* contains at least one term of *liste*.

    The comparison is done on the lower-cased text, so the terms are expected
    to be lower-case.  A missing/empty *liste* or a non-string *texte* (e.g. a
    NaN cell of the reference table) yields False instead of raising.
    """
    if not liste or not isinstance(texte, str):
        return False
    lowered = texte.lower()
    return any(aff in lowered for aff in liste)


def correction(texte):
    """Normalise a city name from the reference table.

    Everything from the first ``" -"`` separator onwards is dropped, remaining
    hyphens become spaces, ``Saint`` is shortened to ``St`` and a fixed set of
    accented characters is replaced by their unaccented form.  Non-string
    values are returned unchanged.
    """
    if not isinstance(texte, str):
        return texte
    stripped = texte.split(" -", 1)[0]
    # "Saint" is replaced before the accents are removed, as in the original
    # chain of replacements.
    stripped = stripped.replace("-", " ").replace("Saint", "St")
    return stripped.translate(_ACCENT_TABLE)


def aff_ville(ville, texte):
    """Return True when the city *ville* occurs in *texte* (case-insensitive).

    Non-string arguments (e.g. a NaN city cell) yield False.
    """
    if not isinstance(ville, str) or not isinstance(texte, str):
        return False
    return ville.lower() in texte.lower()


def fuzzywuzzy(affiliation, text=None):
    """Return the ``fuzz.ratio`` similarity between *affiliation* and *text*."""
    return fuzz.ratio(affiliation, text)


def _prepare_reference(df):
    """Add the term flag and normalise the cities of *df*, once per table.

    The work does not depend on the affiliation being looked up, so it is
    skipped when the flag column is already present.
    NOTE(review): this assumes the CSV itself does not ship a column named
    "Terme Affiliation" - confirm against the reference file.
    """
    if _FLAG_COLUMN in df.columns:
        return
    df[_FLAG_COLUMN] = df[_AFFILIATION_COLUMN].apply(terme_aff, liste=_ACRONYMS)
    df[_CITY_COLUMN] = df[_CITY_COLUMN].apply(correction)


def base(texte, df):
    """Return the corrected organisation for the affiliation *texte*.

    Parameters:
        texte: raw affiliation string to homogenise.
        df: reference table with the columns "Affiliation",
            "Ville_canonique_Dpt" and "Orga NonCnrs Acorriger".  On first use
            it is modified in place: a "Terme Affiliation" column is added and
            the city column is normalised.

    Returns:
        The "Orga NonCnrs Acorriger" value of the reference row whose
        affiliation is the most similar to *texte*, among the rows that
        contain a known term and whose city occurs in *texte*; ``"N.C"`` when
        *texte* is not a string, contains no known term, or no row qualifies.
    """
    # The affiliation to homogenise must itself contain one of the terms.
    if not isinstance(texte, str) or not terme_aff(texte, liste=_ACRONYMS):
        return NOT_FOUND

    _prepare_reference(df)

    candidates = df[df[_FLAG_COLUMN].eq(True)]
    if candidates.empty:
        return NOT_FOUND

    # Keep only the reference rows whose city appears in the affiliation.
    city_present = candidates[_CITY_COLUMN].apply(aff_ville, texte=texte)
    matches = candidates[city_present.astype(bool)]
    if matches.empty:
        return NOT_FOUND

    # idxmax returns the label of the first best ratio, as before.
    ratios = matches[_AFFILIATION_COLUMN].apply(fuzzywuzzy, text=texte)
    orga = matches.loc[ratios.idxmax(), _ORGA_COLUMN]
    # An empty organisation name is reported as "not found", as before.
    if orga == "":
        return NOT_FOUND
    return orga


def main():
    """Read JSON records from stdin and write the homogenised ones to stdout."""
    df = pd.read_csv('hospital_affiliations.csv', sep=";")
    for line in sys.stdin:
        # A blank line (e.g. trailing newline) is not a record; skip it
        # rather than failing in json.loads.
        if not line.strip():
            continue
        data = json.loads(line)
        data['value'] = base(data['value'], df)
        sys.stdout.write(json.dumps(data))
        sys.stdout.write('\n')


if __name__ == "__main__":
    main()