"""Resolve a free-text affiliation to a hospital organisation name.

The script loads a reference table from ``hospital_affiliations.csv`` and then
reads JSON objects from standard input, one per line. For each object, the
``value`` field (an affiliation string) is replaced by the result of
:func:`base` and the object is written back to standard output as JSON.
"""
import json
import re
import sys
import unicodedata

from numpy import column_stack
import pandas as pd
from nltk.tokenize import word_tokenize
from fuzzywuzzy import fuzz

pd.options.mode.chained_assignment = None

# Value returned when no organisation can be determined.
_NOT_FOUND = "N.C"

# Lower-case substrings whose presence marks an affiliation as hospital-related.
_HOSPITAL_TERMS = (
    "chr", "chr ", "chu ", "chu,", "chru", "ap hp", "ap hm", "aphp", "aphm",
    "hosp", "hop ", "hop,", " serv", "clcc", "ctr lutte canc",
    "antoine lacassagne", "eugene marquis", "baclesse",
    "georges françois leclerc", "gf leclerc", "jf leclerc",
    "henri becquerel", "h becquerel", "jean perrin", "leon berard",
    "ctr oscar lambret", "oscar lambret comprehens", "oscar lambret canc",
    "clcc oscar lambret", "ctr lutte canc oscar lambret", "gustave roussy",
    "bergonie", "inst curie", "curie inst", "rene gauducheau",
    "inst cancerol ouest", "inst cancerol lorraine", "alexis vautrin",
    "inst reg canc montpellier", "icans", "paul strauss", "paul str",
    "paoli calmettes", "inst j paoli i calmettes", "claudius regaud",
    "claudius rigaud", "inst univ canc toulouse oncopole",
    "inst univ cancerol oncopole", "iuct oncopole", "ctr lutte contre canc",
    "ico canc", "icm, montpellier canc inst",
    "univ inst canc toulouse oncopole", "rothschild",
)

# Unicode combining diacritical marks left behind by NFD decomposition.
_COMBINING_MARKS = re.compile("[\u0300-\u036f]")

# "Mont" as a standalone word only, so that names such as "Montpellier" are
# left intact.
_MONT_WORD = re.compile(r"\bMont\b")


def is_hospital_affiliation(affiliation, list):
    """Return True if any element of *list* is a substring of *affiliation*.

    *affiliation* is lower-cased before the comparison; the elements of
    *list* are compared as given.

    The parameter name ``list`` shadows the builtin but is kept because
    callers pass it by keyword.
    """
    affiliation_lower = affiliation.lower()
    return any(aff in affiliation_lower for aff in list)


def remove_department_numbers_and_acronyms(city_name):
    """Normalise a city name: drop the department suffix and abbreviate.

    * Everything from the first ``" -"`` onwards is removed
      ("Grenoble - 38" -> "Grenoble").
    * Remaining hyphens become spaces.
    * "Saint" is replaced by "St" wherever it occurs (so "Sainte" -> "Ste").
    * "Mont" is replaced by "Mt" only when it is a whole word, so
      "Mont Saint Aignan" -> "Mt St Aignan" while "Montpellier" is unchanged.
    """
    stripped = city_name.split(" -", 1)[0]
    stripped = stripped.replace("-", " ")
    stripped = stripped.replace("Saint", "St")
    return _MONT_WORD.sub("Mt", stripped)


def remove_accents(city_name):
    """Return *city_name* with combining diacritical marks removed."""
    normalized_text = unicodedata.normalize("NFD", city_name)
    return _COMBINING_MARKS.sub("", normalized_text)


def is_city_in_affiliation(city, affiliation):
    """Return True if *city* occurs in *affiliation*, ignoring case."""
    return city.lower() in affiliation.lower()


def fuzzywuzzy(affiliation, text=None):
    """Return ``fuzz.ratio`` between *affiliation* and *text*."""
    return fuzz.ratio(affiliation, text)


def base(texte, df):
    """Return the organisation of the reference row that best matches *texte*.

    Steps:

    1. If *texte* contains none of the hospital keywords, return "N.C".
    2. Keep the rows of *df* whose "Affiliation" contains a hospital keyword.
    3. Among those, keep the rows whose normalised "Ville_canonique_Dpt"
       occurs in *texte*.
    4. Return the "Orga NonCnrs Acorriger" value of the remaining row whose
       "Affiliation" has the highest ``fuzz.ratio`` with *texte*.

    "N.C" is returned whenever a step leaves no candidate, or when the
    selected organisation is empty or missing.

    *df* is not modified.
    """
    texte_lower = texte.lower()
    if not any(term in texte_lower for term in _HOSPITAL_TERMS):
        return _NOT_FOUND

    # astype(bool) keeps the mask usable even when df has no rows.
    is_hospital = df["Affiliation"].apply(
        is_hospital_affiliation, list=_HOSPITAL_TERMS
    ).astype(bool)
    hospitals = df[is_hospital]
    if hospitals.empty:
        return _NOT_FOUND

    # Normalise city names only for the rows that are still candidates.
    cities = (
        hospitals["Ville_canonique_Dpt"]
        .apply(remove_department_numbers_and_acronyms)
        .apply(remove_accents)
    )
    city_present = cities.apply(
        is_city_in_affiliation, affiliation=texte
    ).astype(bool)
    candidates = hospitals[city_present]
    if candidates.empty:
        return _NOT_FOUND

    ratios = candidates["Affiliation"].apply(fuzzywuzzy, text=texte)
    Orga = candidates.loc[ratios.idxmax(), "Orga NonCnrs Acorriger"]

    # An empty CSV cell is read as NaN, which json.dumps would emit as the
    # invalid JSON token NaN; treat it like an empty string.
    if pd.isna(Orga) or Orga == "":
        return _NOT_FOUND
    return Orga


df = pd.read_csv('hospital_affiliations.csv', sep=";")

for line in sys.stdin:
    data = json.loads(line)
    texte = data['value']
    data['value'] = base(texte, df)
    sys.stdout.write(json.dumps(data))
    sys.stdout.write('\n')