# web-services / hospital-affiliations / aff_hosp.py
from numpy import column_stack
import pandas as pd
pd.options.mode.chained_assignment = None  
from nltk.tokenize import word_tokenize
from fuzzywuzzy import fuzz
import sys
import json

## PROCESSING OF THE EXCEL FILE
# Check whether one of the selected acronym terms appears in the affiliation
def terme_aff(texte, liste=None) :
    """Tell whether *texte* contains at least one term of *liste*.

    Only *texte* is lower-cased before the comparison; the terms are used
    exactly as given, so they must already be lower-case to match.

    Args:
        texte: String to inspect (an affiliation).
        liste: Iterable of substrings to look for. ``None`` (the default)
            is treated as "no terms", so the result is ``False``.

    Returns:
        ``True`` if any term is a substring of the lower-cased *texte*,
        ``False`` otherwise.
    """
    # The default used to be iterated directly, which raised TypeError.
    if liste is None:
        return False
    # Lower-case once instead of once per term.
    texte_min = texte.lower()
    return any(aff in texte_min for aff in liste)
# The affiliation we want to normalise must also contain a term from the list


# Normalisation of city names
def correction(texte) :
    """Normalise a city label so it can be searched for in an affiliation.

    Everything from the first ``" -"`` onwards is dropped (presumably a
    department suffix -- confirm against the reference file). The ordered
    substitutions below are then applied to what remains: hyphens become
    spaces, ``Saint`` is abbreviated to ``St`` and a fixed set of accented
    letters is replaced by unaccented ones.

    Args:
        texte: Raw city label.

    Returns:
        The normalised label.
    """
    # Applied strictly in this order, one after the other.
    remplacements = (
        ("-", " "),
        ("ç", "c"),
        ("Saint", "St"),
        ("é", "e"),
        ("è", "e"),
        ("É", "E"),
        ("â", "a"),
        ("ê", "e"),
        ("î", "i"),
        ("à", "a"),
    )

    # Keep only the part located before the first " -".
    ville = texte.partition(' -')[0]
    for ancien, nouveau in remplacements:
        ville = ville.replace(ancien, nouveau)
    return ville

# Flag the rows whose city appears in the affiliation
def aff_ville(ville,texte):
    """Tell whether the city name occurs in the text, ignoring case.

    Args:
        ville: City name to look for.
        texte: Text to search in.

    Returns:
        ``True`` if the lower-cased *ville* is a substring of the
        lower-cased *texte*, ``False`` otherwise.
    """
    recherche = ville.lower()
    contenu = texte.lower()
    return recherche in contenu

def fuzzywuzzy(affiliation,text = None):
    """Score the similarity between *affiliation* and *text*.

    Thin wrapper around ``fuzz.ratio`` whose keyword argument makes it
    usable with ``Series.apply(fuzzywuzzy, text=...)``.

    Args:
        affiliation: First string to compare.
        text: Second string to compare; defaults to ``None``.

    Returns:
        Whatever ``fuzz.ratio`` returns for the pair (a similarity score;
        a higher value is treated as a better match by the caller).
    """
    score = fuzz.ratio(affiliation, text)
    return score

def base(texte,df):
    """Return the reference organisation that best matches an affiliation.

    The lookup proceeds in four steps:

    1. *texte* must contain at least one of the terms of ``acro``;
       otherwise ``"N.C"`` is returned straight away.
    2. Only the reference rows whose ``Affiliation`` contains one of those
       terms are kept.
    3. Among them, only the rows whose normalised ``Ville_canonique_Dpt``
       occurs in *texte* are kept.
    4. The remaining row whose ``Affiliation`` gets the highest
       ``fuzzywuzzy`` score against *texte* wins (first one on ties).

    *df* is never modified: the intermediate flags live in local Series
    rather than in extra columns, so calling this function repeatedly with
    the same frame has no side effect on it.

    Args:
        texte: Affiliation string to identify.
        df: Reference DataFrame providing the columns ``Affiliation``,
            ``Ville_canonique_Dpt`` and ``Orga NonCnrs Acorriger``.

    Returns:
        The ``Orga NonCnrs Acorriger`` value of the best matching row, or
        ``"N.C"`` when no row qualifies.
    """
    acro = ["chr","chr ","chu ","chu,","chru","ap hp","ap hm","aphp","aphm","hosp","hop ","hop,"," serv","clcc","ctr lutte canc","antoine lacassagne","eugene marquis","baclesse",
    "georges françois leclerc","gf leclerc","jf leclerc","henri becquerel","h becquerel","jean perrin","leon berard","ctr oscar lambret",
    "oscar lambret comprehens","oscar lambret canc","clcc oscar lambret","ctr lutte canc oscar lambret","gustave roussy","bergonie",
    "inst curie","curie inst","rene gauducheau","inst cancerol ouest","inst cancerol lorraine","alexis vautrin","inst reg canc montpellier",
    "icans","paul strauss","paul str","paoli calmettes","inst j paoli i calmettes","claudius regaud","claudius rigaud","inst univ canc toulouse oncopole",
    "inst univ cancerol oncopole","iuct oncopole","ctr lutte contre canc","ico canc","icm, montpellier canc inst","univ inst canc toulouse oncopole","rothschild"]

    non_classe = "N.C"

    # Step 1: nothing to look up unless the text mentions a known term.
    texte_min = texte.lower()
    if not any(ac in texte_min for ac in acro):
        return non_classe

    # Step 2: reference rows mentioning one of the terms. Boolean indexing
    # yields a new frame, so the caller's df is left as it was.
    avec_terme = df["Affiliation"].apply(terme_aff, liste=acro).astype(bool)
    candidats = df[avec_terme]
    if candidats.empty:
        return non_classe

    # Step 3: rows whose normalised city name appears in the text.
    villes = candidats["Ville_canonique_Dpt"].apply(correction)
    ville_presente = villes.apply(aff_ville, texte=texte).astype(bool)
    candidats = candidats[ville_presente]
    if candidats.empty:
        return non_classe

    # Step 4: keep the closest affiliation; idxmax gives the label of the
    # first row holding the maximum score.
    ratios = candidats["Affiliation"].apply(fuzzywuzzy, text=texte)
    orga = candidats.loc[ratios.idxmax(), "Orga NonCnrs Acorriger"]

    # An empty label is reported as "not classified", like a failed lookup.
    if orga == "":
        return non_classe
    return orga

# Reference table of known hospital affiliations, loaded once at start-up.
df = pd.read_csv('hospital_affiliations.csv', sep=";")


# Each line read from stdin is one JSON object. Its 'value' field (the raw
# affiliation) is replaced by the organisation returned by base(), and the
# object is written back to stdout as one JSON line.
for ligne in sys.stdin:
    enregistrement = json.loads(ligne)
    enregistrement['value'] = base(enregistrement['value'], df)
    sys.stdout.write(json.dumps(enregistrement) + '\n')

## TEST 
#{"id": "1","value": "CHRU Hautepierre Strasbourg, Lab Anat Pathol, 1 Ave Moliere, F-67098 Strasbourg, France"},
# {"id" :"2","value": "AP HP, Paris Ctr Necker Cochin, Clin Res Unit, Paris, France"},
# {"id" :"3","value": "Hop La Pitie Salpetriere, AP HP, Serv Med Interne, Ctr Natl Reference Histiocytoses, Paris, France"},
# {"id" :"4","value": "Hop Necker Enfants Malad, Gen Pediat Dept, Paris, France"},
# {"id" :"5","value": "Bergonie Inst, Dept Radiat Oncol, Bordeaux, France"}
#{"id" :"6","value": "CHU Besancon, Serv Neurol, 2 Blvd Fleming, F-25030 Besancon, France"}
#{"id" :"7","value": "Univ Bordeaux, INCIA, CNRS UMR 5287, Equipe Neuropsychopharmacol Addict, BP31,146 Rue Leo Saignat, F-33076 Bordeaux, France"}

"""
lines = json.loads(json.dumps([
{"id" :"5","value": "Hop Charpennes, 27 Rue G Peri, F-69100 Villeurbanne, France"},
{"id" :"6","value": "Fdn Ophtalmol Rothschild, ERT TREAT Vis, 25 Rue Manin, F-75019 Paris, France"},
{"id" :"7","value": "Ctr Hosp, Mt De Marsan, France"},
{"id": "8", "value":"CHU Pointe A Pitre, Serv Gynecol Obstet, F-97159 Pointe A Pitre, Guadeloupe, France"},
]))

for line in lines:
    data = line
    texte=data['value']
    data['value']=base(texte,df)
    sys.stdout.write(json.dumps(data))
    sys.stdout.write('\n')
"""