Newer
Older
web-services / hospital-affiliations / aff_hosp.py
import functools
import json
import re
import sys
import unicodedata

import pandas as pd
from fuzzywuzzy import fuzz
from nltk.tokenize import word_tokenize
from numpy import column_stack
pd.options.mode.chained_assignment = None  

# Check if an element of the list is in affiliation
def is_hospital_affiliation(affiliation, list) :
    """Tell whether at least one of the given markers occurs in the affiliation.

    The affiliation is lower-cased before the search; the markers are used
    as they are, so only lower-case markers can ever match.

    Args:
        affiliation: affiliation text to inspect.
        list: iterable of substrings to look for. The name shadows the
            builtin but is kept because callers pass it by keyword.

    Returns:
        True if a marker is a substring of the lower-cased affiliation,
        False otherwise (including when no markers are given).
    """
    haystack = affiliation.lower()
    return any(marker in haystack for marker in list)

#correction of the cities : remove the department numbers and convert acronyms
def remove_department_numbers_and_convert_acronyms(city_name) :
    """Normalize a city label: drop the department suffix and abbreviate.

    Steps, in order:
      1. keep only what precedes the first " -" ("Grenoble - 38" => "Grenoble");
      2. replace the remaining hyphens with spaces ("Saint-Etienne" => "Saint Etienne");
      3. abbreviate the standalone words "Saint"/"Sainte" to "St"/"Ste" and
         "Mont" to "Mt" (case-sensitive).

    Args:
        city_name: city label, optionally followed by " - <department>".

    Returns:
        The normalized city name.
    """
    # department numbers: Grenoble - 38 => Grenoble
    stripped = city_name.split(' -', 1)[0].replace("-", " ")

    # acronyms: match whole words only, so that names merely starting with
    # these letters (Montpellier, Montauban, Saintes, ...) are left intact
    # instead of being turned into "Mtpellier", "Mtauban", "Stes".
    stripped = re.sub(r"\bSaint(e?)\b", r"St\1", stripped)
    stripped = re.sub(r"\bMont\b", "Mt", stripped)

    return stripped

# correction of the cities : remove accents
def remove_accents(city_name) :
    """Return the text with its combining diacritical marks removed.

    The string is decomposed with Unicode NFD normalization, then every
    code point in the range U+0300..U+036F is deleted.

    Args:
        city_name: text to clean.

    Returns:
        The NFD-decomposed text without the marks in that range.
    """
    combining_marks = re.compile("[\u0300-\u036f]")
    return combining_marks.sub('', unicodedata.normalize("NFD", city_name))

def is_city_in_affiliation(city,affiliation):
    """Case-insensitive substring test of a city name inside an affiliation.

    Args:
        city: city name to look for.
        affiliation: affiliation text to search in.

    Returns:
        True if the lower-cased city occurs in the lower-cased affiliation.
        An empty city is contained in any affiliation.
    """
    needle = city.lower()
    haystack = affiliation.lower()
    return needle in haystack

def affiliations_match_ratio(first_affiliation,second_affiliation):
    """Score how similar two affiliation strings are.

    Thin wrapper around ``fuzz.ratio``; the strings are passed through
    unchanged (no case folding or other preprocessing is done here).

    Args:
        first_affiliation: first string to compare.
        second_affiliation: second string to compare.

    Returns:
        The similarity score computed by ``fuzz.ratio``.
    """
    similarity = fuzz.ratio(first_affiliation, second_affiliation)
    return similarity

# Lower-case markers (acronyms, abbreviations, institute names) whose presence
# in an affiliation flags it as a hospital affiliation.
_HOSPITAL_ACRONYMS = (
    "chr","chr ","chu ","chu,","chru","ap hp","ap hm","aphp","aphm","hosp","hop ","hop,"," serv","clcc","ctr lutte canc","antoine lacassagne","eugene marquis","baclesse",
    "georges françois leclerc","gf leclerc","jf leclerc","henri becquerel","h becquerel","jean perrin","leon berard","ctr oscar lambret",
    "oscar lambret comprehens","oscar lambret canc","clcc oscar lambret","ctr lutte canc oscar lambret","gustave roussy","bergonie",
    "inst curie","curie inst","rene gauducheau","inst cancerol ouest","inst cancerol lorraine","alexis vautrin","inst reg canc montpellier",
    "icans","paul strauss","paul str","paoli calmettes","inst j paoli i calmettes","claudius regaud","claudius rigaud","inst univ canc toulouse oncopole",
    "inst univ cancerol oncopole","iuct oncopole","ctr lutte contre canc","ico canc","icm, montpellier canc inst","univ inst canc toulouse oncopole","rothschild",
)


@functools.lru_cache(maxsize=1)
def _load_hospital_candidates():
    """Load the reference CSV once and keep only the hospital rows.

    Reads 'hospital_affiliations.csv' (";"-separated), keeps the rows whose
    "Affiliation" contains one of the hospital markers, and adds a
    "standardized_city" column derived from "Ville_canonique_Dpt".

    The result is cached and shared between calls: callers must not modify it.
    """
    reference = pd.read_csv('hospital_affiliations.csv', sep=";")
    has_acronym = reference["Affiliation"].apply(is_hospital_affiliation, list=_HOSPITAL_ACRONYMS)
    # astype(bool) keeps the mask a valid boolean indexer even for an empty CSV.
    candidates = reference[has_acronym.astype(bool)].copy()
    candidates["standardized_city"] = (
        candidates["Ville_canonique_Dpt"]
        .apply(remove_department_numbers_and_convert_acronyms)
        .apply(remove_accents)
    )
    return candidates


def get_corresponding_hospital_from_affiliation(affiliation):
    """Find the reference hospital that best matches an affiliation string.

    The affiliation is lower-cased, then:
      1. if it contains none of the hospital markers, the answer is "N.C";
      2. reference rows are restricted to hospital affiliations whose
         standardized city occurs in the affiliation;
      3. among those rows, the one whose "Affiliation" has the highest
         ``affiliations_match_ratio`` with the input wins (first row on ties).

    Args:
        affiliation: raw affiliation text.

    Returns:
        The "Orga NonCnrs Acorriger" value of the best matching reference
        row, or "N.C" when no row qualifies.
    """
    hospital = "N.C"
    affiliation = affiliation.lower()

    # The matching below does not depend on which marker was found, so a
    # single membership test replaces the former per-marker loop.
    if not any(acronym in affiliation for acronym in _HOSPITAL_ACRONYMS):
        return hospital

    candidates = _load_hospital_candidates()
    if candidates.empty:
        return hospital

    # keep only the reference rows whose city appears in the affiliation
    city_in_affiliation = candidates["standardized_city"].apply(is_city_in_affiliation, affiliation=affiliation)
    matching_rows = candidates[city_in_affiliation]
    if matching_rows.empty:
        return hospital

    # NOTE(review): the reference "Affiliation" is compared as stored in the
    # CSV against the lower-cased input; if the CSV is mixed-case this may
    # lower the scores — confirm whether both sides should be lower-cased.
    ratios = matching_rows["Affiliation"].apply(affiliations_match_ratio, second_affiliation=affiliation)
    return matching_rows.loc[ratios.idxmax(), "Orga NonCnrs Acorriger"]

# Stream processing: each stdin line is a JSON object whose 'value' field
# holds an affiliation; it is replaced by the matched hospital and the object
# is written back to stdout, one JSON document per line.
for raw_line in sys.stdin:
    record = json.loads(raw_line)
    record['value'] = get_corresponding_hospital_from_affiliation(record['value'])
    sys.stdout.write(json.dumps(record) + '\n')