#!/usr/bin/env python3
# flake8: noqa E501 (line length)
import pandas as pd
from fuzzywuzzy import fuzz
import sys
import json
import unicodedata
import re
import config
# Disable pandas' chained-assignment (SettingWithCopy) warning process-wide.
pd.options.mode.chained_assignment = None
def is_hospital_affiliation(affiliation):
    """Tell whether the affiliation contains any entry of ``config.acronyms``.

    The affiliation is lowercased before the substring test; the entries of
    ``config.acronyms`` are used as they are.
    """
    lowered = affiliation.lower()
    return any(entry in lowered for entry in config.acronyms)
def convert_acronyms(city_name):
    """Abbreviate the city prefixes "Saint " and "Mont " to "St " and "Mt "."""
    # Applied in this order to every occurrence, not only a leading one.
    substitutions = (("Saint ", "St "), ("Mont ", "Mt "))
    result = city_name
    for long_form, short_form in substitutions:
        result = result.replace(long_form, short_form)
    return result
def remove_accents(city_name):
    """Return ``city_name`` with its accents removed.

    The text is decomposed with Unicode NFD normalization, then every
    character in the range U+0300 to U+036F is deleted.
    """
    combining_marks = "[\u0300-\u036f]"
    decomposed = unicodedata.normalize("NFD", city_name)
    return re.sub(combining_marks, "", decomposed)
def is_city_in_affiliation(city, affiliation):
    """Case-insensitive check that ``city`` occurs as a substring of ``affiliation``."""
    needle = city.lower()
    haystack = affiliation.lower()
    return needle in haystack
def affiliations_match_ratio(first_affiliation, second_affiliation):
    """Return the ``fuzz.ratio`` similarity score of the two affiliation strings."""
    score = fuzz.ratio(first_affiliation, second_affiliation)
    return score
def get_corresponding_hospital_from_affiliation(affiliation, affiliations_dataframe):
    """Return the standardized hospital that best matches an affiliation string.

    The affiliation is lowercased and stripped of accents. If it contains none
    of the entries of ``config.acronyms``, ``"n/a"`` is returned. Otherwise the
    reference rows whose ``standardized_city`` occurs in the affiliation are
    kept, each kept row's ``Affiliation`` is scored against the normalized
    affiliation with ``affiliations_match_ratio``, and the
    ``standardized_hospital`` of the highest-scoring row is returned.

    Args:
        affiliation: Raw affiliation text.
        affiliations_dataframe: Reference table providing the columns
            ``standardized_city``, ``Affiliation`` and
            ``standardized_hospital``. It is only read, never modified.

    Returns:
        The ``standardized_hospital`` value of the best-scoring row, or
        ``"n/a"`` when no acronym or no city matches.
    """
    normalized = remove_accents(affiliation.lower())

    # The city filter and the scoring below do not depend on which acronym
    # matched, so a single membership test replaces the per-acronym loop.
    if not any(acronym in normalized for acronym in config.acronyms):
        return "n/a"

    # Keep the mask local instead of storing it as a column, so the caller's
    # dataframe is left untouched between calls.
    city_mask = affiliations_dataframe["standardized_city"].apply(
        is_city_in_affiliation, affiliation=normalized).astype(bool)
    candidates = affiliations_dataframe[city_mask]
    if candidates.empty:
        return "n/a"

    # Score in a separate Series rather than assigning onto a filtered slice.
    ratios = candidates["Affiliation"].apply(
        affiliations_match_ratio, second_affiliation=normalized)
    return candidates.loc[ratios.idxmax(), "standardized_hospital"]
def main():
    """Replace the "value" field of each JSON line on stdin with its hospital.

    The reference table is read once from
    ``./v1/hospital/hospital_affiliation_normalized.csv`` (semicolon
    separated). Every stdin line is parsed as a JSON object, its ``"value"``
    is replaced by the result of
    ``get_corresponding_hospital_from_affiliation``, and the object is written
    to stdout as one JSON line.
    """
    reference_table = pd.read_csv(
        "./v1/hospital/hospital_affiliation_normalized.csv", sep=";")
    for raw_line in sys.stdin:
        record = json.loads(raw_line)
        record["value"] = get_corresponding_hospital_from_affiliation(
            record["value"], reference_table)
        sys.stdout.write(json.dumps(record) + "\n")
# Run the stdin-to-stdout processing only when executed as a script.
if __name__ == "__main__":
    main()