#!/usr/bin/env python3
# flake8: noqa E501 (line length)
import json
import re
import sys
import unicodedata

import pandas as pd
from fuzzywuzzy import fuzz

import config

# Kept so importing this module has the same effect on pandas options as
# before. The lookup below no longer assigns into a filtered frame, so it
# does not rely on this setting itself.
pd.options.mode.chained_assignment = None


def is_hospital_affiliation(affiliation):
    """Return True if any entry of ``config.acronyms`` occurs in the affiliation.

    The affiliation is lowercased before the substring test; the entries of
    ``config.acronyms`` are used as they are.
    """
    affiliation_lower = affiliation.lower()
    return any(aff in affiliation_lower for aff in config.acronyms)


def convert_acronyms(city_name):
    """Abbreviate city prefixes: "Saint " becomes "St " and "Mont " becomes "Mt "."""
    stripped = city_name.replace("Saint ", "St ")
    stripped = stripped.replace("Mont ", "Mt ")
    return stripped


def remove_accents(city_name):
    """Return the text with combining diacritical marks removed.

    The text is decomposed with NFD normalization so that accents become
    separate code points, then every code point in U+0300..U+036F is dropped.
    """
    normalized_text = unicodedata.normalize("NFD", city_name)
    return re.sub("[\u0300-\u036f]", "", normalized_text)


def is_city_in_affiliation(city, affiliation):
    """Return True if ``city`` is a case-insensitive substring of ``affiliation``."""
    return city.lower() in affiliation.lower()


def affiliations_match_ratio(first_affiliation, second_affiliation):
    """Return the ``fuzz.ratio`` score of the two affiliation strings."""
    return fuzz.ratio(first_affiliation, second_affiliation)


def get_corresponding_hospital_from_affiliation(affiliation, affiliations_dataframe):
    """Return the standardized hospital that best matches an affiliation.

    The affiliation is lowercased and stripped of accents. It must contain at
    least one entry of ``config.acronyms`` and at least one city from the
    reference table; among the rows whose city occurs in the affiliation, the
    one whose "Affiliation" text has the highest ``fuzz.ratio`` score against
    the normalized affiliation wins.

    Args:
        affiliation: Raw affiliation string.
        affiliations_dataframe: Reference table with the columns
            "standardized_city", "Affiliation" and "standardized_hospital".
            It is only read, never modified.

    Returns:
        The "standardized_hospital" value of the best-matching row, or "n/a"
        when no acronym or no city is found in the affiliation.
    """
    normalized = remove_accents(affiliation.lower())

    # Which acronym matched never influences the result, only whether one did.
    if not any(acronym in normalized for acronym in config.acronyms):
        return "n/a"

    # Build the city filter as a standalone mask instead of writing a helper
    # column into the caller's shared dataframe.
    city_mask = affiliations_dataframe["standardized_city"].apply(
        is_city_in_affiliation, affiliation=normalized).astype(bool)
    candidates = affiliations_dataframe[city_mask]
    if candidates.empty:
        return "n/a"

    # Keep the scores in their own Series (aligned on the candidates' index)
    # rather than assigning a column into the filtered frame.
    ratios = candidates["Affiliation"].apply(
        affiliations_match_ratio, second_affiliation=normalized)
    return candidates.loc[ratios.idxmax(), "standardized_hospital"]


def main():
    """Replace the "value" of each JSON line on stdin with its matched hospital.

    The reference table is loaded once from a ";"-separated CSV file. Each
    stdin line is parsed as a JSON object, its "value" field is replaced with
    the result of the hospital lookup, and the object is written to stdout as
    one JSON line.
    """
    affiliations_dataframe = pd.read_csv(
        "./v1/hospital/hospital_affiliation_normalized.csv", sep=";")
    for line in sys.stdin:
        data = json.loads(line)
        affiliation = data["value"]
        data["value"] = get_corresponding_hospital_from_affiliation(
            affiliation, affiliations_dataframe)
        sys.stdout.write(json.dumps(data) + "\n")


if __name__ == "__main__":
    main()