Newer
Older
web-services / mapping-tools / v1 / hospital / affiliations.py
#!/usr/bin/env python3

# flake8: noqa E501 (line length)

import pandas as pd
from fuzzywuzzy import fuzz
import sys
import json
import unicodedata
import re
import config
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# process; this is a global pandas option, not scoped to this module.
pd.options.mode.chained_assignment = None


def is_hospital_affiliation(affiliation):
    """Tell whether the affiliation contains one of the configured acronyms.

    The affiliation is lower-cased, then every entry of ``config.acronyms``
    is searched for as a substring (the entries are used as-is).
    """
    lowered = affiliation.lower()
    return any(acronym in lowered for acronym in config.acronyms)


def convert_acronyms(city_name):
    """Shorten city names: every "Saint " becomes "St " and every "Mont " becomes "Mt ".

    The replacement is case-sensitive and requires the trailing space, so
    names such as "Saint-Denis" or "Montpellier" are returned unchanged.
    """
    abbreviations = (("Saint ", "St "), ("Mont ", "Mt "))
    for long_form, short_form in abbreviations:
        city_name = city_name.replace(long_form, short_form)
    return city_name


def remove_accents(city_name):
    """Strip accents from a city name.

    The text is decomposed with Unicode NFD normalization, after which every
    character in the combining-diacritics range U+0300..U+036F is dropped.
    The result stays in decomposed form.
    """
    decomposed = unicodedata.normalize("NFD", city_name)
    kept = [char for char in decomposed if not "\u0300" <= char <= "\u036f"]
    return "".join(kept)


def is_city_in_affiliation(city, affiliation):
    """Return True when ``city`` occurs in ``affiliation``, ignoring case."""
    needle = city.lower()
    haystack = affiliation.lower()
    return needle in haystack


def affiliations_match_ratio(first_affiliation, second_affiliation):
    """Score how similar two affiliation strings are.

    Thin wrapper returning whatever ``fuzz.ratio`` (fuzzywuzzy) yields for the
    two strings, in the order given.
    """
    similarity = fuzz.ratio(first_affiliation, second_affiliation)
    return similarity


def get_corresponding_hospital_from_affiliation(affiliation, affiliations_dataframe):
    """Return the standardized hospital that best matches a free-text affiliation.

    The affiliation is lower-cased and stripped of accents. If it contains
    none of ``config.acronyms`` the result is ``"n/a"``. Otherwise the
    reference rows whose ``standardized_city`` occurs in the affiliation are
    kept, each kept row's ``Affiliation`` text is scored against the
    affiliation with ``affiliations_match_ratio``, and the
    ``standardized_hospital`` of the highest-scoring row is returned (the
    first such row on ties).

    Args:
        affiliation: Raw affiliation text.
        affiliations_dataframe: Reference table providing the columns
            ``standardized_city``, ``Affiliation`` and
            ``standardized_hospital``. It is only read; no helper column is
            written back to it.

    Returns:
        The ``standardized_hospital`` value of the best match, or ``"n/a"``
        when the affiliation holds no known acronym or no known city.
    """
    no_match = "n/a"
    affiliation = remove_accents(affiliation.lower())

    # The city filter and the scoring below do not depend on which acronym
    # matched, so one membership test replaces the per-acronym loop and the
    # dataframe work runs at most once per call.
    if not any(acronym in affiliation for acronym in config.acronyms):
        return no_match

    # Build a boolean mask instead of storing a helper column, so the caller's
    # dataframe is left untouched.
    city_mask = affiliations_dataframe["standardized_city"].apply(
        is_city_in_affiliation, affiliation=affiliation).astype(bool)
    candidates = affiliations_dataframe[city_mask]
    if len(candidates) == 0:
        return no_match

    # Keep the scores in their own Series (aligned on the candidates' index)
    # rather than assigning a column onto a filtered slice.
    ratios = candidates["Affiliation"].apply(
        affiliations_match_ratio, second_affiliation=affiliation)
    return candidates.loc[ratios.idxmax(), "standardized_hospital"]


def main():
    """Replace the ``value`` of each JSON line from stdin with its hospital.

    Loads the reference table from ``hospital_affiliation_normalize.csv``
    (semicolon-separated, resolved against the current working directory),
    then for every line read from stdin: parses it as JSON, replaces its
    ``"value"`` entry with the result of
    ``get_corresponding_hospital_from_affiliation`` and writes the object
    back to stdout as one JSON line.
    """
    reference_table = pd.read_csv(
        "hospital_affiliation_normalize.csv", sep=";")
    for raw_line in sys.stdin:
        record = json.loads(raw_line)
        record["value"] = get_corresponding_hospital_from_affiliation(
            record["value"], reference_table)
        sys.stdout.write(json.dumps(record) + "\n")


# Run the stdin-to-stdout processing only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()