diff --git a/hospital-affiliations/.gitignore b/hospital-affiliations/.gitignore index ad72c46..b144ce9 100644 --- a/hospital-affiliations/.gitignore +++ b/hospital-affiliations/.gitignore @@ -1,2 +1,5 @@ # virtual environment -env/ \ No newline at end of file +env/ + +# cache python +__pycache__ diff --git a/hospital-affiliations/aff_hosp.py b/hospital-affiliations/aff_hosp.py index 1d5ed2e..c45f3e6 100755 --- a/hospital-affiliations/aff_hosp.py +++ b/hospital-affiliations/aff_hosp.py @@ -1,17 +1,16 @@ -from numpy import column_stack import pandas as pd -from nltk.tokenize import word_tokenize from fuzzywuzzy import fuzz import sys import json import unicodedata import re +import config pd.options.mode.chained_assignment = None # Check if an element of the list is in affiliation -def is_hospital_affiliation(affiliation, list) : +def is_hospital_affiliation(affiliation) : affiliation_lower = affiliation.lower() - for aff in list : + for aff in config.acronyms : if aff in affiliation_lower : return True return False @@ -43,13 +42,8 @@ def get_corresponding_hospital_from_affiliation(affiliation): affiliations_dataframe = pd.read_csv("hospital_affiliations.csv", sep=";") - acronyms = ["chr","chr ","chu ","chu,","chru","ap hp","ap hm","aphp","aphm","hosp","hop ","hop,"," serv","clcc","ctr lutte canc","antoine lacassagne","eugene marquis","baclesse", - "georges françois leclerc","gf leclerc","jf leclerc","henri becquerel","h becquerel","jean perrin","leon berard","ctr oscar lambret", - "oscar lambret comprehens","oscar lambret canc","clcc oscar lambret","ctr lutte canc oscar lambret","gustave roussy","bergonie", - "inst curie","curie inst","rene gauducheau","inst cancerol ouest","inst cancerol lorraine","alexis vautrin","inst reg canc montpellier", - "icans","paul strauss","paul str","paoli calmettes","inst j paoli i calmettes","claudius regaud","claudius rigaud","inst univ canc toulouse oncopole", - "inst univ cancerol oncopole","iuct oncopole","ctr lutte contre canc","ico canc","icm, montpellier canc inst","univ inst canc toulouse oncopole","rothschild"] - + acronyms = config.acronyms + hospital = "N.C" affiliation = affiliation.lower() for acronym in acronyms: @@ -57,7 +51,7 @@ continue # standarize original dataframe - affiliations_dataframe["contains_acronyms"] = affiliations_dataframe["Affiliation"].apply(is_hospital_affiliation,list=acronyms) + affiliations_dataframe["contains_acronyms"] = affiliations_dataframe["Affiliation"].apply(is_hospital_affiliation) affiliations_dataframe["standardized_city"] = affiliations_dataframe["Ville_canonique_Dpt"].apply(remove_department_numbers_and_convert_acronyms).apply(remove_accents) if len(affiliations_dataframe[affiliations_dataframe["contains_acronyms"] == True]) == 0: diff --git a/hospital-affiliations/config.py b/hospital-affiliations/config.py new file mode 100644 index 0000000..e9f04e6 --- /dev/null +++ b/hospital-affiliations/config.py @@ -0,0 +1,6 @@ +acronyms = ["chr","chr ","chu ","chu,","chru","ap hp","ap hm","aphp","aphm","hosp","hop ","hop,"," serv","clcc","ctr lutte canc","antoine lacassagne","eugene marquis","baclesse", + "georges françois leclerc","gf leclerc","jf leclerc","henri becquerel","h becquerel","jean perrin","leon berard","ctr oscar lambret", + "oscar lambret comprehens","oscar lambret canc","clcc oscar lambret","ctr lutte canc oscar lambret","gustave roussy","bergonie", + "inst curie","curie inst","rene gauducheau","inst cancerol ouest","inst cancerol lorraine","alexis vautrin","inst reg canc montpellier", + "icans","paul strauss","paul str","paoli calmettes","inst j paoli i calmettes","claudius regaud","claudius rigaud","inst univ canc toulouse oncopole", + "inst univ cancerol oncopole","iuct oncopole","ctr lutte contre canc","ico canc","icm, montpellier canc inst","univ inst canc toulouse oncopole","rothschild"]