"""Classify affiliations read from stdin as public or private organisations.

Each stdin line is a JSON object with a "value" key holding an affiliation
string. The value is replaced by a string of the form
"organisme: <name>, statut: <status>" and the object is written to stdout.
"""

import json
import pickle
import re
import sys
import urllib.parse

import requests
from backoff import expo, on_exception
from ratelimit import RateLimitException, limits

# Words that decide the status directly, without querying the API.
_PRIVATE_KEYWORDS = frozenset(["sas", "sarl", "sa", "private", "edf", "orange"])
_PUBLIC_KEYWORDS = frozenset(["univ", "hop", "uar", "umr", "cea", "cnrs"])

# Captures the two leading digits of a postal code written as "f-NNNNN".
_DEPT_PATTERN = re.compile(r"f-(\d{2})\d{3}", re.IGNORECASE)

# Seconds to wait for an HTTP response; without it requests can block forever.
_REQUEST_TIMEOUT = 30


# NOTE: the name shadows the builtin `filter`; kept because it is the
# function's public name.
def filter(affiliation):
    """Keyword filter that keeps only the essential information.

    The affiliation is lower-cased, its commas are removed and it is split on
    single spaces. The first word found in one of the keyword sets decides
    the result.

    Returns:
        "private" or "public" when a keyword matches, otherwise the
        lower-cased, comma-free affiliation string.
    """
    adress = affiliation.lower().replace(",", "")
    for word in adress.split(" "):
        if word in _PRIVATE_KEYWORDS:
            return "private"
        if word in _PUBLIC_KEYWORDS:
            return "public"
    return adress


def request_abreviation(url):
    """Fetch the JSON document at `url` and return it decoded.

    Raises:
        requests.HTTPError: if the server answers with an error status, so a
            JSON error body is never mistaken for the mapping table.
        requests.Timeout: if no response arrives within the timeout.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def expandAbbreviations(affiliation, dict):
    """Normalise abbreviations found in `affiliation`.

    Args:
        affiliation: the affiliation string; it is lower-cased first.
        dict: mapping from an abbreviation (without commas or periods) to its
            replacement. The parameter name shadows the builtin and is kept
            only for backward compatibility.

    Returns:
        The lower-cased affiliation with every known abbreviation replaced.
        A comma attached to a replaced word is kept as a trailing comma;
        periods attached to a replaced word are dropped.
    """
    mapping = dict
    expanded = []
    for word in affiliation.lower().split(" "):
        short_word = word.replace(",", "").replace(".", "")
        if short_word in mapping:
            suffix = "," if "," in word else ""
            expanded.append(mapping[short_word] + suffix)
        else:
            expanded.append(word)
    return " ".join(expanded)


def name_enterprise(affiliation):
    """Return the lower-cased text located before the first comma."""
    return affiliation.lower().split(",")[0]


def num_dept(affiliation):
    """Find the department number in the affiliation.

    Returns:
        The first two digits of the first "f-NNNNN" postal code found
        (case-insensitive), or None when there is none.
    """
    match = _DEPT_PATTERN.search(affiliation)
    if match is None:
        return None
    return match.group(1)


# NOTE(review): max_time=1 gives backoff at most one second of retries before
# RateLimitException propagates to the caller - confirm this is intended.
@on_exception(expo, RateLimitException, max_time=1)
@limits(calls=7, period=1)
def request(name, dept):
    """Query the recherche-entreprises API for `name`.

    Args:
        name: search text; it is URL-quoted before being sent.
        dept: department number appended as the "departement" parameter, or
            a falsy value to search without it.

    Returns:
        The decoded JSON body of the response. The HTTP status is not
        checked, so an error body is returned as-is.
    """
    url = (
        "https://recherche-entreprises.api.gouv.fr/search?q="
        + urllib.parse.quote(name)
    )
    if dept:
        url += "&departement=" + dept
    response = requests.get(
        url,
        headers={'Accept': 'application/json'},
        timeout=_REQUEST_TIMEOUT,
    )
    return response.json()


def is_private_public(information):
    """Interpret the API response.

    Returns:
        "Informations manquantes" when the response is empty or has no
        results, "public" when at least one result has
        complements["est_service_public"] equal to True, else "private".
    """
    if (
        not information
        or 'results' not in information
        or not information['results']
    ):
        return "Informations manquantes"

    # Walk every "results" entry and extract its "est_service_public" value.
    # "complements" may be missing or null, hence the `or {}` fallback.
    flags = [
        (result.get('complements') or {}).get('est_service_public')
        for result in information['results']
    ]
    if True in flags:
        return "public"
    return "private"


def publicOrPrivate(affiliation, my_dict):
    """Return "organisme: <name>, statut: <status>" for one affiliation.

    The keyword filter is tried first; the API is queried only when no
    keyword decides the status.
    """
    status = filter(affiliation)
    if status in ["private", "public"]:
        enterprise = name_enterprise(affiliation)
        return "organisme: " + enterprise + ", statut: " + status

    expanded_affiliation = expandAbbreviations(affiliation, my_dict)
    name = name_enterprise(expanded_affiliation)
    dept = num_dept(expanded_affiliation)
    information = request(name, dept)
    nature = is_private_public(information)
    return "organisme: " + name + ", statut: " + nature


def main():
    """Read JSON lines from stdin, classify each "value", write to stdout."""
    my_dict = request_abreviation(
        "http://mapping-tables-1.daf.intra.inist.fr/affiliations-tools-corporate.json"
    )
    for line in sys.stdin:
        # Blank lines (e.g. a trailing newline) are not JSON documents.
        if not line.strip():
            continue
        data = json.loads(line)
        texte = data["value"]
        data["value"] = publicOrPrivate(texte, my_dict)
        sys.stdout.write(json.dumps(data))
        sys.stdout.write("\n")


if __name__ == "__main__":
    main()