#!/opt/bitnami/python/bin/python3.7 import sys import json import re from flair.models import SequenceTagger from flair.data import Sentence import logging logging.getLogger('flair').handlers[0].stream = sys.stderr def data_normalization(dic,sentence): cpy_sentence = sentence cpy_sentence = re.sub(r'\bArt. \w*\b', '', cpy_sentence) cpy_sentence = re.sub(r'\.\.\.*', '', cpy_sentence) for key in dic: cpy_sentence = cpy_sentence.replace(key,dic[key]) return cpy_sentence tagger = SequenceTagger.load("./v1/perseeBaipTagger/model.pt") error_dic = {} error_dic["\n"] = " " error_dic["¬ "] = "" error_dic["l'"] = "l' " error_dic["d'"] = "d' " error_dic["1° "] = "" error_dic["2° "] = "" error_dic["3° "] = "" error_dic["Art. "] = "" error_dic[", A"] = ", a" uniqueOrg = ["collège","lycée","académie","faculté","école","ecole","établissement","institut"] trans = ["nomination","affectation","concession","érection","suppression","transformation"] for line in sys.stdin: data = json.loads(line) text=data['value'] locL = [] orgL = [] basicOrg = [] operation = [] sent = data_normalization(error_dic,text) sentS = sent.split(".") sentences = [Sentence(sentS[i]+".") for i in range(len(sentS))] tagger.predict(sentences) for word in sent.lower().split(" "): for transfo in trans: if word.startswith(transfo): if transfo not in operation: operation.append(transfo) break for sentence in sentences: for entity in sentence.get_spans('ner'): if (entity.labels[0].value == "LOC"): if entity.text not in locL: locL.append(entity.text) if entity.labels[0].value == "ORG": org = entity.text.split(" ") if len(org[-1]) > 2: for borg in uniqueOrg: if entity.text.lower().startswith(borg): basicOrg.append(borg) if entity.text not in orgL: orgL.append(entity.text) if len(org)>1: for k in ["à","de","l'","d'","'","du","la"]: if org[-2] == k: if org[-1] not in locL: locL.append(org[-1]) break returnDic = {"loc":locL,"org":orgL,"basicOrg":basicOrg,"operation":operation} data['value'] = returnDic sys.stdout.write(json.dumps(data)) sys.stdout.write('\n')