diff --git a/geo-tagger/v1/perseeBaipTagger/model.pt b/geo-tagger/v1/perseeBaipTagger/model.pt
new file mode 100644
index 0000000..10cdfbf
--- /dev/null
+++ b/geo-tagger/v1/perseeBaipTagger/model.pt
Binary files differ
diff --git a/geo-tagger/v1/perseeBaipTagger/tagger.ini b/geo-tagger/v1/perseeBaipTagger/tagger.ini
new file mode 100644
index 0000000..e61adc0
--- /dev/null
+++ b/geo-tagger/v1/perseeBaipTagger/tagger.ini
@@ -0,0 +1,37 @@
+# OpenAPI Documentation - JSON format (dot notation)
+mimeType = application/json
+
+post.description = Détecte les organismes scolaires et localisations dans les baip
+post.responses.default.description = Renvoie un Json composé d'`id`, `value` avec `value` un dictionnaire contenant les organismes scolaires et localisations extraites.
+post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream
+post.summary = Détection dans les baip
+post.requestBody.required = true
+post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream
+post.parameters.0.in = query
+post.parameters.0.name = path
+post.parameters.0.schema.type = string
+post.parameters.0.description = The path in each object to enrich with an Python script
+post.parameters.1.in = query
+post.parameters.1.name = indent
+post.parameters.1.schema.type = boolean
+post.parameters.1.description = Indent or not the JSON Result
+
+[use]
+plugin = @ezs/spawn
+plugin = @ezs/basics
+plugin = @ezs/storage
+plugin = @ezs/analytics
+
+[JSONParse]
+separator = *
+
+[expand]
+path = env('path', 'value')
+size = 100
+
+[expand/exec]
+# command should be executable !
+command = ./v1/perseeBaipTagger/tagger.py
+
+[dump]
+indent = env('indent', false)
\ No newline at end of file
diff --git a/geo-tagger/v1/perseeBaipTagger/tagger.py b/geo-tagger/v1/perseeBaipTagger/tagger.py
new file mode 100755
index 0000000..7b98620
--- /dev/null
+++ b/geo-tagger/v1/perseeBaipTagger/tagger.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""Detect school organisations and locations in JSON records read on stdin.
+
+Each input line is a JSON object. Its ``value`` field is normalised, tagged
+with the flair model stored next to this script, and replaced by a dictionary
+with the keys ``loc``, ``org``, ``basicOrg`` and ``operation``. One JSON
+object is written to stdout per input line.
+"""
+import sys
+import json
+import os
+import re
+
+from flair.models import SequenceTagger
+from flair.data import Sentence
+
+import logging
+
+
+def data_normalization(dic, sentence):
+    """Return ``sentence`` after the regex clean-ups and ``dic`` replacements.
+
+    ``dic`` maps each substring to replace to its replacement text. The
+    replacements run in the dictionary's iteration order, after the two
+    regular-expression clean-ups.
+    """
+    # Remove "Art" + any one character + a space + the word characters that
+    # follow, then every run of two or more dots.
+    cpy_sentence = re.sub(r'\bArt. \w*\b', '', sentence)
+    cpy_sentence = re.sub(r'\.\.\.*', '', cpy_sentence)
+
+    for key in dic:
+        cpy_sentence = cpy_sentence.replace(key, dic[key])
+
+    return cpy_sentence
+
+
+# Point flair's first log handler at stderr; stdout carries the JSON results.
+logging.getLogger('flair').handlers[0].stream = sys.stderr
+
+# tagger.ini launches this script through the relative path
+# ./v1/perseeBaipTagger/tagger.py, so the working directory is presumably not
+# this folder: resolve model.pt next to the script instead of relative to the
+# current directory.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+tagger = SequenceTagger.load(os.path.join(SCRIPT_DIR, "model.pt"))
+
+# Substrings rewritten by data_normalization, applied in insertion order.
+error_dic = {
+    "\n": " ",
+    "¬ ": "",
+    "l'": "l' ",
+    "d'": "d' ",
+    "1° ": "",
+    "2° ": "",
+    "3° ": "",
+    # NOTE(review): this key straddled a line break in the reviewed patch;
+    # confirm that no character is missing after "Art. ".
+    "Art. ": "",
+    ", A": ", a",
+}
+
+# Keywords copied to ``basicOrg`` when an ORG entity starts with one of them.
+uniqueOrg = ["collège", "lycée", "académie", "faculté", "école", "établissement", "institut"]
+# Keywords copied to ``operation`` when a word of the text starts with one.
+trans = ["nomination", "affectation", "concession", "érection", "suppression"]
+
+for line in sys.stdin:
+    data = json.loads(line)
+    text = data['value']
+
+    locL = []
+    orgL = []
+    basicOrg = []
+    operation = []
+
+    sent = data_normalization(error_dic, text)
+
+    sentence = Sentence(sent)
+    tagger.predict(sentence)
+
+    # Each operation keyword is reported once, in order of first appearance.
+    for word in sent.lower().split(" "):
+        for transfo in trans:
+            if word.startswith(transfo):
+                if transfo not in operation:
+                    operation.append(transfo)
+                break
+
+    # NOTE(review): the nesting of the ORG branch was rebuilt from a patch
+    # whose indentation had been collapsed; confirm it against the author's
+    # copy before merging.
+    for entity in sentence.get_spans('ner'):
+        label = entity.labels[0].value
+        if label == "LOC":
+            if entity.text not in locL:
+                locL.append(entity.text)
+        if label == "ORG":
+            org = entity.text.split(" ")
+            # Entities whose last token has two characters or fewer are skipped.
+            if len(org[-1]) > 2:
+                for borg in uniqueOrg:
+                    if entity.text.lower().startswith(borg):
+                        basicOrg.append(borg)
+                if entity.text not in orgL:
+                    orgL.append(entity.text)
+                if len(org) > 1:
+                    # A last token introduced by one of these words is also
+                    # reported as a location.
+                    for k in ["à", "de", "l'", "d'", "'", "du", "la"]:
+                        if org[-2] == k:
+                            if org[-1] not in locL:
+                                locL.append(org[-1])
+                            break
+
+    returnDic = {"loc": locL, "org": orgL, "basicOrg": basicOrg, "operation": operation}
+    data['value'] = returnDic
+    sys.stdout.write(json.dumps(data))
+    sys.stdout.write('\n')