# -*- coding: utf-8 -*-
import sys
import json
from flair.models import SequenceTagger
from flair.data import Sentence
from unidecode import unidecode
import logging
def redirect_log_stream(logger_name, stream):
    """Point every console handler of the named logger at *stream*.

    Handlers that write to files are left alone. A logger without any
    handler is simply left unchanged instead of raising ``IndexError``.

    Args:
        logger_name: Name passed to ``logging.getLogger``.
        stream: Writable text stream that should receive the log records.
    """
    for handler in logging.getLogger(logger_name).handlers:
        is_console = isinstance(handler, logging.StreamHandler) and not isinstance(
            handler, logging.FileHandler
        )
        if is_console:
            # setStream flushes the previous stream before swapping it,
            # under the handler's own lock (available since Python 3.7).
            handler.setStream(stream)


# This script writes its JSON results to sys.stdout (see the end of the
# processing loop), so flair's log output is sent to stderr to keep that
# channel clean.
redirect_log_stream("flair", sys.stderr)
def data_normalization(sentence):
    """Return a lower-cased copy of *sentence*.

    Args:
        sentence: Text to normalise (a ``str``).

    Returns:
        The same text with every cased character converted to lower case.
    """
    return sentence.lower()
# NER label looked up on each predicted span -> title used as key in the
# output object. The insertion order below is the order of the keys in the
# output. The titles are runtime strings that end up (transliterated to ASCII)
# in the emitted JSON, so their spelling is kept exactly as it was.
CATEGORY_TITLES = {
    "PL": "Planète",
    "TNQ": "Trou noirs, quasars et apparentés",
    "SNAT": "Satellite naturel",
    "OA": "Objets artificiels",
    "SSO": "Système solaire",
    "EB": "Étoiles binaires (et pulsars)",
    "ET": "Étoiles",
    "NRA": "Nébuleuse et région apparentés",
    "CST": "Constellations",
    "GAL": "Galaxies et amas de galaxie",
    "AST": "Astèroïdes",
    "ST": "Satue hypotétique",
    "AS": "amas stellaires",
    "SN": "supernovas",
    "XPL": "exoplanètes",
    "SR": "sursaut radio, source radio, autres sursauts",
}


def split_sentences(text):
    """Cut *text* at every period and return the non-blank pieces.

    Each returned piece gets its period back. Pieces that are empty or only
    whitespace (for instance what follows the final period) are dropped so
    that no sentence consisting of a lone "." is handed to the tagger.

    Args:
        text: The text to split.

    Returns:
        A list of strings, each ending with ".".
    """
    return [piece + "." for piece in text.split(".") if piece.strip()]


def group_entities(labelled_texts):
    """Group entity texts by label, keeping first-seen order without duplicates.

    Args:
        labelled_texts: Iterable of ``(label, text)`` pairs.

    Returns:
        A dict with one list per label of ``CATEGORY_TITLES`` (possibly
        empty). Pairs whose label is not in ``CATEGORY_TITLES`` are ignored
        rather than raising ``KeyError``.
    """
    grouped = {label: [] for label in CATEGORY_TITLES}
    for label, text in labelled_texts:
        bucket = grouped.get(label)
        if bucket is None:
            # Label without a configured category title: nothing to report.
            continue
        if text not in bucket:
            bucket.append(text)
    return grouped


def main():
    """Tag each JSON line read from stdin and write the result to stdout.

    Every non-blank input line must be a JSON object with a "value" key
    holding text. The text is lower-cased, split into sentences at periods
    and run through the tagger; "value" is then replaced by an object mapping
    each category title (passed through ``unidecode``) to the distinct entity
    texts found for it. Categories without any entity are omitted. One JSON
    line is written per processed input line.
    """
    tagger = SequenceTagger.load("v1/model.pt")

    for line in sys.stdin:
        if not line.strip():
            # A blank line carries no record; skip it instead of failing
            # inside json.loads.
            continue

        data = json.loads(line)
        normalized = data_normalization(data["value"])
        sentences = [Sentence(chunk) for chunk in split_sentences(normalized)]
        if sentences:
            tagger.predict(sentences)

        grouped = group_entities(
            (entity.labels[0].value, entity.text)
            for sentence in sentences
            for entity in sentence.get_spans("ner")
        )

        # unidecode is applied to every title; the originally unwrapped titles
        # are plain ASCII, so they come out unchanged.
        data["value"] = {
            unidecode(title): grouped[label]
            for label, title in CATEGORY_TITLES.items()
            if grouped[label]
        }
        sys.stdout.write(json.dumps(data))
        sys.stdout.write("\n")


if __name__ == "__main__":
    main()