#!/usr/bin/env python3
from nlptools import *
from nlptools.tools import *
from nlptools.resources import *
import os
import time
import logging
import sys
import json
import plac
# Example usage (one command, wrapped over two comment lines):
# sed -e '1d; $d' ../data/data2.json | sed 's/\,$/ /g' | python3 analyze.py stemmer -o doc -ini conf_EZ.ini -log \
#     analyze.log
@plac.annotations(
    analyzer=(
        "Name oh the analyzer [default stemmer porter]",
        "positional",
        None,
        str,
        ["stemmer", "termMatcher", "ner", "NPchunker", "POStagger", "gazetteer","NPchunkerDP"],
    ),
    output=("Format result ", "option", "o", str, ["doc", "json"]),
    init_file=("initialisation file [default config.ini]", "option", "ini", str),
    log=("log file", "option", "log", str)
)
def main(init_file, log, analyzer="stemmer", output="doc"):
    """Run the selected analyzer over JSON documents read from standard input.

    Every input line is parsed as JSON. Its "value" entry is passed through
    the pipeline and replaced by the result, then the document is written to
    standard output as a single JSON line.

    Args:
        init_file: initialisation file, forwarded to ``full_run``.
        log: log file name, handed to ``logging.basicConfig``.
        analyzer: analyzer name, forwarded to ``full_run``.
        output: result format, forwarded to ``full_run``.

    Raises:
        SystemExit: with status 1 on the first line that is not valid JSON.
    """
    # Configure logging before the pipeline is built: basicConfig does nothing
    # once the root logger has handlers, so anything that logs through the
    # root logger during construction would otherwise bypass the log file.
    logging.basicConfig(filename=log, level=logging.DEBUG)

    pipe = full_run(analyzer, init_file, output)

    # Only the document loop is timed, not the pipeline construction.
    started = time.perf_counter()

    # Process the "value" field of each document and put the result back.
    for line_number, json_line in enumerate(sys.stdin, start=1):
        try:
            data = json.loads(json_line)
        except json.JSONDecodeError:
            logging.error(
                "Input format problem line :%s : String could not be converted to JSON",
                line_number,
            )
            # sys.exit rather than the site-provided exit(), which is not
            # guaranteed to exist (e.g. under ``python -S``).
            sys.exit(1)
        data["value"] = pipe.pipe_analyse(data["value"])
        sys.stdout.write(json.dumps(data))
        sys.stdout.write("\n")

    # Report the processing time.
    logging.info("TRACE::Executing times %.3f ", time.perf_counter() - started)
# Runner class: holds the parser callables and applies them to each text.
class full_run:
    """Runner that owns a list of parser callables and applies them in order."""

    def __init__(self, analyzer, init_file, output):
        """Record the working location and build the parser list.

        The three arguments are forwarded unchanged to ``exec_spacy_pipe``,
        which is not defined in this file (presumably supplied by the
        ``nlptools`` star imports -- TODO confirm).
        """
        # os.path.dirname("doc") is the empty string, so this resolves to the
        # current working directory.
        working_dir = os.path.join(os.getcwd(), os.path.dirname("doc"))
        self.location = os.path.realpath(working_dir)

        spacy_stage = exec_spacy_pipe(analyzer, init_file, output)
        self.parsers = [spacy_stage]

    def pipe_analyse(self, text):
        """Feed ``text`` through every parser in turn and return the final result."""
        result = text
        for stage in self.parsers:
            result = stage(result)
        return result
if __name__ == "__main__":
    # Hard-coded switch: the regular plac entry point always runs. The
    # profiling branch is kept for manual use and is never reached as written.
    if True:
        plac.call(main)
    else:
        import cProfile
        import pstats

        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
        report = pstats.Stats("Profile.prof")
        report.strip_dirs()
        report.sort_stats("time")
        report.print_stats()