#!/usr/bin/env python3
from nlptools import *
from nlptools.tools import *
from nlptools.resources import *
import os
import time
import logging
import sys
import json
import plac

# Example:
#   sed -e '1d; $d' ../data/data2.json | sed 's/\,$/ /g' \
#       | python3 analyze.py stemmer -o doc -ini conf_EZ.ini -log analyze.log


# Command-line entry point, invoked through plac.call() at the bottom of the
# file.  Reads one JSON document per line on standard input, replaces the
# document's "value" entry with the result of running it through the selected
# analyzer, and writes the document back as one JSON line on standard output.
#
# Parameters:
#   init_file -- initialisation file handed to the analyzer (-ini option).
#   log       -- file receiving the log records (-log option); passed as
#                ``filename`` to logging.basicConfig.
#   analyzer  -- name of the analyzer to run (positional argument).
#   output    -- result format handed to the analyzer (-o option).
#
# Exits with status 1 as soon as an input line is not valid JSON.
#
# NOTE: documented with comments rather than a docstring on purpose: plac uses
# the function docstring as the description printed by --help, so adding one
# would change the program's user-facing output.
@plac.annotations(
    analyzer=(
        "Name oh the analyzer [default stemmer porter]",
        "positional",
        None,
        str,
        [
            "stemmer",
            "termMatcher",
            "ner",
            "NPchunker",
            "POStagger",
            "gazetteer",
            "NPchunkerDP",
        ],
    ),
    output=("Format result ", "option", "o", str, ["doc", "json"]),
    init_file=("initialisation file [default config.ini]", "option", "ini", str),
    log=("log file", "option", "log", str),
)
def main(init_file, log, analyzer="stemmer", output="doc"):
    # Configure logging before anything else: basicConfig() does nothing once
    # the root logger already has a handler, so it must run before the
    # pipeline is built in case construction emits log records.
    logging.basicConfig(filename=log, level=logging.DEBUG)

    # doc = processed corpus written to standard output
    pipe = full_run(analyzer, init_file, output)

    t1 = time.time()

    # Processing loop: the "value" entry of each document is analysed and the
    # result is stored back under the same key.
    for line_number, json_line in enumerate(sys.stdin, start=1):
        try:
            data = json.loads(json_line)
        except json.JSONDecodeError:
            logging.error(
                "Input format problem line :%s : String could not be converted to JSON",
                line_number,
            )
            # sys.exit rather than the bare exit() helper, which is injected
            # by the site module and is not guaranteed to exist.
            sys.exit(1)

        data["value"] = pipe.pipe_analyse(data["value"])
        sys.stdout.write(json.dumps(data) + "\n")

    # Execution time of the processing loop (pipeline construction excluded).
    logging.info("TRACE::Executing times %.3f ", time.time() - t1)


class full_run:
    """Runner chaining the configured parsers over a piece of input."""

    def __init__(self, analyzer, init_file, output):
        """Build the parser chain.

        Args:
            analyzer: analyzer name, forwarded to ``exec_spacy_pipe``.
            init_file: initialisation file, forwarded to ``exec_spacy_pipe``.
            output: output format, forwarded to ``exec_spacy_pipe``.
        """
        # os.path.dirname("doc") is the empty string, so this resolves to the
        # real path of the current working directory.
        self.location = os.path.realpath(
            os.path.join(os.getcwd(), os.path.dirname("doc"))
        )
        # Single-stage chain; each entry is called with the previous result.
        self.parsers = [
            exec_spacy_pipe(analyzer, init_file, output),
        ]

    def pipe_analyse(self, text):
        """Feed ``text`` through every parser in order and return the result."""
        for parser in self.parsers:
            text = parser(text)
        return text


if __name__ == "__main__":
    # Change the condition to True to run under cProfile and print the stats.
    if False:
        import cProfile
        import pstats

        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats()
    else:
        plac.call(main)