web-services/nlp-tools/analyze.py at bf7280eff34fdc7e369f6b6574ebca86a9d8c7d8

Fork: 0
tdm / web-services
Find file
Newer
Older
web-services / nlp-tools / analyze.py
Nicolas Thouvenin on 10 Dec 2021 2 KB paths hamronization
Raw Blame History
#!/usr/bin/env python3
from nlptools import *
from nlptools.tools import *
from nlptools.resources import *
from nlptools.run import full_run
import os
import time
import logging
import sys
import json
import plac

# Example:
# sed -e '1d; $d' ../data/data.json | sed  's/\,$/ /g' |  python3 analyze.py stemmer -o doc -lang fr -log analyze.log

@plac.annotations(
    analyzer=(
        "Name oh the NLPpipe ",
        "positional",
        None,
        str,
        ["stemmer", "termMatcher", "ner", "NPchunker", "POStagger", "gazetteer", "NPchunkerDP", "lefff_tagger"]
    ),
    language=("language", "option", "lang", str, ["fr", "en"]),
    output=("Format result ", "option", "o", str, ["doc", "json"]),
    param=("initialisation param in json", "option", "param", str),
    init_file=("initialisation file [default config.ini]", "option", "ini", str),
    log=("log file", "option", "log", str)
)
def main(analyzer, init_file, log, language, param, output="doc"):
    """Run an nlptools pipeline over JSON lines read from standard input.

    Every input line is parsed as a JSON document. Its ``"value"`` entry is
    handed to the pipeline's ``pipe_analyse`` method and replaced by the
    result; the document is then written to standard output as one JSON line.

    Args:
        analyzer: Name of the pipeline to build (see the choices declared in
            the ``plac`` annotations).
        init_file: Initialisation file forwarded to ``full_run``.
        log: Log file name passed to ``logging.basicConfig``.
        language: Language code forwarded to ``full_run``.
        param: Initialisation parameters (JSON string) forwarded to
            ``full_run``.
        output: Result format forwarded to ``full_run``.

    Raises:
        SystemExit: With status 1 as soon as an input line is not valid JSON.
    """
    # Build the nlptools pipeline once, before any input is consumed.
    pipe = full_run(analyzer, language, init_file, param, output)

    logging.basicConfig(filename=log, level=logging.DEBUG)

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments happening while the stream is processed.
    started = time.perf_counter()

    # Line numbers start at 1 so the error message matches what a user sees
    # in the input stream.
    for line_number, json_line in enumerate(sys.stdin, start=1):
        try:
            data = json.loads(json_line)
        except json.JSONDecodeError:
            logging.error(
                "Input format problem line :%s : String could not be converted to JSON",
                line_number,
            )
            # Use sys.exit: the bare exit() helper is injected by the site
            # module for interactive use and is not guaranteed to exist.
            sys.exit(1)

        # Replace the "value" entry with the pipeline's analysis of it.
        data["value"] = pipe.pipe_analyse(data["value"])
        sys.stdout.write(json.dumps(data) + "\n")

    # Elapsed processing time (pipeline construction excluded).
    logging.info("TRACE::Executing times %.3f ", time.perf_counter() - started)


if __name__ == "__main__":
    # Developer switch: set to True to run the command line entry point under
    # cProfile and print the collected statistics sorted by time.
    profile_run = False

    if not profile_run:
        # Normal path: let plac parse the command line and call main().
        plac.call(main)
    else:
        import cProfile
        import pstats

        # Raw profile data is dumped to "Profile.prof" and then reloaded for
        # the printed report.
        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
        report = pstats.Stats("Profile.prof")
        report.strip_dirs().sort_stats("time").print_stats()