# Source: web-services / kos2vec / exec_indexer.py
# Last change: @schneist, 21 Jun 2022 (3 KB)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    .py

"""
__authors__ = "Stephane Schneider"
__contact__ = "stephane.schneider@inis.fr"
#

from cso_classifier import CSOClassifier

# from tools.tools import precision_recall
import glob
import sys
import json
import plac
import os

#
#   NON EZ_MASTER version of the memory indexer
#
#
#  CSOClassifier(modules="both", enhancement="first", explanation=False)
#  Additional parameters:
#   (i)   workers,                => number of processes run in parallel
#   (ii)  modules,                => indexing module(s) to run: "syntactic", "semantic", "both"
#   (iii) enhancement,            => which relevant super-areas to add: "first", "all", or "no".
#   (iv)  explanation,            => return the chunks of text that allowed the classifier to infer a given topic.
#   (v)   delete_outliers,        => remove erroneous topics that are conceptually distant from the others.
#   (vi)  fast_classification,    => determines whether the semantic module should use the full model or the cached one.
#   (vii) silent.                 => when True, the classifier does not print its progress in the console.


# File "/home/schneist/app/kos2vec/cso/cso-classifier/cso_classifier/postprocmodule.py", line 66, in __create_matrix_distance_from_ontology

# ex : 
#        python3 exec_indexer.py -indent  data/memory20/annotated
#
@plac.annotations(
    article_paths=("Corpus file path", "positional", None, str),
    indent=("Indent output json", "flag", "indent"),
)
def main(article_paths, indent):
    """Classify every ``*.txt`` file of a directory and print the results as JSON.

    For each article the classifier output is completed with the raw text
    (key ``text``) and the file path (key ``path``), then written to standard
    output as one JSON document. Articles are processed in sorted path order
    so that successive runs produce the same output order.

    Args:
        article_paths: Path of the directory that holds the ``.txt`` articles.
        indent: When true, pretty-print each JSON document with a 4-space
            indent; otherwise emit it in compact form.

    Exits:
        With status 1, after a message on standard error, when
        ``article_paths`` is not an existing directory or when an article
        cannot be opened or read.
    """
    # Number of parallel worker processes; the classifier expects an integer.
    workers = 8

    # Diagnostics go to stderr so that stdout carries nothing but JSON.
    if not os.path.isdir(article_paths):
        print(
            f"ERROR : Path '{article_paths}' is not an existing directory !!",
            file=sys.stderr,
        )
        sys.exit(1)

    articles = sorted(glob.glob(os.path.join(article_paths, "*.txt")))
    if not articles:
        # Nothing to do: skip the costly classifier start-up.
        print(
            f"WARNING : no '*.txt' file found in '{article_paths}'",
            file=sys.stderr,
        )
        return

    # Indexer configuration.
    indexer = CSOClassifier(
        modules="both",
        enhancement="first",
        explanation=True,
        # Must stay False: enabling outlier removal raised an error here.
        delete_outliers=False,
        silent=False,
        fast_classification=False,
        workers=workers,
    )

    for article in articles:
        try:
            # Read everything up front so the file is closed before the
            # (slow) classification starts.
            with open(article) as f_article:
                text = f_article.read()
        except OSError as err:
            print(
                f"ERROR : Could not open/read {article} file: {err}",
                file=sys.stderr,
            )
            sys.exit(1)

        data = indexer.run(text)
        data['text'] = text
        data['path'] = article

        if indent:
            js = json.dumps(data, indent=4, sort_keys=True)
        else:
            js = json.dumps(data, sort_keys=True)

        print(js)

        # (silence, inter, recall, precision, f1, intersect) = precision_recall(
        #            list_ids_mc, article
        #        )
if __name__ == "__main__":
    # Developer switch: set to True to run main() under cProfile, dump the
    # raw statistics to "Profile.prof" and print them sorted by time.
    run_profiled = False

    if not run_profiled:
        plac.call(main)
    else:
        import cProfile
        import pstats

        profile_file = "Profile.prof"
        cProfile.runctx("plac.call(main)", globals(), locals(), profile_file)
        stats = pstats.Stats(profile_file)
        stats.strip_dirs().sort_stats("time").print_stats()