Add Prometheus job metrics to the LDA data-computer script.

Every processed document increments a counter, and the counter is pushed to
the Pushgateway. Pushing is best-effort: a failed push is reported on stderr
and the document is still processed, so an unreachable gateway cannot abort
the job.

NOTE(review): this patch was rebuilt from a copy whose line breaks and
indentation had been collapsed. The position of blank context lines and the
leading whitespace of the lda.py context lines are reconstructed, so confirm
with "git apply --check" (or "--ignore-whitespace") before applying.
The lda.py "index" header was left out because its post-image blob id no
longer matches, and the whitespace-only hunk on "words.append(word)" was
dropped because the difference was not recoverable.

diff --git a/data-computer/requirements.txt b/data-computer/requirements.txt
index ad3d950..4b1dec2 100755
--- a/data-computer/requirements.txt
+++ b/data-computer/requirements.txt
@@ -4,4 +4,6 @@
 pandas==1.4.0
 lxml==4.7.1
 
-fr_core_news_sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.6.0/fr_core_news_sm-3.6.0-py3-none-any.whl
\ No newline at end of file
+fr_core_news_sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.6.0/fr_core_news_sm-3.6.0-py3-none-any.whl
+
+prometheus-client==0.19.0
diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py
--- a/data-computer/v1/lda.py
+++ b/data-computer/v1/lda.py
@@ -6,6 +6,13 @@
 import unicodedata
 import re
 import spacy
+from prometheus_client import CollectorRegistry, Counter, push_to_gateway
+
+# Job-local registry: only the metrics registered here are pushed to the gateway.
+registry = CollectorRegistry()
+c = Counter('documents', 'Number of documents processed', registry=registry)
+job_name = 'lda'
+metrics_gateway = 'jobs-metrics.daf.intra.inist.fr'
 
 nlp = spacy.load('en_core_web_sm', disable = ['parser','ner'])
 
@@ -112,6 +119,12 @@
         sys.stdout.write(json.dumps(line))
         sys.stdout.write("\n")
     else:
+        c.inc()
+        # Metrics are best-effort: an unreachable gateway must not abort the job.
+        try:
+            push_to_gateway(metrics_gateway, job=job_name, registry=registry)
+        except OSError as err:
+            sys.stderr.write(f"metrics push failed: {err}\n")
         line = all_data[i]
         doc = line["value"]
         doc_bow = dictionary.doc2bow(tokenize(uniformize(line["value"])))