diff --git a/data-computer/swagger.json b/data-computer/swagger.json index 0ea400c..ab6eb7a 100644 --- a/data-computer/swagger.json +++ b/data-computer/swagger.json @@ -3,7 +3,7 @@ "info": { "title": "data-computer - Calculs sur fichier corpus compressé", "summary": "Algorithmes de calculs sur un corpus compressé", - "version": "2.6.0", + "version": "2.7.0", "termsOfService": "https://services.istex.fr/", "contact": { "name": "Inist-CNRS", diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py index 426a137..9cd999a 100755 --- a/data-computer/v1/lda.py +++ b/data-computer/v1/lda.py @@ -51,7 +51,7 @@ best_topic = {} best_proba = 0 for topic in dico: - proba = float(dico[topic]["weight"]) + proba = float(dico[topic]["topic_weight"]) if proba>best_proba: best_proba = proba best_topic = topic @@ -94,7 +94,7 @@ else: index_without_value.append(i) dictionary = corpora.Dictionary(texts) # Create a tf dictionary, but replace text by an id : [ [(id_token,numb_token),...] , [....] ]. The list represent docs of corpus -dictionary.filter_extremes(no_below=3,no_above=0.8) +dictionary.filter_extremes(no_below=3,no_above=0.5) corpus = [dictionary.doc2bow(text) for text in texts] try: @@ -119,13 +119,21 @@ topic_info = {} for topic_id, topic_weight in topics: topic_info[f"topic_{topic_id + 1}"] = {} - topic_words = [{"word":word, "word_weight":str(word_weight)} for word, word_weight in lda_model.show_topic(topic_id)] - topic_info[f"topic_{topic_id + 1}"]["words"] = topic_words - topic_info[f"topic_{topic_id + 1}"]["weight"] = str(topic_weight) + words = [] + words_weights = [] + for word, word_weight in lda_model.show_topic(topic_id): + words.append(word) + words_weights.append(str(word_weight)) + topic_info[f"topic_{topic_id + 1}"]["words"] = words + topic_info[f"topic_{topic_id + 1}"]["words_weights"] = words_weights + topic_info[f"topic_{topic_id + 1}"]["topic_weight"] = str(topic_weight) line["value"]={} line["value"]["topics"]=topic_info - line["value"]["best_topic"]=max_topic(topic_info) + try: + line["value"]["best_topic"]=max_topic(topic_info) + except: + line["value"]["best_topic"]="n/a" sys.stdout.write(json.dumps(line)) sys.stdout.write("\n")