diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py index 84ca117..b7206a1 100755 --- a/data-computer/v1/lda.py +++ b/data-computer/v1/lda.py @@ -22,7 +22,7 @@ return text_with_no_accent def uniformize(text): - # del accents + # remove accents using the remove_accents function text = remove_accents(text) # remove punctuation except " ' " @@ -46,6 +46,7 @@ # Max topic def max_topic(dico): + # for a dictionary of topics, return a json with a single key "best topic" whose value is the value of the dictionary. best_topic = {} best_proba = 0 for topic in dico: @@ -58,14 +59,14 @@ # WS -# Datas +# load all data all_data = [] for line in sys.stdin: data=json.loads(line) all_data.append(data) -# params +# the following parameters depend on the size of the corpus: num_topics and num_iterations n = len(all_data) if n< 1001: num_topics = 10 @@ -91,12 +92,15 @@ lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,iterations=num_iterations,alpha="symmetric", eta = "auto",minimum_probability=0.1) -# #To see topics (just for me) + +# #To see topics (to test it with a jsonl file) # sys.stdout.write(json.dumps(lda_model.print_topics())) # #Get coherence # cm = models.coherencemodel.CoherenceModel(model=lda_model, texts=texts, coherence='c_v') # cm.get_coherence() +# exit() + # extract infos for line in all_data: @@ -114,7 +118,7 @@ line["value"]["topics"]=topic_info line["value"]["best_topic"]=max_topic(topic_info) -# Write all corpus in once +# Write output for line in all_data: sys.stdout.write(json.dumps(line)) sys.stdout.write("\n")