diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py
index d6a9a01..66c4535 100755
--- a/data-computer/v1/lda.py
+++ b/data-computer/v1/lda.py
@@ -40,7 +40,7 @@ def tokenize(text):
     tokens = [word for word in text.replace("'"," ").split() if word not in stopwords and len(word)>2]
     if len(tokens)==0:
-        return ["n/a"]
+        return []
     return tokens
 
 # Max topic
@@ -68,11 +68,11 @@
 # following parameters depends of the size of the corpus : num_topics and num_iterations
-n = len(all_data)
-if n< 1001:
+len_data = len(all_data)
+if len_data < 1001:
     num_topics = 10
     num_iterations=150
-elif n < 20001:
+elif len_data < 20001:
     num_topics = 15
     num_iterations=200
 else:
@@ -82,16 +82,52 @@
 # training LDA
 texts = []
-for line in all_data:
-    if "value" in line:
-        texts.append(tokenize(lemmatize(uniformize(line["value"]))))
+index_without_value = []  # indices of docs with no usable "value"; they are written back as "n/a"
+for i in range(len_data):
+    line = all_data[i]
+    if "value" in line and type(line["value"])==str:
+        tokens = tokenize(lemmatize(uniformize(line["value"])))
+        if tokens != []:
+            texts.append(tokens)
+        else:
+            index_without_value.append(i)
     else:
-        texts.append("n/a")
+        index_without_value.append(i)
 dictionary = corpora.Dictionary(texts) # Create a tf dictionary, but replace text by an id : [ [(id_token,numb_token),...] , [....] ]. The list represent docs of corpus
-dictionary.filter_extremes(no_below=3,no_above=0.6)
+dictionary.filter_extremes(no_below=3,no_above=0.8)
 corpus = [dictionary.doc2bow(text) for text in texts]
-lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,iterations=num_iterations,alpha="symmetric", eta = "auto",minimum_probability=0.1)
+try:
+    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,iterations=num_iterations,alpha="symmetric", eta = "auto",minimum_probability=0.1)
+except Exception:
+    # training can fail (e.g. empty corpus after filtering): flag every doc so it is output as "n/a"
+    index_without_value = [i for i in range(len_data)]
+
+
+# extract infos
+for i in range(len_data):
+    line = all_data[i]
+
+    # return "n/a" if the doc was not part of the model
+    if i in index_without_value:
+        line["value"]="n/a"
+        sys.stdout.write(json.dumps(line))
+        sys.stdout.write("\n")
+    else:
+        doc_bow = dictionary.doc2bow(tokenize(uniformize(line["value"])))
+        topics = lda_model[doc_bow]
+        topic_info = {}
+        for topic_id, topic_weight in topics:
+            topic_info[f"topic_{topic_id + 1}"] = {}
+            topic_words = [word for word, _ in lda_model.show_topic(topic_id)]
+            topic_info[f"topic_{topic_id + 1}"]["words"] = topic_words
+            topic_info[f"topic_{topic_id + 1}"]["weight"] = str(topic_weight)
+
+        line["value"]={}
+        line["value"]["topics"]=topic_info
+        line["value"]["best_topic"]=max_topic(topic_info)
+        sys.stdout.write(json.dumps(line))
+        sys.stdout.write("\n")
 #
 #To see topics (to test it with a jsonl file)
@@ -101,25 +137,3 @@
 # cm = models.coherencemodel.CoherenceModel(model=lda_model, texts=texts, coherence='c_v')
 # cm.get_coherence()
 # exit()
-
-
-# extract infos
-for line in all_data:
-    doc = line["value"]
-    doc_bow = dictionary.doc2bow(tokenize(uniformize(line["value"])))
-    topics = lda_model[doc_bow]
-    topic_info = {}
-    for topic_id, topic_weight in topics:
-        topic_info[f"topic_{topic_id + 1}"] = {}
-        topic_words = [word for word, _ in lda_model.show_topic(topic_id)]
-        topic_info[f"topic_{topic_id + 1}"]["words"] = topic_words
-        topic_info[f"topic_{topic_id + 1}"]["weight"] = str(topic_weight)
-
-    line["value"]={}
-    line["value"]["topics"]=topic_info
-    line["value"]["best_topic"]=max_topic(topic_info)
-
-# Write output
-for line in all_data:
-    sys.stdout.write(json.dumps(line))
-    sys.stdout.write("\n")
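
Note on the new output format: each jsonl line written by the script now carries either "value": "n/a" (the doc had no usable text, or training failed) or an object with "topics" and "best_topic". Below is a minimal sketch of how a downstream step could read that output; the file name results.jsonl is only an example, and it assumes max_topic() returns the key of the dominant topic (e.g. "topic_3"), which is not shown in this diff.

import json

with open("results.jsonl") as f:        # hypothetical file holding the script's stdout
    for raw in f:
        record = json.loads(raw)
        value = record["value"]
        if value == "n/a":              # doc was skipped by the model
            continue
        best = value["best_topic"]      # assumption: a key such as "topic_3"
        topic = value["topics"][best]
        print(best, topic["weight"], ", ".join(topic["words"]))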