diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py
index 6ded96c..6b42d0f 100755
--- a/data-computer/v1/lda.py
+++ b/data-computer/v1/lda.py
@@ -4,7 +4,6 @@
 import sys
 from gensim import corpora, models
 import unicodedata
-import string
 import re
 import spacy
 
@@ -27,10 +26,9 @@
     text = remove_accents(text)
 
     # remove punctuation except " ' "
-    punctuation = ''.join(char for char in string.punctuation if char != "'")
-    text = ''.join(char for char in text if char not in punctuation)
+    text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text)
 
-    return text.lower()
+    return ' '.join(text.lower().split())
 
 #lemmatize
 def lemmatize(text):
@@ -59,13 +57,13 @@
 n = len(all_data)
 
 if n< 1001:
     num_topics = 10
-    num_iterations=100
+    num_iterations=150
 elif n < 20001:
     num_topics = 15
-    num_iterations=150
+    num_iterations=200
 else:
     num_topics = 20
-    num_iterations=200
+    num_iterations=250
 
 # training LDA
@@ -76,10 +74,14 @@
     else:
         texts.append("n/a")
 
 dictionary = corpora.Dictionary(texts) # Create a tf dictionary, but replace text by an id : [ [(id_token,numb_token),...] , [....] ]. The list represent docs of corpus
+dictionary.filter_extremes(no_below=3,no_above=0.6)
 corpus = [dictionary.doc2bow(text) for text in texts]
-lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,iterations=num_iterations)
+lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,iterations=num_iterations,alpha="symmetric", eta = "auto",minimum_probability=0.1)
 
+# To see topics (just for me)
+# sys.stdout.write(json.dumps(lda_model.print_topics()))
+# exit()
 
 # extract infos
 for line in all_data:
@@ -92,11 +94,10 @@
         topic_words = [word for word, _ in lda_model.show_topic(topic_id)]
         topic_info[f"topic_{topic_id + 1}"]["words"] = topic_words
         topic_info[f"topic_{topic_id + 1}"]["weight"] = str(topic_weight)
-
+    line["lda"]= topic_info
 
 
 # Write all corpus in once
 for line in all_data:
     sys.stdout.write(json.dumps(line))
     sys.stdout.write("\n")
-
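
Note on the cleaning change: the rewritten body no longer just strips punctuation. Any character that is not a letter or an apostrophe (digits included) becomes a space, and the final ' '.join(text.lower().split()) collapses the resulting runs of whitespace. A minimal sketch of that behaviour in isolation, assuming the enclosing helper is named clean (the diff only shows its body) and omitting the remove_accents step:

    def clean(text):
        # Keep letters and apostrophes; every other character
        # (punctuation, digits, symbols) becomes a space.
        text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text)
        # Lowercase, then collapse the runs of spaces introduced above.
        return ' '.join(text.lower().split())

    print(clean("L'analyse de 2024 : TF-IDF, etc."))
    # -> "l'analyse de tf idf etc"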
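
The training changes are easiest to see on a toy corpus. The sketch below is illustrative only: the documents are made up, and the keyword arguments mirror the patch. filter_extremes(no_below=3, no_above=0.6) drops tokens that appear in fewer than 3 documents or in more than 60% of them, and minimum_probability=0.1 makes per-document topic queries omit topics below that probability.

    from gensim import corpora, models

    # Made-up documents: "les" appears in 5 of 5 docs (> 60%) and "rare"
    # in only 1 (< 3 docs), so filter_extremes removes both.
    texts = [
        ["les", "sujet", "mot", "rare"],
        ["les", "sujet", "corpus", "modele"],
        ["les", "mot", "corpus", "modele"],
        ["les", "sujet", "mot", "modele"],
        ["les", "corpus"],
    ]
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=3, no_above=0.6)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = models.LdaModel(
        corpus, num_topics=2, id2word=dictionary, iterations=150,
        alpha="symmetric", eta="auto", minimum_probability=0.1,
    )
    # Topics with probability below 0.1 are filtered out of the result.
    print(lda_model.get_document_topics(corpus[0]))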
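
Finally, the line["lda"] = topic_info assignment means each emitted JSON line now carries its topic assignment inline. An illustrative record shape (the surrounding fields depend on the input; the topic key, words, and weight shown here are made up, and topic_info can hold several topics when more than one passes the probability cutoff):

    {..., "lda": {"topic_3": {"words": ["mot", "sujet", ...], "weight": "0.62"}}}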