diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py index b393c50..74c8e46 100755 --- a/data-computer/v1/lda.py +++ b/data-computer/v1/lda.py @@ -8,10 +8,6 @@ import re import spacy -# params -num_topics = 5 # Number of topics -num_iterations=100 # "epochs" ====> shall it depends of number of docs ? number of topics ? - nlp = spacy.load('en_core_web_sm', disable = ['parser','ner']) #stopwords @@ -55,6 +51,19 @@ all_data.append(data) +# params +n = len(all_data) +if n < 1001: + num_topics = 10 + num_iterations = 100 +elif n < 20001: + num_topics = 15 + num_iterations = 150 +else: + num_topics = 20 + num_iterations = 200 + + # training LDA texts = [tokenize(lemmatize(uniformize(line["value"]))) for line in all_data] dictionary = corpora.Dictionary(texts) # Create a tf dictionary, but replace text by an id : [ [(id_token,numb_token),...] , [....] ]. The list represent docs of corpus