diff --git a/data-computer/requirements.txt b/data-computer/requirements.txt
index 0bc7d8e..915444b 100755
--- a/data-computer/requirements.txt
+++ b/data-computer/requirements.txt
@@ -1 +1,3 @@
 gensim==4.3.2
+spacy==3.6.1
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py
index 85e207e..e8aedc2 100755
--- a/data-computer/v1/lda.py
+++ b/data-computer/v1/lda.py
@@ -6,11 +6,14 @@
 import unicodedata
 import string
 import re
+import spacy
 
 # params
 num_topics = 5 # Number of topics
 num_iterations=100 # "epochs" ====> shall it depends of number of docs ? number of topics ?
 
+nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
+
 #stopwords
 with open('./v1/stopwords/en.json','r') as f_in:
     stopwords =json.load(f_in)
@@ -31,6 +34,11 @@
     return text.lower()
 
+#lemmatize
+def lemmatize(text):
+    doc = nlp(text)
+    return " ".join([token.lemma_ for token in doc])
+
 #tokenize
 def tokenize(text):
     tokens = [word for word in text.replace("'"," ").split() if word not in stopwords and len(word)>2]
 
@@ -46,7 +54,7 @@
 
 # training LDA
-texts = [tokenize(uniformize(line["value"])) for line in all_data]
+texts = [tokenize(lemmatize(uniformize(line["value"]))) for line in all_data]
 dictionary = corpora.Dictionary(texts)
 # Create a tf dictionary, but replace text by an id : [ [(id_token,numb_token),...] , [....] ]. The list represent docs of corpus
 corpus = [dictionary.doc2bow(text) for text in texts]
 
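Reviewer note: below is a minimal, self-contained sketch of the new uniformize -> lemmatize -> tokenize chain, for trying the change outside of lda.py. It assumes spacy 3.6.x and the en_core_web_sm model pinned in requirements.txt are installed; the stopword set and the uniformize body here are simplified stand-ins for the real ones in ./v1/stopwords/en.json and lda.py.

import spacy

# Parser and NER are disabled, as in lda.py; the tagger and lemmatizer stay
# enabled, which is what the rule-based lemmatization needs.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

stopwords = {"the", "and", "were"}  # stand-in for ./v1/stopwords/en.json

def uniformize(text):
    # simplified stand-in for lda.py's uniformize
    return text.lower()

def lemmatize(text):
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)

def tokenize(text):
    # same filter as lda.py: drop stopwords and tokens shorter than 3 characters
    return [w for w in text.replace("'", " ").split() if w not in stopwords and len(w) > 2]

print(tokenize(lemmatize(uniformize("The cats were chasing mice"))))
# typically ['cat', 'chase', 'mouse']

Because lemmatization now runs before tokenization, inflected variants ("cats"/"cat", "chasing"/"chase") collapse onto a single gensim dictionary id, which shrinks the vocabulary the LDA model has to fit. Note that spaCy 3.x cannot load v2.x model packages, so the en_core_web_sm pin in requirements.txt has to stay on the same 3.6.x line as spacy itself.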