diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py
index ca3081c..b694728 100755
--- a/data-computer/v1/lda.py
+++ b/data-computer/v1/lda.py
@@ -5,6 +5,7 @@
 from gensim import corpora, models
 import unicodedata
 import string
+import re
 
 # params
 num_topics = 5 # Number of topics
@@ -37,10 +38,15 @@
 stopwords = stopwords_lists
 
 
+def remove_accents(text):
+    normalized_text = unicodedata.normalize("NFD", text)
+    text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text)
+    return text_with_no_accent
+
 
 def uniformize(text):
     # del accents
-    text = ''.join(char for char in unicodedata.normalize('NFD', text) if unicodedata.category(char) != 'Mn')
+    text = remove_accents(text)
 
     # remove punctuation except " ' "
     punctuation = ''.join(char for char in string.punctuation if char != "'")
@@ -50,7 +56,7 @@
 
 
 def tokenize(text):
-    tokens = [word for word in text.split() if word not in stopwords]
+    tokens = [word for word in text.replace("'"," ").split() if word not in stopwords and len(word)>2]
     return tokens
 
 