diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py index 6ded96c..7c6e39e 100755 --- a/data-computer/v1/lda.py +++ b/data-computer/v1/lda.py @@ -27,10 +27,9 @@ text = remove_accents(text) # remove punctuation except " ' " - punctuation = ''.join(char for char in string.punctuation if char != "'") - text = ''.join(char for char in text if char not in punctuation) + text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text) - return text.lower() + return ' '.join(text.lower().split()) #lemmatize def lemmatize(text):