#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
import string
import sys
import unicodedata

from gensim import corpora, models
# params
num_topics = 5  # number of topics to learn
num_iterations = 100  # training iterations ("epochs"); should this depend on the number of docs or topics?
# stopwords
with open('./v1/stopwords/en.json', 'r') as f_in:
    stopwords = set(json.load(f_in))  # a set gives O(1) membership tests during tokenization
# text normalization
def remove_accents(text):
    # NFD decomposition splits accented characters into a base character plus combining marks
    normalized_text = unicodedata.normalize("NFD", text)
    # drop the combining diacritical marks (U+0300-U+036F)
    text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text)
    return text_with_no_accent
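# Illustration: remove_accents("café") == "cafe" -- NFD turns "é" into "e" + U+0301,
# and the combining mark is removed by the substitution above.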
def uniformize(text):
    # strip accents
    text = remove_accents(text)
    # remove punctuation, keeping the apostrophe for contractions
    punctuation = ''.join(char for char in string.punctuation if char != "'")
    text = ''.join(char for char in text if char not in punctuation)
    return text.lower()
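# Illustration: uniformize("Don't panic!") == "don't panic" -- punctuation stripped,
# apostrophe kept, text lowercased.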
# tokenization
def tokenize(text):
    # apostrophes become separators; drop stopwords and tokens shorter than 3 characters
    tokens = [word for word in text.replace("'", " ").split() if word not in stopwords and len(word) > 2]
    return tokens
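# Illustration: tokenize("don't panic") -> ["don", "panic"], assuming neither word is in
# the stopword list; the lone "t" is dropped by the len(word) > 2 filter.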
# WS
# data: one JSON record per line on stdin
all_data = []
for line in sys.stdin:
    data = json.loads(line)
    all_data.append(data)
# LDA training
texts = [tokenize(uniformize(record["value"])) for record in all_data]
dictionary = corpora.Dictionary(texts)  # maps each token to an integer id
corpus = [dictionary.doc2bow(text) for text in texts]  # one bag-of-words [(token_id, count), ...] per document
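# Illustration: for a document tokenized as ["cat", "dog", "cat"], doc2bow returns
# [(cat_id, 2), (dog_id, 1)], with the ids assigned by the dictionary above.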
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, iterations=num_iterations)
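# Optional sketch for inspecting the learned topics on stderr (stdout is reserved for the
# JSON output); flip DEBUG_TOPICS to True to enable it.
DEBUG_TOPICS = False
if DEBUG_TOPICS:
    for topic_id, topic_repr in lda_model.print_topics(num_topics=num_topics, num_words=10):
        print(f"topic_{topic_id + 1}: {topic_repr}", file=sys.stderr)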
# extract per-document topic info
for record in all_data:
    doc = record["value"]
    doc_bow = dictionary.doc2bow(tokenize(uniformize(doc)))
    topics = lda_model[doc_bow]  # (topic_id, probability) pairs above the model's minimum_probability
    topic_info = {}
    for topic_id, topic_weight in topics:
        topic_words = [word for word, _ in lda_model.show_topic(topic_id)]
        topic_info[f"topic_{topic_id + 1}"] = {
            "words": topic_words,
            "weight": float(topic_weight),  # plain float so json.dumps can serialize it
        }
    record["lda"] = topic_info
# write the enriched corpus to stdout, one JSON object per line
for record in all_data:
    sys.stdout.write(json.dumps(record))
    sys.stdout.write("\n")