#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import sys
from gensim import corpora, models
import unicodedata
import re
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# stopwords
with open('./v1/stopwords/en.json', 'r') as f_in:
    # a set makes the membership tests in tokenize() fast
    stopwords = set(json.load(f_in))
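# en.json is expected to contain a flat JSON array of lowercase stopwords,
# e.g. ["the", "a", "and", ...]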
# normalize text
def remove_accents(text):
    if text == "" or not isinstance(text, str):
        return ""
    # NFD decomposition splits each accented character into a base letter plus
    # a combining mark, which the regex below (U+0300..U+036F) then removes
    normalized_text = unicodedata.normalize("NFD", text)
    text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text)
    return text_with_no_accent
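# For example, "café" is decomposed into "cafe" plus a combining acute accent,
# and the accent is then stripped, leaving "cafe"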
def uniformize(text):
    # drop accents first (see remove_accents above)
    text = remove_accents(text)
    # replace every non-letter character except the apostrophe with a space
    text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text)
    # lowercase and collapse runs of whitespace
    return ' '.join(text.lower().split())
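# For example, uniformize("L'Été, à Paris!") returns "l'ete a paris": accents
# and punctuation are stripped, the apostrophe is kept, and the text is lowercased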
# lemmatize
def lemmatize(text):
    if text == "":
        return text
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)
# tokenize
def tokenize(text):
    # split on whitespace (apostrophes become separators), keep words longer
    # than 2 characters that are not stopwords
    tokens = [word for word in text.replace("'", " ").split()
              if word not in stopwords and len(word) > 2]
    if len(tokens) == 0:
        return ["n/a"]
    return tokens
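# For example, tokenize("the cat's bowl") splits on the apostrophe, drops the
# short fragment "s" and (assuming "the" is in the stopword list) the stopword
# "the", and returns ["cat", "bowl"]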
# Max topic
def max_topic(dico):
    # Given a dict of topics, return a dict holding only the entry with the
    # highest weight (or an empty dict if no topic was assigned).
    best_topic = None
    best_proba = 0.0
    for topic in dico:
        proba = float(dico[topic]["weight"])
        if proba > best_proba:
            best_proba = proba
            best_topic = topic
    if best_topic is None:
        return {}
    return {best_topic: dico[best_topic]}
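# For example, max_topic({"topic_1": {"words": [...], "weight": "0.12"},
#                         "topic_3": {"words": [...], "weight": "0.74"}})
# returns {"topic_3": {"words": [...], "weight": "0.74"}}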
# WS
# load all input documents (one JSON object per line on stdin)
all_data = []
for line in sys.stdin:
    data = json.loads(line)
    all_data.append(data)
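# Each input line is expected to be a JSON object whose text sits under the
# "value" key, e.g. {"value": "some document text", ...}; other fields are
# passed through to the output untouched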
# the following parameters depend on the size of the corpus: num_topics and num_iterations
n = len(all_data)
if n < 1001:
    num_topics = 10
    num_iterations = 150
elif n < 20001:
    num_topics = 15
    num_iterations = 200
else:
    num_topics = 20
    num_iterations = 250
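# Rough heuristic: a larger corpus can support more distinct topics and is
# given more iterations so the variational inference has more time to converge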
# training LDA
texts = []
for line in all_data:
    if "value" in line:
        texts.append(tokenize(lemmatize(uniformize(line["value"]))))
    else:
        # keep a placeholder token list so document indices stay aligned
        texts.append(["n/a"])
# the Dictionary maps each token to an integer id; the corpus is the
# bag-of-words view of the documents, one list of (token_id, token_count)
# pairs per document
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=3, no_above=0.6)
corpus = [dictionary.doc2bow(text) for text in texts]
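# e.g. a document whose tokens are ["market", "price", "price"] becomes the
# bag-of-words list [(id_of("market"), 1), (id_of("price"), 2)], where id_of
# is just illustrative shorthand for the integer id the Dictionary assigned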
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                            iterations=num_iterations, alpha="symmetric", eta="auto", minimum_probability=0.1)
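# alpha="symmetric" gives every topic the same document-topic prior weight,
# eta="auto" lets gensim learn the topic-word prior from the data, and
# minimum_probability=0.1 means only topics with weight >= 0.1 are reported
# per document (so topic_info below can occasionally be empty)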
# # To inspect the learned topics (when testing the script on a jsonl file):
# sys.stdout.write(json.dumps(lda_model.print_topics()))
# # To get the coherence score:
# cm = models.coherencemodel.CoherenceModel(model=lda_model, texts=texts, coherence='c_v')
# cm.get_coherence()
# exit()
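# (c_v coherence typically falls between 0 and 1; higher values generally mean
# more interpretable topics, which helps when comparing num_topics settings)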
# extract topic info for each document
for line, tokens in zip(all_data, texts):
    # reuse the tokens prepared for training so inference-time preprocessing
    # (uniformize + lemmatize + tokenize) matches the training pipeline
    doc_bow = dictionary.doc2bow(tokens)
    topics = lda_model[doc_bow]
    topic_info = {}
    for topic_id, topic_weight in topics:
        topic_info[f"topic_{topic_id + 1}"] = {}
        topic_words = [word for word, _ in lda_model.show_topic(topic_id)]
        topic_info[f"topic_{topic_id + 1}"]["words"] = topic_words
        topic_info[f"topic_{topic_id + 1}"]["weight"] = str(topic_weight)
    line["value"] = {}
    line["value"]["topics"] = topic_info
    line["value"]["best_topic"] = max_topic(topic_info)
# Write output, one JSON line per input document
for line in all_data:
    sys.stdout.write(json.dumps(line))
    sys.stdout.write("\n")