Newer
Older
web-services / data-computer / v1 / lda.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import sys
from gensim import corpora, models
import unicodedata
import re
import spacy

nlp = spacy.load('en_core_web_sm', disable = ['parser','ner'])

#stopwords
with open('./v1/stopwords/en.json','r') as f_in:
    stopwords =json.load(f_in)

#normalize text
def remove_accents(text):
    if text == "" or type(text)!= str:
        return ""
    normalized_text = unicodedata.normalize("NFD", text)
    text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text)
    return text_with_no_accent

def uniformize(text):
    text = remove_accents(text)

    # remove punctuation except " ' "
    text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text)

    return ' '.join(text.lower().split())

#lemmatize
def lemmatize(text):
    if text == "":
        return text
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

#tokenize
def tokenize(text):
    tokens = [word for word in text.replace("'"," ").split() if word not in stopwords and len(word)>2]
    if len(tokens)==0:
        return []
    return tokens

# Max topic
def max_topic(dico):
    """
    for a dictionary of topics, return a json with a single key "best_topic" and its value is the value of this topic in the dictionary.
    """
    best_topic = {}
    best_proba = 0
    for topic in dico:
        proba = float(dico[topic]["topic_weight"])
        if proba>best_proba:
            best_proba = proba
            best_topic = topic
    return {best_topic:dico[best_topic]}



# WS
# load all datas
all_data = []
for line in sys.stdin:
    data=json.loads(line)
    all_data.append(data)


# following parameters depends of the size of the corpus : num_topics and num_iterations
len_data = len(all_data)
if len_data< 1001:
    num_topics = 10
    num_iterations=150
elif len_data < 20001:
    num_topics = 15
    num_iterations=200
else:
    num_topics = 20
    num_iterations=250


# training LDA
texts = []
index_without_value = []
for i in range(len_data):
    line = all_data[i]
    if "value" in line and type(line["value"])==str:
        tokens = tokenize(lemmatize(uniformize(line["value"])))
        if tokens != []:
            texts.append(tokenize(lemmatize(uniformize(line["value"]))))
        else:
            index_without_value.append(i)
    else:
        index_without_value.append(i)
dictionary = corpora.Dictionary(texts) # Create a tf dictionary, but replace text by an id : [ [(id_token,numb_token),...] , [....] ]. The list represent docs of corpus
dictionary.filter_extremes(no_below=3,no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in texts]

try:
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,iterations=num_iterations,alpha="symmetric", eta = "auto",minimum_probability=0.1)
except:
    index_without_value = [i for i in range(len_data)]


# extract infos
for i in range(len_data):

    #return n/a if docs wasn't in model
    if i in index_without_value:
        line["value"]="n/a"
        sys.stdout.write(json.dumps(line))
        sys.stdout.write("\n")
    else:
        line = all_data[i]
        doc = line["value"]
        doc_bow = dictionary.doc2bow(tokenize(uniformize(line["value"])))
        topics = lda_model[doc_bow]
        topic_info = {}
        for topic_id, topic_weight in topics:
            topic_info[f"topic_{topic_id + 1}"] = {}
            words = []
            words_weights = []
            for word, word_weight in lda_model.show_topic(topic_id):
                words.append(word) 
                words_weights.append(str(word_weight))
            topic_info[f"topic_{topic_id + 1}"]["words"] = words
            topic_info[f"topic_{topic_id + 1}"]["words_weights"] = words_weights
            topic_info[f"topic_{topic_id + 1}"]["topic_weight"] = str(topic_weight)

        line["value"]={}
        line["value"]["topics"]=topic_info
        try:
            line["value"]["best_topic"]=max_topic(topic_info)
        except:
            line["value"]["best_topic"]="n/a"
        sys.stdout.write(json.dumps(line))
        sys.stdout.write("\n")


# #To see topics (to test it with a jsonl file)
# sys.stdout.write(json.dumps(lda_model.print_topics()))

# #Get coherence
# cm = models.coherencemodel.CoherenceModel(model=lda_model, texts=texts, coherence='c_v')
# cm.get_coherence()
# exit()