web-services/data-computer/v1/lda.py at c8d44bc88c2c3789d5604e8fa7a50a1fa5e36d7e

Fork: 0
tdm / web-services
Find file
Newer
Older
web-services / data-computer / v1 / lda.py
Leo-gail on 10 Oct 2023 2 KB fix(data-computer): now number of topics depends of len(data)
Raw Blame History
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import sys
from gensim import corpora, models
import unicodedata
import string
import re
import spacy

# Only token lemmas are read from spaCy documents in this file, so the
# dependency parser and the NER component are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# stopwords
# The path is relative to the working directory of the process.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform default encoding. The words are only used for membership
# tests, hence a set (constant-time lookup) rather than the raw JSON list.
with open('./v1/stopwords/en.json', 'r', encoding='utf-8') as f_in:
    stopwords = set(json.load(f_in))

#normalize text
def remove_accents(text):
    """Return *text* without combining diacritical marks.

    The text is decomposed with Unicode NFD normalization, which splits an
    accented letter into its base letter followed by combining marks; every
    code point in the range U+0300-U+036F is then removed. The result stays
    in decomposed (NFD) form.
    """
    decomposed = unicodedata.normalize("NFD", text)
    combining_marks = re.compile("[\u0300-\u036f]")
    return combining_marks.sub("", decomposed)

def uniformize(text):
    """Normalize *text*: strip accents, drop punctuation, lower-case.

    Every character of ``string.punctuation`` is removed except the
    apostrophe, which is kept in the returned string.
    """
    unaccented = remove_accents(text)

    # Deletion table covering all ASCII punctuation but the apostrophe.
    removable = string.punctuation.replace("'", "")
    deletion_table = str.maketrans("", "", removable)

    return unaccented.translate(deletion_table).lower()

#lemmatize
def lemmatize(text):
    """Return *text* with each spaCy token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline and the lemmas
    are joined with single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

#tokenize
def tokenize(text):
    """Split *text* into the tokens kept for topic modelling.

    Apostrophes are treated as separators, then the text is split on
    whitespace. Stopwords and words of two characters or fewer are dropped.
    When nothing is left, the single placeholder token ``"none"`` is returned
    so that the result is never an empty list.
    """
    kept = []
    for candidate in text.replace("'", " ").split():
        if candidate in stopwords:
            continue
        if len(candidate) <= 2:
            continue
        kept.append(candidate)
    return kept if kept else ["none"]


# WS
# Data: one JSON object per line on stdin; the "value" field of each object
# is the text of one document.
all_data = []
for line in sys.stdin:
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    if not line.strip():
        continue
    all_data.append(json.loads(line))

# Without any document there is nothing to train on (gensim refuses to build
# an LDA model over an empty collection) and nothing to write back.
if not all_data:
    sys.exit(0)


# params: the number of topics and of inference iterations grow with the
# size of the corpus.
n = len(all_data)
if n < 1001:
    num_topics = 10
    num_iterations = 100
elif n < 20001:
    num_topics = 15
    num_iterations = 150
else:
    num_topics = 20
    num_iterations = 200


# training LDA
texts = [tokenize(lemmatize(uniformize(line["value"]))) for line in all_data]
# Maps every distinct token to an integer id.
dictionary = corpora.Dictionary(texts)
# Bag-of-words per document: [[(token_id, token_count), ...], ...]
corpus = [dictionary.doc2bow(text) for text in texts]

lda_model = models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    iterations=num_iterations,
)


# extract infos
# Reuse the bag-of-words built for training: documents are then scored with
# exactly the same preprocessing (including lemmatization) as the one the
# dictionary was built from, and nothing is tokenized twice.
topic_words_by_id = {}  # show_topic() result per topic id, computed once
for line, doc_bow in zip(all_data, corpus):
    topic_info = {}
    for topic_id, topic_weight in lda_model[doc_bow]:
        if topic_id not in topic_words_by_id:
            topic_words_by_id[topic_id] = [
                word for word, _ in lda_model.show_topic(topic_id)
            ]
        # Topics are numbered from 1 in the output; the weight is emitted as
        # a string, as before.
        topic_info[f"topic_{topic_id + 1}"] = {
            "words": topic_words_by_id[topic_id],
            "weight": str(topic_weight),
        }

    line["lda"] = topic_info

# Write the whole corpus at once, one JSON object per line
for line in all_data:
    sys.stdout.write(json.dumps(line))
    sys.stdout.write("\n")