diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py
index 3728339..916b5e3 100755
--- a/data-computer/v1/lda.py
+++ b/data-computer/v1/lda.py
@@ -2,10 +2,92 @@
 # -*- coding: utf-8 -*-
 import json
 import sys
+from gensim import corpora, models
+import unicodedata
+import string
+
+# params
+num_topics = 5  # number of LDA topics to learn
+num_iterations = 100  # LdaModel "iterations" = max inference iterations per doc (gensim's epoch-like knob is "passes") ==> should it depend on the number of docs? of topics?
+
+# English + French stopwords, written without accents to match the output of uniformize()
+stopwords_lists = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
+                   'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
+                   'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
+                   'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
+                   'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
+                   'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
+                   'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
+                   'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
+                   'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
+                   'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll',
+                   'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
+                   'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
+                   "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
+                   'won', "won't", 'wouldn', "wouldn't", 'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en',
+                   'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'meme', 'mes', 'moi', 'mon',
+                   'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur',
+                   'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'm', 'n', 's', 't',
+                   'y', 'ete', 'etee', 'etees', 'etes', 'etant', 'etante', 'etants', 'etantes', 'suis', 'es', 'est', 'sommes', 'etes', 'sont',
+                   'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'etais', 'etait',
+                   'etions', 'etiez', 'etaient', 'fus', 'fut', 'fumes', 'futes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse',
+                   'fusses', 'fut', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as',
+                   'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient',
+                   'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut', 'eumes', 'eutes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient',
+                   'eusse', 'eusses', 'eut', 'eussions', 'eussiez', 'eussent']
+
+stopwords = stopwords_lists
+
+
+def uniformize(text):
+    # strip accents: NFD-normalize, then drop combining marks
+    text = ''.join(char for char in unicodedata.normalize('NFD', text) if unicodedata.category(char) != 'Mn')
+
+    # remove punctuation, except the apostrophe, which becomes a space below
+    punctuation = ''.join(char for char in string.punctuation if char != "'")
+    text = ''.join(char for char in text if char not in punctuation)
+
+    return text.replace("'", " ").lower()
+
+
+def tokenize(text):
+    tokens = [word for word in text.split() if word not in stopwords]
+    return tokens
+
+
+# WS
+# Data: read the whole corpus from stdin, one JSON document per line
+all_data = []
 for line in sys.stdin:
     data=json.loads(line)
-    data['lda']='Comming soon'
-    sys.stdout.write(json.dumps(data))
-    sys.stdout.write('\n')
+    all_data.append(data)
+
+
+# train LDA
+texts = [tokenize(uniformize(line["value"])) for line in all_data]
+dictionary = corpora.Dictionary(texts)  # token <-> integer-id mapping built from the corpus
+corpus = [dictionary.doc2bow(text) for text in texts]  # one [(token_id, count), ...] list per document
+
+lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, iterations=num_iterations)
+
+
+# extract per-document topic info
+for i, line in enumerate(all_data):
+    doc_bow = corpus[i]  # reuse the bag-of-words built above instead of assuming "id" is 1-based and sequential
+    topics = lda_model[doc_bow]
+    topic_info = {}
+    for topic_id, topic_weight in topics:
+        topic_words = [word for word, _ in lda_model.show_topic(topic_id)]
+        topic_info[f"topic {topic_id + 1}"] = topic_words
+        topic_info[f"weight {topic_id + 1}"] = float(topic_weight)  # numpy float32 is not JSON-serializable
+
+    line["lda"] = topic_info
+
+# write the whole corpus back out, one JSON document per line
+for line in all_data:
+    sys.stdout.write(json.dumps(line))
+    sys.stdout.write("\n")
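Reviewer note: for anyone unfamiliar with gensim, here is a minimal standalone sketch of the Dictionary / doc2bow / LdaModel pipeline this patch relies on. The toy corpus and parameter values are invented for illustration only.

```python
# Toy run of the gensim pipeline used in the patch (corpus contents are made up).
from gensim import corpora, models

texts = [["topic", "models", "cluster", "words"],
         ["documents", "share", "topic", "words"],
         ["an", "unrelated", "document"]]

dictionary = corpora.Dictionary(texts)           # token <-> integer-id mapping
corpus = [dictionary.doc2bow(t) for t in texts]  # [(token_id, count), ...] per document

lda = models.LdaModel(corpus, num_topics=2, id2word=dictionary, iterations=50)

for bow in corpus:
    print(lda[bow])       # per-document topic distribution: [(topic_id, weight), ...]
print(lda.show_topic(0))  # top terms of one topic: [(word, probability), ...]
```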
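The script itself speaks newline-delimited JSON on stdin/stdout. Below is a hedged end-to-end sketch of the expected I/O shape, assuming the script is saved as lda.py in the working directory and gensim is installed; the sample documents are invented.

```python
# Feed NDJSON documents to the script and read back the "lda"-enriched lines.
import json
import subprocess

docs = [{"id": "1", "value": "Topic models cluster words into themes."},
        {"id": "2", "value": "Les modeles de sujets regroupent les mots."}]
payload = "\n".join(json.dumps(d) for d in docs) + "\n"

result = subprocess.run(["python", "lda.py"], input=payload,
                        capture_output=True, text=True, check=True)
for out_line in result.stdout.splitlines():
    doc = json.loads(out_line)
    print(doc["id"], doc["lda"])  # {"topic N": [...top words...], "weight N": 0.xx, ...}
```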