diff --git a/.gitignore b/.gitignore index f91ec48..d19c7c3 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,7 @@ # Python virtual env directory env/ +.venv/ # Optional REPL history .node_repl_history diff --git a/data-computer/README.md b/data-computer/README.md index 0808b52..87a0064 100644 --- a/data-computer/README.md +++ b/data-computer/README.md @@ -157,7 +157,105 @@ ### v1/lda -... +Créer à partir de l'ensemble des documents un champ "lda" constitué de 5 topics. Chaque topic contient un champ "words", qui est composé d'une liste de 10 mots qui sont les plus caractéristiques du topic, ainsi que d'un champ "weight" qui correspond au poids associé au sujet dans le document. Le texte doit être en anglais. + + +Par exemple, pour un document pris dans un ensemble de documents (l'id "85" est totalement arbitraire) + +```json +{ + "id": 85, + "value": "During my culinary adventure through the bustling markets of Marrakech, where the scent of exotic spices hung in the air and vendors beckoned with colorful displays of fruits and textiles, I savored tagines, couscous, and mint tea, discovering the rich tapestry of Moroccan flavors." 
+} +``` + +On obtiendra : +```json +{ + "id":85, + "value": "During my culinary adventure through the bustling markets of Marrakech, where the scent of exotic spices hung in the air and vendors beckoned with colorful displays of fruits and textiles, I savored tagines, couscous, and mint tea, discovering the rich tapestry of Moroccan flavors.", + "lda": { + "topic_1": { + "words": [ + "sky", + "tranquil", + "yellow", + "solace", + "symphony", + "leave", + "bird", + "taxi", + "cityscape", + "provide" + ], + "weight": "0.0133591" + }, + "topic_2": { + "words": [ + "bustling", + "air", + "savor", + "tapestry", + "rich", + "adventure", + "tea", + "discover", + "flavor", + "hang" + ], + "weight": "0.94660753" + }, + "topic_3": { + "words": [ + "street", + "air", + "cottage", + "quaint", + "melodic", + "seaside", + "water", + "shore", + "collect", + "sandy" + ], + "weight": "0.013361818" + }, + "topic_4": { + "words": [ + "forest", + "atmosphere", + "leave", + "filter", + "tale", + "tower", + "create", + "floor", + "enchant", + "shadow" + ], + "weight": "0.013335978" + }, + "topic_5": { + "words": [ + "mystery", + "sky", + "embark", + "ponder", + "gaze", + "overwhelming", + "light", + "mountaintop", + "night", + "universe" + ], + "weight": "0.013335522" + } + } +} + +``` + +NOTE : l'algorithme a besoin de beaucoup de documents pour fonctionner (plus d'une centaine), d'où la non exhaustivité de l'exemple. #### Paramètre(s) URL @@ -175,6 +273,9 @@ ```bash -... 
-``` +# Send data for batch processing +cat input.tar.gz |curl --data-binary @- -H "X-Hook: https://webhook.site/dce2fefa-9a72-4f76-96e5-059405a04f6c" "http://localhost:31976/v1/lda" > output.json +# When the corpus is processed, get the result +cat output.json |curl --data-binary @- "http://localhost:31976/v1/retrieve" > output.tar.gz +``` diff --git a/data-computer/requirements.txt b/data-computer/requirements.txt new file mode 100755 index 0000000..1a0fc4f --- /dev/null +++ b/data-computer/requirements.txt @@ -0,0 +1,3 @@ +gensim==4.3.2 +spacy==3.6.1 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz diff --git a/data-computer/swagger.json b/data-computer/swagger.json index ca6a34a..c1f79b1 100644 --- a/data-computer/swagger.json +++ b/data-computer/swagger.json @@ -2,7 +2,7 @@ "info": { "title": "data-computer - Calculs sur fichier coprus compressé", "summary": "Algorithmes de calculs sur un corpus compressé", - "version": "1.4.0", + "version": "1.6.1", "termsOfService": "https://objectif-tdm.inist.fr/", "contact": { "name": "Inist-CNRS", diff --git a/data-computer/v1/lda.ini b/data-computer/v1/lda.ini index 94e1cdf..78f9ac0 100644 --- a/data-computer/v1/lda.ini +++ b/data-computer/v1/lda.ini @@ -3,8 +3,8 @@ # OpenAPI Documentation - JSON format (dot notation) post.operationId = post-v1-lda -post.description = description... -post.summary = summary... +post.description = Créer à partir de l'ensemble des documents un champ "lda" constitué de 5 topics eux-mêmes caractérisés par 10 mots. +post.summary = Classifie un ensemble de documents parmi 5 topics. 
post.tags.0 = data-computer post.requestBody.content.application/x-tar.schema.type = string post.requestBody.content.application/x-tar.schema.format = binary diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py index 3728339..74c8e46 100755 --- a/data-computer/v1/lda.py +++ b/data-computer/v1/lda.py @@ -2,10 +2,92 @@ # -*- coding: utf-8 -*- import json import sys +from gensim import corpora, models +import unicodedata +import string +import re +import spacy +nlp = spacy.load('en_core_web_sm', disable = ['parser','ner']) + +#stopwords +with open('./v1/stopwords/en.json','r') as f_in: + stopwords =json.load(f_in) + +#normalize text +def remove_accents(text): + normalized_text = unicodedata.normalize("NFD", text) + text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text) + return text_with_no_accent + +def uniformize(text): + # del accents + text = remove_accents(text) + + # remove punctuation except " ' " + punctuation = ''.join(char for char in string.punctuation if char != "'") + text = ''.join(char for char in text if char not in punctuation) + + return text.lower() + +#lemmatize +def lemmatize(text): + doc = nlp(text) + return " ".join([token.lemma_ for token in doc]) + +#tokenize +def tokenize(text): + tokens = [word for word in text.replace("'"," ").split() if word not in stopwords and len(word)>2] + if len(tokens)==0: + return ["none"] + return tokens + + +# WS +# Datas +all_data = [] for line in sys.stdin: data=json.loads(line) - data['lda']='Comming soon' - sys.stdout.write(json.dumps(data)) - sys.stdout.write('\n') + all_data.append(data) + + +# params +n = len(all_data) +if n< 1001: + num_topics = 10 + num_iterations=100 +elif n < 20001: + num_topics = 15 + num_iterations=150 +else: + num_topics = 20 + num_iterations=200 + + +# training LDA +texts = [tokenize(lemmatize(uniformize(line["value"]))) for line in all_data] +dictionary = corpora.Dictionary(texts) # Create a tf dictionary, but replace text by an id : [ 
[(id_token,numb_token),...] , [....] ]. The list represents docs of corpus +corpus = [dictionary.doc2bow(text) for text in texts] + +lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,iterations=num_iterations) + + +# extract infos +for line in all_data: + doc = line["value"] + doc_bow = dictionary.doc2bow(tokenize(lemmatize(uniformize(line["value"])))) + topics = lda_model[doc_bow] + topic_info = {} + for topic_id, topic_weight in topics: + topic_info[f"topic_{topic_id + 1}"] = {} + topic_words = [word for word, _ in lda_model.show_topic(topic_id)] + topic_info[f"topic_{topic_id + 1}"]["words"] = topic_words + topic_info[f"topic_{topic_id + 1}"]["weight"] = str(topic_weight) + + line["lda"]= topic_info + +# Write all corpus in once +for line in all_data: + sys.stdout.write(json.dumps(line)) + sys.stdout.write("\n") diff --git a/data-computer/v1/stopwords/en.json b/data-computer/v1/stopwords/en.json new file mode 100644 index 0000000..af7c866 --- /dev/null +++ b/data-computer/v1/stopwords/en.json @@ -0,0 +1 @@ +["able", "about", "above", "abroad", "according", "accordingly", "across", "actually", "adj", "after", "afterwards", "again", "against", "ago", "ahead", "ain", "all", "allow", "allows", "almost", "alone", "along", "alongside", "already", "also", "although", "always", "amid", "amidst", "among", "amongst", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "are", "aren", "around", "aside", "ask", "asking", "associated", "available", "away", "awfully", "back", "backward", "backwards", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "but", "came", "can", "cannot", "cant", "caption", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", 
"consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "course", "currently", "dare", "daren", "definitely", "described", "despite", "did", "didn", "different", "directly", "does", "doesn", "doing", "done", "don", "down", "downwards", "during", "each", "edu", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "entirely", "especially", "etc", "even", "ever", "evermore", "every", "everybody", "everyone", "everything", "everywhere", "exactly", "example", "except", "fairly", "far", "farther", "few", "fewer", "fifth", "first", "five", "followed", "following", "follows", "for", "forever", "former", "formerly", "forth", "forward", "found", "four", "from", "further", "furthermore", "get", "gets", "getting", "given", "gives", "goes", "going", "gone", "got", "gotten", "greetings", "had", "hadn", "half", "happens", "hardly", "has", "hasn", "have", "haven", "having", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "hither", "hopefully", "how", "howbeit", "however", "hundred", "ignored", "immediate", "inasmuch", "inc", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "inside", "insofar", "instead", "into", "inward", "isn", "its", "itself", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "like", "liked", "likely", "likewise", "little", "look", "looking", "looks", "low", "lower", "ltd", "made", "mainly", "make", "makes", "many", "may", "maybe", "mayn", "mean", "meantime", "meanwhile", "merely", "might", "mightn", "mine", "minus", "miss", "more", "moreover", "most", "mostly", "mrs", "much", "must", "mustn", "myself", "name", "namely", "near", "nearly", "necessary", "need", "needn", "needs", "neither", "never", "neverf", "neverless", "nevertheless", "new", "next", "nine", "ninety", "nobody", "non", "none", 
"nonetheless", "noone", "noone", "nor", "normally", "not", "nothing", "notwithstanding", "novel", "now", "nowhere", "obviously", "off", "often", "okay", "old", "once", "one", "ones", "only", "onto", "opposite", "other", "others", "otherwise", "ought", "oughtn", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provided", "provides", "que", "quite", "rather", "really", "reasonably", "recent", "recently", "regarding", "regardless", "regards", "relatively", "respectively", "right", "round", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "shan", "she", "should", "shouldn", "since", "six", "some", "somebody", "someday", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "such", "sup", "sure", "take", "taken", "taking", "tell", "tends", "than", "thank", "thanks", "thanx", "that", "thats", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "these", "they", "thing", "things", "think", "third", "thirty", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "till", "together", "too", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "upon", "upwards", "use", "used", "useful", "uses", "using", "usually", "value", "various", "versus", "very", "via", "viz", "want", "wants", "was", "wasn", "way", "welcome", "well", "went", "were", "weren", "what", "whatever", "when", "whence", 
"whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whither", "who", "whoever", "whole", "whom", "whomever", "whose", "why", "will", "willing", "wish", "with", "within", "without", "wonder", "won", "would", "wouldn", "yes", "yet", "you", "your", "yours", "yourself", "yourselves", "zero", "uucp", "www", "amount", "bill", "bottom", "call", "computer", "con", "couldnt", "cry", "describe", "detail", "due", "eleven", "empty", "fifteen", "fifty", "fill", "find", "fire", "forty", "front", "full", "give", "hasnt", "herse", "himse", "interest", "itse\u201d", "mill", "move", "myse\u201d", "part", "put", "show", "side", "sincere", "sixty", "system", "ten", "thick", "thin", "top", "twelve", "twenty", "abst", "accordance", "act", "added", "adopted", "affected", "affecting", "affects", "announce", "anymore", "apparently", "approximately", "arent", "arise", "auth", "beginning", "beginnings", "begins", "biol", "briefly", "date", "effect", "etal", "fix", "gave", "giving", "heres", "hes", "hid", "home", "immediately", "importance", "important", "index", "information", "invention", "itd", "keys", "largely", "lets", "line", "means", "million", "mug", "nay", "necessarily", "nos", "noted", "obtain", "obtained", "omitted", "ord", "owing", "page", "pages", "poorly", "possibly", "potentially", "predominantly", "present", "previously", "primarily", "promptly", "proud", "quickly", "ran", "readily", "ref", "refs", "related", "research", "resulted", "resulting", "results", "run", "sec", "section", "shed", "shes", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "slightly", "somethan", "specifically", "state", "states", "stop", "strongly", "substantially", "successfully", "sufficiently", "suggest", "thered", "thereof", "therere", "thereto", "theyd", "theyre", "thou", "thoughh", "thousand", "throug", "til", "tip", "ups", "usefully", "usefulness", "vol", 
"vols", "wed", "whats", "wheres", "whim", "whod", "whos", "widely", "words", "world", "youd", "youre"] \ No newline at end of file diff --git a/data-computer/v1/stopwords/fr.json b/data-computer/v1/stopwords/fr.json new file mode 100644 index 0000000..e86310d --- /dev/null +++ b/data-computer/v1/stopwords/fr.json @@ -0,0 +1 @@ +["abord", "afin", "aie", "ainsi", "allaient", "allo", "allo", "allons", "apres", "assez", "attendu", "aucun", "aucune", "aujourd", "auquel", "aura", "auront", "aussi", "autre", "autres", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avoir", "ayant", "bah", "beaucoup", "bien", "bigre", "boum", "bravo", "brrr", "car", "ceci", "cela", "celle", "celleci", "cellela", "celles", "cellesci", "cellesla", "celui", "celuici", "celuila", "cent", "cependant", "certain", "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceuxci", "ceuxla", "chacun", "chaque", "cher", "chere", "cheres", "chers", "chez", "chiche", "chut", "cinq", "cinquantaine", "cinquante", "cinquantieme", "cinquieme", "clac", "clic", "combien", "comme", "comment", "compris", "concernant", "contre", "couic", "crac", "dans", "debout", "dedans", "dehors", "dela", "depuis", "derriere", "des", "des", "desormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxieme", "deuxiemement", "devant", "devers", "devra", "different", "differente", "differentes", "differents", "dire", "divers", "diverse", "diverses", "dix", "dixhuit", "dixieme", "dixneuf", "dixsept", "doit", "doivent", "donc", "dont", "douze", "douzieme", "dring", "duquel", "durant", "effet", "elle", "ellememe", "elles", "ellesmemes", "encore", "entre", "envers", "environ", "est", "etant", "etaient", "etais", "etait", "etant", "etc", "ete", "etre", "etre", "euh", "eux", "euxmemes", "excepte", "facon", "fais", "faisaient", "faisant", "fait", "feront", "flac", "floc", "font", "gens", "hein", "helas", "hem", "hep", "hola", "hop", "hormis", "hors", "hou", "houp", 
"hue", "hui", "huit", "huitieme", "hum", "hurrah", "ils", "importe", "jusqu", "jusque", "laquelle", "las", "lequel", "les", "les", "lesquelles", "lesquels", "leur", "leurs", "longtemps", "lorsque", "lui", "luimeme", "maint", "mais", "malgre", "meme", "memes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mince", "moi", "moimeme", "moins", "mon", "moyennant", "neanmoins", "neuf", "neuvieme", "nombreuses", "nombreux", "non", "nos", "notre", "notre", "notres", "nous", "nousmemes", "nul", "ohe", "ole", "olle", "ont", "onze", "onzieme", "ore", "ouf", "ouias", "oust", "ouste", "outre", "paf", "pan", "par", "parmi", "partant", "particulier", "particuliere", "particulierement", "pas", "passe", "pendant", "personne", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "plein", "plouf", "plus", "plusieurs", "plutot", "pouah", "pour", "pourquoi", "premier", "premiere", "premierement", "pres", "proche", "psitt", "puisque", "quand", "quant", "quanta", "quantasoi", "quarante", "quatorze", "quatre", "quatrevingt", "quatrieme", "quatriemement", "que", "quel", "quelconque", "quelle", "quelles", "quelque", "quelques", "quelqu", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "revoici", "revoila", "rien", "sacrebleu", "sans", "sapristi", "sauf", "seize", "selon", "sept", "septieme", "sera", "seront", "ses", "sien", "sienne", "siennes", "siens", "sinon", "six", "sixieme", "soi", "soimeme", "soit", "soixante", "son", "sont", "sous", "stop", "suis", "suivant", "sur", "surtout", "tac", "tant", "tel", "telle", "tellement", "telles", "tels", "tenant", "tes", "tic", "tien", "tienne", "tiennes", "tiens", "toc", "toi", "toimeme", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutes", "treize", "trente", "tres", "trois", "troisieme", "troisiemement", "trop", "tsoin", "tsouin", "une", "unes", "uns", "vais", "vas", "vers", "via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", "voici", "voila", "vont", "vos", "votre", "votre", 
"votres", "vous", "vousmemes", "zut", "alors", "aucuns", "bon", "devrait", "dos", "droite", "debut", "essai", "faites", "fois", "force", "haut", "ici", "juste", "maintenant", "mine", "mot", "nommes", "nouveaux", "parce", "parole", "personnes", "piece", "plupart", "seulement", "soyez", "sujet", "tandis", "valeur", "voie", "voient", "etat", "etions"] \ No newline at end of file