diff --git a/data-computer/v1/charger-xml.cfg b/data-computer/v1/charger-xml.cfg
index 55237d0..b27b1f3 100644
--- a/data-computer/v1/charger-xml.cfg
+++ b/data-computer/v1/charger-xml.cfg
@@ -25,4 +25,4 @@
 value = get(env('pid')).replace('uid:/', '')
 
 [metrics]
-bucket = charger
+bucket = charger-xml
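For context: the only functional change to charger-xml.cfg is the [metrics] bucket rename, which gives the XML loader its own storage bucket ("charger-xml") instead of the generic "charger" one. The value expression kept as context above it simply strips the "uid:/" prefix from the processing id. A tiny Python illustration of that string transformation follows; the pid value is purely hypothetical, and how the resulting key is used by the metrics bucket is an assumption, not something stated in this diff.

# Hypothetical pid, only to illustrate the replace('uid:/', '') expression in charger-xml.cfg
pid = "uid:/0123456789"
key = pid.replace("uid:/", "")
print(key)  # -> "0123456789"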
diff --git a/data-computer/v1/rapido-ws.ini b/data-computer/v1/rapido-ws.ini
deleted file mode 100644
index 6eb8c8b..0000000
--- a/data-computer/v1/rapido-ws.ini
+++ /dev/null
@@ -1,57 +0,0 @@
-# Entrypoint output format
-mimeType = application/json
-# OpenAPI Documentation - JSON format (dot notation)
-post.operationId = post-v1-rapido-ws
-post.description = WS-rapido
-post.summary = ws-rapido
-post.tags.0 = data-computer
-post.requestBody.content.application/x-tar.schema.type = string
-post.requestBody.content.application/x-tar.schema.format = binary
-post.requestBody.required = true
-post.responses.default.description = Informations permettant de récupérer les données le moment venu
-post.parameters.0.description = Indenter le JSON résultant
-post.parameters.0.in = query
-post.parameters.0.name = indent
-post.parameters.0.schema.type = boolean
-post.parameters.1.description = URL pour signaler que le traitement est terminé
-post.parameters.1.in = header
-post.parameters.1.name = X-Webhook-Success
-post.parameters.1.schema.type = string
-post.parameters.1.schema.format = uri
-post.parameters.1.required = false
-post.parameters.2.description = URL pour signaler que le traitement a échoué
-post.parameters.2.in = header
-post.parameters.2.name = X-Webhook-Failure
-post.parameters.2.schema.type = string
-post.parameters.2.schema.format = uri
-post.parameters.2.required = false
-
-[env]
-path = generator
-value = rapido-ws
-
-[use]
-plugin = basics
-plugin = spawn
-# Step 1 (générique): Charger le fichier corpus
-
-[delegate]
-file = charger-xml.cfg
-# Step 2 (générique): Traiter de manière asynchnore les items reçus
-
-[fork]
-standalone = true
-logger = logger.cfg
-# Step 2.1 (spécifique): Lancer un calcul sur tous les items reçus
-
-[fork/exec]
-# command should be executable !
-command = ./v1/rapido-ws.py
-
-# Step 2.2 (générique): Enregister le résulat et signaler que le traitment est fini
-[fork/delegate]
-file = recorder.cfg
-
-# Step 3 : Renvoyer immédiatement un seul élément indiquant comment récupérer le résulat quand il sera prêt
-[delegate]
-file = recipient.cfg
\ No newline at end of file
diff --git a/data-computer/v1/rapido-ws.py b/data-computer/v1/rapido-ws.py
deleted file mode 100755
index d7adcd7..0000000
--- a/data-computer/v1/rapido-ws.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/python3
-
-import sys
-import json
-import sys
-import rapido.preprocessing as preprocessing
-import rapido.alignment as alignment
-import rapido.export as export
-import pandas as pd
-import spacy
-
-path = "v1/rapido/"
-tei_path = path + "new-persee-tei.xsl"
-annotations_path = path + "annotations.csv"
-ignore_path = path + "ignore.txt"
-
-def prePro(file,tei_path):
-    extractor = preprocessing.extractTei(file,tei_path)
-    extractor.extract_file()
-    df = extractor.df
-    remover = preprocessing.removeGreek(0.3)
-
-    listTextWithoutGreek = []
-    for listText in df["listText"].tolist():
-        listTextWithoutGreek.append(remover.rmvGreek(listText))
-    df["listTextWithoutGreek"] = listTextWithoutGreek
-    df['listTextWithoutGreekSplit'] = df['listTextWithoutGreek'].apply(preprocessing.dataToTxt)
-    df['listTitleSplit'] = df['Title'].apply(preprocessing.dataToTxt)
-    return df
-
-def pro(dfAnnotations,idText,listPageId,listText,listTitle):
-    dic = {}
-    aligner = alignment.alignWithText(dfAnnotations)
-    for j,text in enumerate(listText):
-        dic = aligner.isAnnotationInText(text,listPageId[j],dic,idText)
-    titleDic = {}
-    titleDic = aligner.isAnnotationInText(listTitle[0],"Title",titleDic,idText)
-    dic.update(titleDic)
-    postAligner = alignment.postProcessing(dfAnnotations,dic)
-    postAligner.removeIgnore()
-    postAligner.removeDuplicate()
-    postAligner.desambiguisation()
-    postAligner.confident()
-    return postAligner.dic,postAligner.rmv
-
-def rapido(dfAnnotations,dfText,ignoreWords,nlp):
-    exporter = export.exportJson(ignoreWords,nlp)
-    for index,row in dfText.iterrows():
-        idText = row["ID"]
-        listPageId = row["listPageId"]
-        listText = row["listTextWithoutGreekSplit"]
-        listTitle = row["listTitleSplit"]
-        dic,rmv = pro(dfAnnotations,idText,listPageId,listText,listTitle)
-        print("PRO OK",file=sys.stderr)
-        newListText = []
-        for text in listText:
-            text = " " + " ".join(text) + " "
-            text = text.lower()
-            newListText.append(text)
-        exporter.toJson(dic,rmv,newListText,listPageId, idText, listTitle)
-        print("EXPORT OK",file=sys.stderr)
-    return exporter.listPersee
-
-#Pre-loading
-dfAnnotations = pd.read_csv(annotations_path)
-with open(ignore_path) as f:
-    ignoreWords = [k[:-1].lower() for k in f.readlines()]
-nlp = spacy.load('fr_core_news_sm')
-nlp.add_pipe('melt_tagger', after='parser')
-nlp.add_pipe('french_lemmatizer', after='melt_tagger')
-
-
-for line in sys.stdin:
-    data = json.loads(line)
-    print(data,file=sys.stderr
-    )
-    print("RECEIVE",file=sys.stderr)
-    df = prePro(data["value"],tei_path)
-    print("PREPRO OK",file=sys.stderr)
-    jsonResult = rapido(dfAnnotations,df,ignoreWords,nlp)
-    print("RESULT",file=sys.stderr)
-    print(jsonResult,file=sys.stderr)
-    sys.stdout.write(json.dumps(jsonResult))
-    sys.stdout.write('\n')
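Both the removed rapido-ws.py above and the new rapido.py introduced below speak the same line-oriented protocol imposed by the [fork/exec] step: one JSON object per stdin line, one JSON result line written to stdout, and all diagnostics sent to stderr. The "id" and "value" field names are the ones the scripts themselves use; everything else in this minimal sketch, in particular the shape of the result, is illustrative only.

#!/usr/bin/python3
# Minimal sketch of the [fork/exec] worker contract: read one JSON object per
# stdin line, write one JSON result per stdout line, keep logs on stderr.
import sys
import json

for line in sys.stdin:
    data = json.loads(line)
    item_id = data.get("id")
    print(item_id, ": received", file=sys.stderr)         # diagnostics only on stderr
    result = {"id": item_id, "value": data.get("value")}  # real processing would go here
    sys.stdout.write(json.dumps(result))
    sys.stdout.write("\n")                                 # exactly one line per input item
    sys.stdout.flush()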
diff --git a/data-computer/v1/rapido.ini b/data-computer/v1/rapido.ini
new file mode 100644
index 0000000..6777f17
--- /dev/null
+++ b/data-computer/v1/rapido.ini
@@ -0,0 +1,57 @@
+# Entrypoint output format
+mimeType = application/json
+# OpenAPI Documentation - JSON format (dot notation)
+post.operationId = post-v1-rapido
+post.description = Web service à destination du projet Rapido. Ce web service prend en entrée un tar.gz comportant un dossier data incluant tous les documents xml à traiter. Il renvoie un json comportant les alignements que l'algorithme a pu faire entre le texte et le référentiel idRef.
+post.summary = Applique un algorithme d'alignement avec idRef prévu dans le cadre du projet Rapido
+post.tags.0 = data-computer
+post.requestBody.content.application/x-tar.schema.type = string
+post.requestBody.content.application/x-tar.schema.format = binary
+post.requestBody.required = true
+post.responses.default.description = Informations permettant de récupérer les données le moment venu
+post.parameters.0.description = Indenter le JSON résultant
+post.parameters.0.in = query
+post.parameters.0.name = indent
+post.parameters.0.schema.type = boolean
+post.parameters.1.description = URL pour signaler que le traitement est terminé
+post.parameters.1.in = header
+post.parameters.1.name = X-Webhook-Success
+post.parameters.1.schema.type = string
+post.parameters.1.schema.format = uri
+post.parameters.1.required = false
+post.parameters.2.description = URL pour signaler que le traitement a échoué
+post.parameters.2.in = header
+post.parameters.2.name = X-Webhook-Failure
+post.parameters.2.schema.type = string
+post.parameters.2.schema.format = uri
+post.parameters.2.required = false
+
+[env]
+path = generator
+value = rapido
+
+[use]
+plugin = basics
+plugin = spawn
+
+# Step 1 (générique): Charger le fichier corpus
+[delegate]
+file = charger-xml.cfg
+
+# Step 2 (générique): Traiter de manière asynchrone les items reçus
+[fork]
+standalone = true
+logger = logger.cfg
+
+# Step 2.1 (spécifique): Lancer un calcul sur tous les items reçus
+[fork/exec]
+# command should be executable !
+command = ./v1/rapido.py
+
+# Step 2.2 (générique): Enregistrer le résultat et signaler que le traitement est fini
+[fork/delegate]
+file = recorder.cfg
+
+# Step 3 : Renvoyer immédiatement un seul élément indiquant comment récupérer le résultat quand il sera prêt
+[delegate]
+file = recipient.cfg
\ No newline at end of file
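The new rapido.ini describes the whole contract of the service: the client POSTs a tar.gz whose data/ directory contains the XML documents to process, the response immediately tells the caller how to retrieve the result later, and the optional X-Webhook-Success / X-Webhook-Failure headers let the service call back when the asynchronous processing ends. A hedged client sketch follows; the base URL, the /v1/rapido route (inferred from post.operationId) and the local corpus/ file are placeholders not documented in this diff, while the content type, the webhook headers and the indent query parameter come straight from the configuration.

# Hedged client sketch for the endpoint declared in rapido.ini.
# BASE_URL, the /v1/rapido path and the corpus/ file are assumptions.
import io
import tarfile
import requests

BASE_URL = "https://example.org/data-computer"   # placeholder, not part of the diff

# Build a gzipped tar whose data/ folder holds the XML documents to align.
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w:gz") as tar:
    tar.add("corpus/doc1.xml", arcname="data/doc1.xml")

resp = requests.post(
    f"{BASE_URL}/v1/rapido",
    params={"indent": "true"},
    data=buf.getvalue(),
    headers={
        "Content-Type": "application/x-tar",
        "X-Webhook-Success": "https://example.org/hooks/ok",    # optional
        "X-Webhook-Failure": "https://example.org/hooks/fail",  # optional
    },
)
# The immediate response only explains how to fetch the result once it is ready.
print(resp.json())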
diff --git a/data-computer/v1/rapido.py b/data-computer/v1/rapido.py
new file mode 100755
index 0000000..767c3bd
--- /dev/null
+++ b/data-computer/v1/rapido.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python3
+
+import sys
+import json
+from rapido import preprocessing as preprocessing
+from rapido import alignment as alignment
+from rapido import export as export
+import pandas as pd
+import spacy
+from datetime import datetime
+
+path = "v1/rapido/"
+tei_path = path + "new-persee-tei.xsl"
+annotations_path = path + "annotations.csv"
+ignore_path = path + "ignore.txt"
+
+def prePro(file,tei_path):
+    extractor = preprocessing.extractTei(file,tei_path)
+    extractor.extract_file()
+    df = extractor.df
+    remover = preprocessing.removeGreek(0.3)
+
+    listTextWithoutGreek = []
+    for listText in df["listText"].tolist():
+        listTextWithoutGreek.append(remover.rmvGreek(listText))
+    df["listTextWithoutGreek"] = listTextWithoutGreek
+    df['listTextWithoutGreekSplit'] = df['listTextWithoutGreek'].apply(preprocessing.dataToTxt)
+    df['listTitleSplit'] = df['Title'].apply(preprocessing.dataToTxt)
+    return df
+
+def pro(dfAnnotations,idText,listPageId,listText,listTitle):
+    dic = {}
+    aligner = alignment.alignWithText(dfAnnotations)
+    for j,text in enumerate(listText):
+        dic = aligner.isAnnotationInText(text,listPageId[j],dic,idText)
+    titleDic = {}
+    titleDic = aligner.isAnnotationInText(listTitle[0],"Title",titleDic,idText)
+    dic.update(titleDic)
+    postAligner = alignment.postProcessing(dfAnnotations,dic)
+    postAligner.removeIgnore()
+    postAligner.removeDuplicate()
+    postAligner.desambiguisation()
+    postAligner.confident()
+    return postAligner.dic,postAligner.rmv
+
+def rapido(dfAnnotations,dfText,ignoreWords,nlp):
+    exporter = export.exportJson(ignoreWords,nlp)
+    for index,row in dfText.iterrows():
+        idText = row["ID"]
+        listPageId = row["listPageId"]
+        listText = row["listTextWithoutGreekSplit"]
+        listTitle = row["listTitleSplit"]
+        dic,rmv = pro(dfAnnotations,idText,listPageId,listText,listTitle)
+        newListText = []
+        for text in listText:
+            text = " " + " ".join(text) + " "
+            text = text.lower()
+            newListText.append(text)
+        exporter.toJson(dic,rmv,newListText,listPageId, idText, listTitle)
+    return exporter.listPersee
+
+#Pre-loading
+dfAnnotations = pd.read_csv(annotations_path)
+with open(ignore_path) as f:
+    ignoreWords = [k[:-1].lower() for k in f.readlines()]
+nlp = spacy.load('fr_core_news_sm')
+nlp.add_pipe('melt_tagger', after='parser')
+nlp.add_pipe('french_lemmatizer', after='melt_tagger')
+
+
+for line in sys.stdin:
+    data = json.loads(line)
+    id = data["id"]
+    print(id,":Data received ",datetime.now(),file=sys.stderr)
+    df = prePro(data["value"],tei_path)
+    jsonResult = rapido(dfAnnotations,df,ignoreWords,nlp)
+    print(id,":Data processed ",datetime.now(),file=sys.stderr)
+    sys.stdout.write(json.dumps(jsonResult))
+    sys.stdout.write('\n')
+    print(id,":Result sent ",datetime.now(),file=sys.stderr)
\ No newline at end of file
diff --git a/data-computer/v1/rapido/__init__.py b/data-computer/v1/rapido/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/data-computer/v1/rapido/__init__.py
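Two details of the new rapido.py are worth flagging. First, the melt_tagger and french_lemmatizer pipes added after spacy.load must be registered by an installed spaCy extension before add_pipe can resolve them (presumably the French tagger/lemmatizer plugin the project already relies on). Second, ignore.txt is read with k[:-1].lower(), which assumes every line, including the last, ends with a newline; if that assumption ever breaks, the last character of the final term is clipped. A slightly more defensive variant with the same intent (one lower-cased term per line, blank lines skipped), offered only as a sketch:

# Sketch of a more defensive ignore-list load; same intent as the pre-loading
# block in rapido.py, not a change proposed by this diff.
with open("v1/rapido/ignore.txt", encoding="utf-8") as f:
    ignoreWords = [line.strip().lower() for line in f if line.strip()]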
diff --git a/data-computer/v1/rapido/preprocessing.py b/data-computer/v1/rapido/preprocessing.py
index b0e7b06..1ccec8a 100644
--- a/data-computer/v1/rapido/preprocessing.py
+++ b/data-computer/v1/rapido/preprocessing.py
@@ -1,5 +1,4 @@
 import re
-import os
 import pandas as pd
 from itertools import chain
 from lxml import etree
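The preprocessing module keeps its lxml dependency, and both the old and the new entry script point extractTei at new-persee-tei.xsl. As an illustration of how such a stylesheet can be applied with lxml (an assumption about what extractTei does internally, not code taken from preprocessing.py):

# Illustration only: applying an XSLT such as new-persee-tei.xsl to a TEI/XML
# document with lxml. The input file name is hypothetical.
from lxml import etree

transform = etree.XSLT(etree.parse("v1/rapido/new-persee-tei.xsl"))
doc = etree.parse("data/doc1.xml")
result = transform(doc)
print(str(result)[:200])   # beginning of the transformed output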