web-services/data-computer/v1/rapido-ws.py at c159642aa1e97761cac8b6bec386fa89113619e0

Fork: 0
tdm / web-services
Find file
Newer
Older
web-services / data-computer / v1 / rapido-ws.py
Lucas Anki on 19 Dec 2023 2 KB Rapido web service branche
Raw Blame History
#!/usr/bin/python3

import sys
import json
import sys
import rapido.preprocessing as preprocessing
import rapido.alignment as alignment
import rapido.export as export
import pandas as pd
import spacy

# Directory holding the Rapido resources; it is a relative path, so it is
# resolved against the working directory the service is started from.
path = "v1/rapido/"
# Resource files, all located directly under `path`.
tei_path, annotations_path, ignore_path = (
    path + resource_name
    for resource_name in ("new-persee-tei.xsl", "annotations.csv", "ignore.txt")
)

def prePro(file,tei_path):
    """Build the pre-processed DataFrame for one incoming document.

    Args:
        file: The document payload, forwarded unchanged to
            ``preprocessing.extractTei``.
        tei_path: Path of the XSL stylesheet, forwarded unchanged to
            ``preprocessing.extractTei``.

    Returns:
        The extractor's DataFrame with three extra columns:
        ``listTextWithoutGreek`` (result of ``rmvGreek`` on each
        ``listText`` entry), ``listTextWithoutGreekSplit`` and
        ``listTitleSplit`` (``preprocessing.dataToTxt`` applied to
        ``listTextWithoutGreek`` and ``Title`` respectively).
    """
    tei_extractor = preprocessing.extractTei(file, tei_path)
    tei_extractor.extract_file()
    frame = tei_extractor.df

    # 0.3 is the value handed to removeGreek; its exact meaning is defined in
    # rapido.preprocessing (presumably a ratio threshold -- TODO confirm).
    greek_filter = preprocessing.removeGreek(0.3)
    frame["listTextWithoutGreek"] = [
        greek_filter.rmvGreek(pages) for pages in frame["listText"].tolist()
    ]

    # Derive the "split" columns from the filtered text and from the title.
    to_txt = preprocessing.dataToTxt
    frame['listTextWithoutGreekSplit'] = frame['listTextWithoutGreek'].apply(to_txt)
    frame['listTitleSplit'] = frame['Title'].apply(to_txt)
    return frame

def pro(dfAnnotations,idText,listPageId,listText,listTitle):
    """Align the annotations with one document, then post-process the matches.

    Args:
        dfAnnotations: Annotation table given to ``alignment.alignWithText``
            and ``alignment.postProcessing``.
        idText: Identifier of the document, forwarded to the aligner.
        listPageId: Page identifiers, indexed in parallel with ``listText``.
        listText: Page texts of the document.
        listTitle: Title entries; only ``listTitle[0]`` is aligned.

    Returns:
        A ``(dic, rmv)`` tuple read from the post-processor once all of its
        steps have run.
    """
    text_aligner = alignment.alignWithText(dfAnnotations)

    # The aligner receives the running dictionary and returns the updated one,
    # so matches accumulate page after page.
    matches = {}
    for page_index, page_text in enumerate(listText):
        matches = text_aligner.isAnnotationInText(
            page_text, listPageId[page_index], matches, idText
        )

    # The title is aligned into its own fresh dictionary, under the pseudo page
    # id "Title", and then merged over the page matches.
    title_matches = text_aligner.isAnnotationInText(
        listTitle[0], "Title", {}, idText
    )
    matches.update(title_matches)

    # Post-processing steps, in the order the service relies on.
    cleaner = alignment.postProcessing(dfAnnotations, matches)
    cleaner.removeIgnore()
    cleaner.removeDuplicate()
    cleaner.desambiguisation()
    cleaner.confident()
    return cleaner.dic, cleaner.rmv

def rapido(dfAnnotations,dfText,ignoreWords,nlp):
    """Run alignment and JSON export for every row of ``dfText``.

    Args:
        dfAnnotations: Annotation table forwarded to ``pro``.
        dfText: DataFrame with the columns ``ID``, ``listPageId``,
            ``listTextWithoutGreekSplit`` and ``listTitleSplit``.
        ignoreWords: Words handed to ``export.exportJson``.
        nlp: NLP pipeline object handed to ``export.exportJson``.

    Returns:
        The exporter's ``listPersee`` attribute after all rows were exported.
    """
    json_exporter = export.exportJson(ignoreWords, nlp)

    for _, record in dfText.iterrows():
        idText = record["ID"]
        listPageId = record["listPageId"]
        listText = record["listTextWithoutGreekSplit"]
        listTitle = record["listTitleSplit"]

        dic, rmv = pro(dfAnnotations, idText, listPageId, listText, listTitle)
        print("PRO OK", file=sys.stderr)

        # Re-join each page's tokens into one lower-cased string, padded with a
        # single space on both sides.
        padded_pages = [
            (" " + " ".join(tokens) + " ").lower() for tokens in listText
        ]

        json_exporter.toJson(dic, rmv, padded_pages, listPageId, idText, listTitle)
        print("EXPORT OK", file=sys.stderr)

    return json_exporter.listPersee

#Pre-loading
# Everything below is loaded once at start-up and reused for every request
# read from stdin.
dfAnnotations = pd.read_csv(annotations_path)
with open(ignore_path) as f:
    # Strip only the line terminator. The previous `k[:-1]` dropped the last
    # character unconditionally, which truncated the final word whenever the
    # file did not end with a newline.
    ignoreWords = [k.rstrip("\n").lower() for k in f]
nlp = spacy.load('fr_core_news_sm')
# NOTE(review): the 'melt_tagger' and 'french_lemmatizer' factories are not
# registered in this file; presumably importing the rapido modules registers
# them -- confirm.
nlp.add_pipe('melt_tagger', after='parser')
nlp.add_pipe('french_lemmatizer', after='melt_tagger')


# Service loop: each non-empty stdin line is one JSON object whose "value"
# field holds the document; one JSON result line is written to stdout per
# request. Progress and debug traces go to stderr so stdout stays clean.
for line in sys.stdin:
    # Skip blank lines (e.g. a trailing empty line) instead of letting
    # json.loads raise on them.
    if not line.strip():
        continue
    data = json.loads(line)
    print(data,file=sys.stderr)
    print("RECEIVE",file=sys.stderr)
    df = prePro(data["value"],tei_path)
    print("PREPRO OK",file=sys.stderr)
    jsonResult = rapido(dfAnnotations,df,ignoreWords,nlp)
    print("RESULT",file=sys.stderr)
    print(jsonResult,file=sys.stderr)
    sys.stdout.write(json.dumps(jsonResult))
    sys.stdout.write('\n')
    # stdout is block-buffered when it is a pipe; flush so the consumer
    # receives each result as soon as it is ready.
    sys.stdout.flush()