#!/usr/bin/python3
import sys
import json
import sys
import rapido.preprocessing as preprocessing
import rapido.alignment as alignment
import rapido.export as export
import pandas as pd
import spacy
# Resource locations, all relative to the working directory of the process.
path = "v1/rapido/"
tei_path = f"{path}new-persee-tei.xsl"  # XSL stylesheet handed to the TEI extractor
annotations_path = f"{path}annotations.csv"  # annotation table read with pandas
ignore_path = f"{path}ignore.txt"  # one ignored word per line
def prePro(file, tei_path):
    """Extract a TEI document and add the derived text columns.

    Args:
        file: Document payload passed straight to ``preprocessing.extractTei``.
        tei_path: Path of the XSL stylesheet passed to the extractor.

    Returns:
        The extractor's ``df`` table, extended in place with three columns:
        ``listTextWithoutGreek`` (each ``listText`` entry run through
        ``removeGreek.rmvGreek``), ``listTextWithoutGreekSplit`` and
        ``listTitleSplit`` (``preprocessing.dataToTxt`` applied to
        ``listTextWithoutGreek`` and ``Title`` respectively).
    """
    tei_extractor = preprocessing.extractTei(file, tei_path)
    tei_extractor.extract_file()
    frame = tei_extractor.df

    # NOTE(review): 0.3 is presumably a ratio threshold for Greek content;
    # confirm against preprocessing.removeGreek.
    greek_filter = preprocessing.removeGreek(0.3)
    frame["listTextWithoutGreek"] = [
        greek_filter.rmvGreek(pages) for pages in frame["listText"].tolist()
    ]

    to_txt = preprocessing.dataToTxt
    frame['listTextWithoutGreekSplit'] = frame['listTextWithoutGreek'].apply(to_txt)
    frame['listTitleSplit'] = frame['Title'].apply(to_txt)
    return frame
def pro(dfAnnotations, idText, listPageId, listText, listTitle):
    """Align annotations with one document's pages and title, then clean up.

    Args:
        dfAnnotations: Annotation table given to the aligner and post-processor.
        idText: Identifier of the document being processed.
        listPageId: Page identifiers, indexed in parallel with ``listText``.
        listText: Page texts to search for annotations.
        listTitle: Title texts; only the first element is searched.

    Returns:
        A ``(dic, rmv)`` tuple read from the post-processor after its
        ``removeIgnore``, ``removeDuplicate``, ``desambiguisation`` and
        ``confident`` steps have run, in that order.
    """
    aligner = alignment.alignWithText(dfAnnotations)

    # Accumulate matches page by page; the aligner returns the updated mapping.
    matches = {}
    for position, page_text in enumerate(listText):
        page_id = listPageId[position]
        matches = aligner.isAnnotationInText(page_text, page_id, matches, idText)

    # The title is aligned into its own mapping, then merged into the page
    # matches (title entries win on key collisions, as with dict.update).
    title_matches = aligner.isAnnotationInText(listTitle[0], "Title", {}, idText)
    matches.update(title_matches)

    cleaner = alignment.postProcessing(dfAnnotations, matches)
    cleaner.removeIgnore()
    cleaner.removeDuplicate()
    cleaner.desambiguisation()
    cleaner.confident()
    return cleaner.dic, cleaner.rmv
def rapido(dfAnnotations, dfText, ignoreWords, nlp):
    """Run alignment and JSON export for every row of ``dfText``.

    Args:
        dfAnnotations: Annotation table forwarded to ``pro``.
        dfText: Table with the columns ``ID``, ``listPageId``,
            ``listTextWithoutGreekSplit`` and ``listTitleSplit``.
        ignoreWords: Words handed to ``export.exportJson``.
        nlp: NLP pipeline object handed to ``export.exportJson``.

    Returns:
        The exporter's ``listPersee`` attribute after all rows were exported.
    """
    exporter = export.exportJson(ignoreWords, nlp)

    for _, record in dfText.iterrows():
        doc_id = record["ID"]
        page_ids = record["listPageId"]
        pages = record["listTextWithoutGreekSplit"]
        titles = record["listTitleSplit"]

        matches, removed = pro(dfAnnotations, doc_id, page_ids, pages, titles)
        print("PRO OK", file=sys.stderr)

        # Re-join each page's tokens with single spaces, pad both ends with a
        # space and lowercase the result before handing it to the exporter.
        padded_pages = [(" " + " ".join(tokens) + " ").lower() for tokens in pages]

        exporter.toJson(matches, removed, padded_pages, page_ids, doc_id, titles)
        print("EXPORT OK", file=sys.stderr)

    return exporter.listPersee
# Pre-loading: resources shared by every input line are loaded once at start-up.
dfAnnotations = pd.read_csv(annotations_path)
with open(ignore_path) as f:
    # Strip only the line terminator. Slicing with [:-1] would also eat the
    # last real character of a final line that has no trailing newline.
    ignoreWords = [k.rstrip("\n").lower() for k in f]

# French spaCy model, extended with the 'melt_tagger' and 'french_lemmatizer'
# components; both must already be registered with spaCy when this runs.
nlp = spacy.load('fr_core_news_sm')
nlp.add_pipe('melt_tagger', after='parser')
nlp.add_pipe('french_lemmatizer', after='melt_tagger')

# Main loop: each stdin line is one JSON object whose "value" field holds the
# document; one JSON result line is written to stdout per input line.
# Progress and debug traces go to stderr so stdout stays machine-readable.
for line in sys.stdin:
    data = json.loads(line)
    print(data, file=sys.stderr)
    print("RECEIVE", file=sys.stderr)
    df = prePro(data["value"], tei_path)
    print("PREPRO OK", file=sys.stderr)
    jsonResult = rapido(dfAnnotations, df, ignoreWords, nlp)
    print("RESULT", file=sys.stderr)
    print(jsonResult, file=sys.stderr)
    sys.stdout.write(json.dumps(jsonResult))
    sys.stdout.write('\n')