#!/usr/bin/python3

import sys
import json
from rapido import preprocessing as preprocessing
from rapido import alignment as alignment
from rapido import export as export
import pandas as pd
import spacy
from datetime import datetime

# Directory holding the rapido resource files; a relative path, so it is
# resolved against the process's working directory.
path = "v1/rapido/"
# Stylesheet handed to preprocessing.extractTei via prePro().
tei_path = f"{path}new-persee-tei.xsl"
# CSV of annotations, loaded once with pandas at start-up.
annotations_path = f"{path}annotations.csv"
# Text file read line by line at start-up to build the ignore-word list.
ignore_path = f"{path}ignore.txt"

def prePro(file,tei_path):
    """Extract a document into a DataFrame and add the derived text columns.

    Runs ``preprocessing.extractTei`` on ``file`` with the stylesheet at
    ``tei_path`` and then adds three columns to the extractor's DataFrame:

    * ``listTextWithoutGreek``: every ``listText`` entry passed through
      ``removeGreek(0.3).rmvGreek``.
    * ``listTextWithoutGreekSplit``: ``preprocessing.dataToTxt`` applied to
      each ``listTextWithoutGreek`` entry.
    * ``listTitleSplit``: ``preprocessing.dataToTxt`` applied to each
      ``Title`` entry.

    Args:
        file: Input document, handed unchanged to ``extractTei`` (the main
            loop passes the ``"value"`` field of the incoming JSON record).
        tei_path: Path of the stylesheet given to ``extractTei``.

    Returns:
        The extractor's DataFrame, extended with the three columns above.
    """
    tei_extractor = preprocessing.extractTei(file, tei_path)
    tei_extractor.extract_file()
    frame = tei_extractor.df

    # NOTE(review): 0.3 is presumably a ratio threshold for Greek detection;
    # confirm against preprocessing.removeGreek.
    greek_filter = preprocessing.removeGreek(0.3)
    frame["listTextWithoutGreek"] = [
        greek_filter.rmvGreek(entry) for entry in frame["listText"].tolist()
    ]

    to_txt = preprocessing.dataToTxt
    frame["listTextWithoutGreekSplit"] = frame["listTextWithoutGreek"].apply(to_txt)
    frame["listTitleSplit"] = frame["Title"].apply(to_txt)
    return frame

def pro(dfAnnotations,idText,listPageId,listText,listTitle):
    """Align the annotations with one document and post-process the matches.

    Every entry of ``listText`` is passed, together with the page id at the
    same position in ``listPageId``, to ``alignWithText.isAnnotationInText``,
    which threads a single result dictionary through all the calls. The first
    entry of ``listTitle`` is aligned separately under the page id ``"Title"``
    and merged into that dictionary. The merged result then goes through the
    ``postProcessing`` steps in a fixed order: ``removeIgnore``,
    ``removeDuplicate``, ``desambiguisation``, ``confident``.

    Args:
        dfAnnotations: Annotations table given to both the aligner and the
            post-processor.
        idText: Identifier of the document, forwarded to the aligner.
        listPageId: Page ids, indexed in parallel with ``listText``.
        listText: Page texts to search for annotations.
        listTitle: Title texts; only the first entry is used.

    Returns:
        A ``(dic, rmv)`` tuple taken from the post-processor after all steps
        have run.
    """
    aligner = alignment.alignWithText(dfAnnotations)

    matches = {}
    for position, page_text in enumerate(listText):
        page_id = listPageId[position]
        matches = aligner.isAnnotationInText(page_text, page_id, matches, idText)

    # The title is aligned into its own fresh dictionary, then merged in.
    title_matches = aligner.isAnnotationInText(listTitle[0], "Title", {}, idText)
    matches.update(title_matches)

    cleaner = alignment.postProcessing(dfAnnotations, matches)
    cleaner.removeIgnore()
    cleaner.removeDuplicate()
    cleaner.desambiguisation()
    cleaner.confident()
    return cleaner.dic, cleaner.rmv

def rapido(dfAnnotations,dfText,ignoreWords,nlp):
    """Run alignment and JSON export for every row of ``dfText``.

    For each row, ``pro`` computes the alignment result from the ``ID``,
    ``listPageId``, ``listTextWithoutGreekSplit`` and ``listTitleSplit``
    columns. Each page is then rebuilt as one lower-cased string, with its
    items joined by single spaces and one space added at each end, and
    everything is handed to ``exportJson.toJson``.

    Args:
        dfAnnotations: Annotations table forwarded to ``pro``.
        dfText: DataFrame with the four columns listed above (as produced by
            ``prePro``).
        ignoreWords: Ignore-word list given to the exporter.
        nlp: spaCy pipeline given to the exporter.

    Returns:
        The exporter's ``listPersee`` attribute after all rows were exported.
    """
    exporter = export.exportJson(ignoreWords, nlp)
    for _, record in dfText.iterrows():
        text_id = record["ID"]
        page_ids = record["listPageId"]
        pages = record["listTextWithoutGreekSplit"]
        titles = record["listTitleSplit"]

        matches, removed = pro(dfAnnotations, text_id, page_ids, pages, titles)

        # One space-padded, lower-cased string per page.
        page_strings = [(" " + " ".join(page) + " ").lower() for page in pages]
        exporter.toJson(matches, removed, page_strings, page_ids, text_id, titles)
    return exporter.listPersee

# Pre-loading: these resources are loaded once at start-up and reused for
# every record read from stdin.
dfAnnotations = pd.read_csv(annotations_path)

# One lower-cased ignore word per line. Strip only the line terminator rather
# than blindly dropping the last character, so a final line without a trailing
# newline keeps its last letter. The encoding is explicit so the word list
# does not depend on the locale (pd.read_csv above already defaults to UTF-8).
with open(ignore_path, encoding="utf-8") as f:
    ignoreWords = [entry.rstrip("\n").lower() for entry in f]

# French spaCy model extended with a tagger and a lemmatizer component.
# NOTE(review): the 'melt_tagger' and 'french_lemmatizer' factories are not
# registered in this file; presumably one of the imports above does it, confirm.
nlp = spacy.load('fr_core_news_sm')
nlp.add_pipe('melt_tagger', after='parser')
nlp.add_pipe('french_lemmatizer', after='melt_tagger')


# Main loop: each non-blank stdin line is one JSON record with an "id" and a
# "value" field; one JSON line is written to stdout per record. Progress is
# logged to stderr so it never mixes with the results.
for line in sys.stdin:
    # A blank line (e.g. a stray trailing newline) is not a record; skip it
    # instead of letting json.loads raise.
    if not line.strip():
        continue
    data = json.loads(line)
    record_id = data["id"]
    print(record_id,":Data received ",datetime.now(),file=sys.stderr)
    df = prePro(data["value"],tei_path)
    jsonResult = rapido(dfAnnotations,df,ignoreWords,nlp)
    print(record_id,":Data processed ",datetime.now(),file=sys.stderr)
    sys.stdout.write(json.dumps(jsonResult))
    sys.stdout.write('\n')
    # stdout is block-buffered when it is a pipe: flush so the consumer gets
    # the result before it is reported as sent.
    sys.stdout.flush()
    print(record_id,":Result sent ",datetime.now(),file=sys.stderr)