# Newer
# Older
# web-services / data-computer / v1 / rapido / export.py
from spacy_lefff import LefffLemmatizer, POSTagger
from spacy.language import Language
import json

class exportJson:
    """Collect per-article entity records and write them out as JSON.

    Each call to :meth:`toJson` turns one article's detected words into two
    records that carry the same information in two layouts:

    * an "Inist" record, where the pages and sentences of an entity are
      flattened into ``@``-delimited strings (stored in ``listInist``);
    * a "Persee" record, where every sentence is a separate
      ``{"page": ..., "text": ...}`` entry (stored in ``listPersee``).

    :meth:`writeJson` dumps both accumulated lists to disk.
    """

    # Words that always receive this fixed idRef notice URL (with an empty
    # score) instead of the notices supplied by the caller.
    _SITE_NOTICES = {
        "amathonte": "https://www.idref.fr/027523217",
        "délos": "https://www.idref.fr/183212118",
        "thasos": "https://www.idref.fr/182710335",
        "delphes": "https://www.idref.fr/027322505",
        "rome": "https://www.idref.fr/02724301X",
    }

    # Words whose sentences are kept only when the tagger labels the word
    # "NC" in that sentence (see posVerif).
    _POS_CHECKED_WORDS = ("ferme", "porte", "base", "fort")

    # Separators used when several values are flattened into one string.
    _DELIMITER = "@"
    _DELIMITER_PERSEE = "@"

    def __init__(self, ignoreWords, nlp):
        """Store the pipeline and the ignore list, and start with empty output.

        Args:
            ignoreWords: container of words; keys from ``rmv`` whose word is
                in it are not re-added by :meth:`toJson`.
            nlp: callable applied to a sentence and returning an iterable of
                tokens exposing ``.text`` and ``._.melt_tagger``
                (presumably a spaCy pipeline with the ``melt_tagger``
                component added -- confirm against the caller).
        """
        self.nlp = nlp
        self.ignoreWords = ignoreWords
        self.listInist = []
        self.listPersee = []

    # NOTE: the two functions below take ``(nlp, name)`` and no ``self``.
    # They are spaCy component factories registered through
    # ``Language.factory`` when the class body is executed; they are not
    # meant to be called on an instance.
    @Language.factory('french_lemmatizer')
    def create_french_lemmatizer(nlp, name):
        """Build the Lefff lemmatizer component."""
        return LefffLemmatizer(after_melt=True, default=True)

    @Language.factory('melt_tagger')
    def create_melt_tagger(nlp, name):
        """Build the POS tagger component."""
        return POSTagger()

    def posVerif(self, word, sents):
        """Keep the sentences in which ``word`` is tagged "NC".

        Args:
            word: exact token text to look for.
            sents: iterable of sentence strings.

        Returns:
            A new list with, in order, every sentence that contains at least
            one token whose text equals ``word`` and whose ``melt_tagger``
            extension equals ``"NC"``.
        """
        return [
            text
            for text in sents
            if any(
                token.text == word and token._.melt_tagger == "NC"
                for token in self.nlp(text)
            )
        ]

    def _occurrences(self, word, page_id, listText, listPage, title):
        """Return the highlighted sentences for ``word`` on ``page_id``.

        Returns a ``(texts, pages)`` pair of equally long lists: the matching
        sentences (word wrapped in ``**``) and the page label of each one.
        """
        # Only occurrences surrounded by plain spaces are matched, so a word
        # at the very start/end of a sentence or next to punctuation is not
        # picked up.
        needle = " " + word + " "
        highlighted = " **" + word + "** "

        texts = []
        for index, pg in enumerate(listPage):
            if pg != page_id:
                continue
            sents = [
                sentence + '.'
                for sentence in listText[index].split('.')
                if needle in sentence
            ]
            if word in self._POS_CHECKED_WORDS:
                sents = self.posVerif(word, sents)
            texts.extend(s.replace(needle, highlighted) for s in sents)

        pages = [str(page_id)] * len(texts)

        # A key located in the title always contributes the full title.
        if page_id == "Title":
            texts.append(title)
            pages.append("Title")
        return texts, pages

    def toJson(self, dic, rmv, listText, listPage, idText, listTitle):
        """Build the Inist and Persee records of one article and store them.

        Args:
            dic: mapping whose keys are indexable as ``(word, article id,
                page)`` and whose values are lists of dicts holding at least
                the ``"ID"`` and ``"confident"`` entries. It is not modified.
                Only ``key[0]`` and ``key[2]`` influence the output.
            rmv: iterable of keys of the same shape. Each key whose word is
                not in ``self.ignoreWords`` is (re)inserted with a single
                empty notice, replacing any value ``dic`` had for it.
            listText: page texts, parallel to ``listPage``.
            listPage: page identifiers, compared with ``key[2]``.
            idText: value stored under ``"idArticle"`` in both records.
            listTitle: sequence whose first element is an iterable of strings
                joined with spaces to form the title. An empty sequence gives
                an empty title.

        Returns:
            None. One record is appended to ``self.listInist`` and one to
            ``self.listPersee``.
        """
        entries = dic.copy()
        for key in rmv:
            if key[0] not in self.ignoreWords:
                entries[key] = [{"ID": "", "value": "", "confident": ""}]

        title = " ".join(listTitle[0]) if listTitle else ""

        delimiter = self._DELIMITER
        delimiter_persee = self._DELIMITER_PERSEE

        # One entity per distinct word, in first-seen order. Dicts keep
        # insertion order, so they double as the output lists.
        inist_by_name = {}
        persee_by_name = {}
        sites = []

        for key, notices in entries.items():
            name = key[0]
            ids = [notice["ID"] for notice in notices]
            scores = [notice["confident"] for notice in notices]

            texts, pages = self._occurrences(
                name, key[2], listText, listPage, title
            )
            if not texts:
                # Words without any retained sentence are not exported.
                continue

            occurrences = [
                {"page": pg, "text": txt} for pg, txt in zip(pages, texts)
            ]

            if name in inist_by_name:
                # Same word seen under another key: merge into its entity.
                # NOTE(review): the original author marked this merge path
                # as still to be tested.
                entity = inist_by_name[name]
                entity["page"] += delimiter + delimiter.join(pages)
                entity["text"] += delimiter + delimiter.join(texts)
                persee_by_name[name]["occurences"].extend(occurrences)
                continue

            site_notice = self._SITE_NOTICES.get(name)
            if site_notice is not None:
                sites.append(name)
                notice = notice_persee = site_notice
                score = score_persee = ""
            else:
                notice = delimiter.join(ids)
                score = delimiter.join(scores)
                notice_persee = delimiter_persee.join(ids)
                score_persee = delimiter_persee.join(scores)

            inist_by_name[name] = {
                "name": name,
                "page": delimiter.join(pages),
                "text": delimiter.join(texts),
                "notice": notice,
                "score": score,
            }
            persee_by_name[name] = {
                "name": name,
                "occurences": occurrences,
                "notice": notice_persee,
                "score": score_persee,
            }

        # ``sites`` only receives a name the first time it is seen, so it is
        # already free of duplicates; copying it keeps a stable order.
        self.listInist.append({
            "idArticle": idText,
            "title": title,
            "sites": list(sites),
            "entite": list(inist_by_name.values()),
        })
        self.listPersee.append({
            "idArticle": idText,
            "title": title,
            "sites": list(sites),
            "entite": list(persee_by_name.values()),
        })

    def writeJson(self):
        """Write the accumulated records to two JSON files.

        ``self.listInist`` goes to ``json_data.json`` and ``self.listPersee``
        to ``json_data_persee.json``, both in the current working directory.
        Existing files are overwritten.
        """
        # ensure_ascii=False emits accented characters verbatim, so the files
        # must be opened as UTF-8 explicitly instead of in the locale's
        # default encoding.
        with open('json_data.json', 'w', encoding='utf-8') as outfile:
            json.dump(self.listInist, outfile, ensure_ascii=False, indent=4)

        with open('json_data_persee.json', 'w', encoding='utf-8') as outfile:
            json.dump(self.listPersee, outfile, ensure_ascii=False, indent=4)