"""Align annotation surface forms with page text and post-process the matches."""

import ast
import re
from collections import Counter
from itertools import combinations


class alignWithText:
    """Locate annotation surface forms inside tokenised page text.

    ``dfAnnotations`` is a pandas-style table (it must provide ``iterrows()``)
    whose rows expose ``ID``, ``Annotation`` and ``trueAnnotation``.  The last
    two hold Python list literals stored as strings; they are parsed with
    ``ast.literal_eval`` and indexed in parallel.
    """

    def __init__(self, dfAnnotations):
        self.dfAnnotations = dfAnnotations

    def isAnnotationInText(self, text, page, dictionnary, idText):
        """Record every annotation surface form that occurs in ``text``.

        Args:
            text: iterable of string tokens; they are joined with single
                spaces, padded with one space on each side and lower-cased.
            page: page identifier, stored as the third element of each key.
            dictionnary: mapping ``(surface form, idText, page) -> list of
                entries``.  It is updated in place.
            idText: text identifier, stored as the second element of each key.

        Returns:
            The same ``dictionnary`` object.  Each entry is a dict with the
            keys ``"ID"``, ``"value"`` (the ``trueAnnotation`` item at the
            same index as the matched surface form) and ``"start"`` (offset
            of the match in the padded text, i.e. of the space preceding the
            surface form).  An entry is not added when the key already holds
            one with the same ``ID`` and ``value``.
        """
        haystack = (" " + " ".join(text) + " ").lower()

        for _, row in self.dfAnnotations.iterrows():
            surface_forms = ast.literal_eval(row["Annotation"])
            canonical_forms = ast.literal_eval(row["trueAnnotation"])

            for position, surface in enumerate(surface_forms):
                # Surrounding spaces restrict matches to whole tokens.
                needle = " " + surface.lower() + " "
                # Forms shorter than three characters are skipped.
                if len(needle) < 5:
                    continue
                # Kept for backward compatibility; presumably this was a
                # workaround for regex syntax errors before escaping existed.
                if "(" in needle:
                    continue

                key = (needle[1:-1], idText, page)
                # The surface form is literal text, not a pattern: escape it
                # so characters such as ".", "+", "[" or ")" neither change
                # the meaning of the search nor raise re.error.
                for match in re.finditer(re.escape(needle), haystack):
                    value = canonical_forms[position]
                    entries = dictionnary.setdefault(key, [])
                    already_known = any(
                        entry["ID"] == row["ID"] and entry["value"] == value
                        for entry in entries
                    )
                    if not already_known:
                        entries.append(
                            {"ID": row["ID"], "value": value, "start": match.start()}
                        )
        return dictionnary


class postProcessing:
    """Filter and score the mapping produced by ``alignWithText``.

    ``dic`` maps ``(surface form, idText, page)`` to a list of entry dicts
    with at least the keys ``"ID"``, ``"value"`` and ``"start"``.
    ``dfAnnotations`` is a pandas-style DataFrame with an ``ID`` column and
    string-encoded list columns ``Annotation`` and ``Ignore``.
    """

    def __init__(self, dfAnnotations, dic):
        self.dfAnnotations = dfAnnotations
        self.dic = dic
        # Keys discarded by the most recent removeIgnore() call.
        self.rmv = []

    def _literalColumn(self, idRef, column):
        """Return the parsed literal stored in ``column`` for the first row whose ID is ``idRef``."""
        rows = self.dfAnnotations[self.dfAnnotations["ID"] == idRef]
        return ast.literal_eval(rows.reset_index(drop=True)[column][0])

    def removeDuplicate(self):
        """Drop overlapping keys that share a page, keeping the longer value.

        Only the first entry of each key is inspected.  Two keys overlap when
        those entries have the same ``start`` or when one ``value`` contains
        the other.  The key with the shorter value is removed from
        ``self.dic`` in place.
        """
        doomed = set()
        for key1, key2 in combinations(list(self.dic), 2):
            # NOTE(review): only the page (third key element) is compared,
            # not idText - confirm that dic never mixes several texts.
            if key1[2] != key2[2]:
                continue

            first1 = self.dic[key1][0]
            first2 = self.dic[key2][0]
            value1, value2 = first1["value"], first2["value"]
            overlapping = (
                first1["start"] == first2["start"]
                or value1 in value2
                or value2 in value1
            )
            if not overlapping:
                continue

            # NOTE(review): with equal lengths BOTH keys are discarded; this
            # mirrors the historical behaviour - confirm it is intended.
            if len(value1) >= len(value2):
                doomed.add(key2)
            if len(value1) <= len(value2):
                doomed.add(key1)

        for key in doomed:
            self.dic.pop(key)

    def removeIgnore(self):
        """Discard entries whose value is listed in their ``Ignore`` column.

        An ignored entry is kept when another surface form from the same
        row's ``Annotation`` list (different from the entry's value) is among
        the values currently present in ``self.dic``.  Keys left without any
        entry are removed and collected in ``self.rmv``.  ``self.dic`` is
        replaced by a new dict.
        """
        present_values = [
            entry["value"] for entries in self.dic.values() for entry in entries
        ]

        kept = {}
        removed = []
        for key, entries in self.dic.items():
            survivors = []
            for entry in entries:
                ignored = self._literalColumn(entry["ID"], "Ignore")
                aliases = self._literalColumn(entry["ID"], "Annotation")
                if entry["value"] not in ignored:
                    survivors.append(entry)
                elif any(
                    alias != entry["value"] and alias in present_values
                    for alias in aliases
                ):
                    survivors.append(entry)
            if survivors:
                kept[key] = survivors
            else:
                removed.append(key)

        self.dic = kept
        self.rmv = removed

    def desambiguisation(self):
        """Resolve keys holding several entries in favour of the most frequent IDs.

        IDs are counted over every entry of ``self.dic``.  For a key with
        more than one entry, only the entries whose ID has the highest count
        are kept, provided that count exceeds one; otherwise the key is left
        untouched.  ``self.dic`` is replaced by a new dict.
        """
        id_counts = Counter(
            entry["ID"] for entries in self.dic.values() for entry in entries
        )

        resolved = {}
        for key, entries in self.dic.items():
            resolved[key] = entries
            if len(entries) <= 1:
                continue
            best = max(id_counts[entry["ID"]] for entry in entries)
            if best > 1:
                resolved[key] = [
                    entry for entry in entries if id_counts[entry["ID"]] == best
                ]
        self.dic = resolved

    def confident(self):
        """Attach a ``"confident"`` label to every entry of ``self.dic``.

        For each distinct ID present, the surface forms of its ``Annotation``
        list are tallied once.  An entry's score is the sum of those tallies
        over its own surface forms, minus ``len(entry)``, plus one.  With
        ``n`` the number of surface forms of the entry's row, the label is
        ``"PP(0)"`` when ``score <= n``, ``"P(score - n)"`` when
        ``score < 2 * n`` and ``"TP(score - n)"`` otherwise.
        """
        alias_counts = Counter()
        seen_ids = []
        for entries in self.dic.values():
            for entry in entries:
                if entry["ID"] in seen_ids:
                    continue
                alias_counts.update(self._literalColumn(entry["ID"], "Annotation"))
                seen_ids.append(entry["ID"])

        for entries in self.dic.values():
            for entry in entries:
                aliases = self._literalColumn(entry["ID"], "Annotation")
                occurrences = sum(alias_counts[alias] for alias in aliases)
                # NOTE(review): len(entry) is the number of keys of the entry
                # dict (it grows once "confident" has been set) - confirm
                # this is the intended quantity.
                score = occurrences - len(entry) + 1
                surplus = score - len(aliases)
                if score <= len(aliases):
                    label = "PP(0)"
                elif score < 2 * len(aliases):
                    label = "P(" + str(surplus) + ")"
                else:
                    label = "TP(" + str(surplus) + ")"
                entry["confident"] = label