Newer
Older
web-services / diseases-ner / v1 / diseases / tagger.py
#!/opt/bitnami/python/bin/python3.7
# -*- coding: utf-8 -*-

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import sys
import json
import re
import unicodedata


# Normalize 
# Normalisation du texte :
def remove_accents(text):
    text = unicodedata.normalize("NFD", text)
    text = re.sub("[\u0300-\u036f]", "", text)
    return text

def normalizeText(text):
    text = text.lower()
    text = remove_accents(text).replace(" ","")
    return text


## Predicts developped formulas
# Load model
tokenizer = AutoTokenizer.from_pretrained('./v1/diseases/models')
model = AutoModelForTokenClassification.from_pretrained('./v1/diseases/models', config='./v1/diseases/models/config.json')


# predicts text
def predict_formula_ml(input_text):
    #tokenizer 
    tokens = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512)

    # Predicts
    with torch.no_grad():
        output = model(**tokens)

    predictions = torch.argmax(output.logits, dim=-1)

    # Get token that contains "disease"
    tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
    disease_tokens_list = []
    i=0

    while i < len(predictions[0]):
        # prediction [0][i] depends of i : {0 : "B-disease" , 1 : "I-disease" , 2: "NOT a disease NE"}
        k=0
        if predictions[0][i] < 2:
            disease_tokens_toappend = []
            while predictions[0][i+k] < 2:
                disease_tokens_toappend.append(tokens[i+k])
                k+=1
            disease_tokens_list.append(disease_tokens_toappend)
        i+=k+1
    value = []
    for disease_tokens in disease_tokens_list:
        value.append(tokenizer.decode(tokenizer.convert_tokens_to_ids(disease_tokens)))
    return value

# if text too long
def split_text(text):
    if len(text)>=512:
        text_split = text.split('.')
    else:
        text_split = [text]
    return text_split

# predicts text after, either it is splitted or not
def predict_formula_ml_list(list):
    output = []
    for elt in list:
        output+= predict_formula_ml(elt)
    return output

# remove bad space in outputs
def curate_list(input_list):
    output_list = []
    for elt in input_list:
        if '#' not in elt:
            output_list.append(
                        elt.replace('- ','-').replace(' -','-').replace('( ','(').replace(' (','(').replace(') ',')').replace(' )',')').replace('[ ','[')
                        .replace(' [','[').replace('] ',']').replace(' ]',']')
                        )
    return output_list

#Disambigusate formulas :

#preprocessing : remove duplicates elements
def remove_duplicates(input_list):
    output_list = []
    normalized_list = []
    for elt in input_list:
        if normalizeText(elt) not in normalized_list:
            output_list.append(elt)
            normalized_list.append(normalizeText(elt))
    return output_list


# beginning of the ws  
for line in sys.stdin:
    data = json.loads(line)
    # Use the model to find NER
    value = remove_duplicates(curate_list(predict_formula_ml_list(split_text(data["value"]))))
    # Standardization
    data["value"] = {"diseases":value} # remove_duplicates(value)
    json.dump(data, sys.stdout, ensure_ascii=False)
    sys.stdout.write("\n")