#!/opt/bitnami/python/bin/python3.7
# -*- coding: utf-8 -*-
"""Disease named-entity extraction over JSON lines.

Reads one JSON object per line on stdin, runs a token-classification model
over the text found under the object's "value" key, and writes the object
back on stdout with "value" replaced by ``{"diseases": [...]}``.
"""
import json
import re
import sys
import unicodedata
from typing import List

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Unicode combining diacritical marks, left behind by NFD decomposition.
_COMBINING_MARKS = re.compile("[\u0300-\u036f]")

# Maximum number of tokens handed to the model for one segment.
MAX_TOKENS = 512

# Texts at least this many *characters* long are split into sentences.
MAX_CHARS = 512

# Label ids emitted by the model: 0 = "B-disease", 1 = "I-disease",
# 2 = not a disease entity. Every id below this one is part of a mention.
NOT_DISEASE_LABEL = 2

# Spurious spaces that decoding leaves around punctuation, applied in order.
_SPACING_FIXES = (
    ('- ', '-'),
    (' -', '-'),
    ('( ', '('),
    (' (', '('),
    (') ', ')'),
    (' )', ')'),
    ('[ ', '['),
    (' [', '['),
    ('] ', ']'),
    (' ]', ']'),
)


# --- Text normalization ----------------------------------------------------

def remove_accents(text: str) -> str:
    """Return *text* with its combining diacritical marks stripped."""
    decomposed = unicodedata.normalize("NFD", text)
    return _COMBINING_MARKS.sub("", decomposed)


def normalizeText(text: str) -> str:
    """Return a comparison key for *text*: lowercase, no accents, no spaces."""
    return remove_accents(text.lower()).replace(" ", "")


# --- Model -----------------------------------------------------------------

# Loaded once at start-up; both paths are relative to the working directory.
tokenizer = AutoTokenizer.from_pretrained('./v1/diseases/models')
model = AutoModelForTokenClassification.from_pretrained(
    './v1/diseases/models',
    config='./v1/diseases/models/config.json',
)


def predict_formula_ml(input_text: str) -> List[str]:
    """Return the disease mentions the model finds in *input_text*.

    The text is tokenized (truncated to ``MAX_TOKENS`` tokens), every token
    is labelled by the model, and each maximal run of consecutive
    disease-labelled tokens is decoded back into one string.

    Returns:
        The decoded mentions in order of appearance; empty if none is found.
    """
    # Nothing to tag. This also skips the empty segments produced when a
    # long text is split on '.'.
    if not input_text.strip():
        return []

    encoding = tokenizer(
        input_text,
        return_tensors='pt',
        truncation=True,
        max_length=MAX_TOKENS,
    )

    with torch.no_grad():
        output = model(**encoding)
    # One label id per token of the single sequence in the batch.
    labels = torch.argmax(output.logits, dim=-1)[0].tolist()
    token_strings = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

    # Group consecutive disease-labelled tokens into mentions. Iterating the
    # pairs keeps the scan within bounds even when a mention runs up to the
    # very last token.
    mentions = []
    current = []
    for token, label in zip(token_strings, labels):
        if label < NOT_DISEASE_LABEL:
            current.append(token)
        elif current:
            mentions.append(current)
            current = []
    if current:
        mentions.append(current)

    return [
        tokenizer.decode(tokenizer.convert_tokens_to_ids(mention))
        for mention in mentions
    ]


def split_text(text: str) -> List[str]:
    """Split *text* on '.' when it is ``MAX_CHARS`` characters or longer.

    Shorter texts are returned as a single-element list. The threshold is
    counted in characters, not tokens, so a segment may still be truncated
    by the tokenizer.
    """
    if len(text) >= MAX_CHARS:
        return text.split('.')
    return [text]


# The parameter keeps its historical name for callers passing it by keyword.
def predict_formula_ml_list(list: List[str]) -> List[str]:
    """Run ``predict_formula_ml`` on every segment and concatenate the results."""
    output = []
    for segment in list:
        output.extend(predict_formula_ml(segment))
    return output


def curate_list(input_list: List[str]) -> List[str]:
    """Clean up decoded mentions.

    Mentions containing '#' are dropped, and the single spaces found next to
    '-', '(', ')', '[' and ']' are removed from the remaining ones.
    """
    output_list = []
    for mention in input_list:
        if '#' in mention:
            continue
        for old, new in _SPACING_FIXES:
            mention = mention.replace(old, new)
        output_list.append(mention)
    return output_list


def remove_duplicates(input_list: List[str]) -> List[str]:
    """Drop mentions whose normalized form was already seen.

    The first spelling encountered is kept and the input order is preserved.
    """
    seen = set()
    output_list = []
    for mention in input_list:
        key = normalizeText(mention)
        if key not in seen:
            seen.add(key)
            output_list.append(mention)
    return output_list


def main() -> None:
    """Process stdin line by line, writing one JSON object per input line."""
    for line in sys.stdin:
        data = json.loads(line)
        segments = split_text(data["value"])
        value = remove_duplicates(curate_list(predict_formula_ml_list(segments)))
        data["value"] = {"diseases": value}
        json.dump(data, sys.stdout, ensure_ascii=False)
        sys.stdout.write("\n")


if __name__ == "__main__":
    main()