diff --git a/biblio-ref/swagger.json b/biblio-ref/swagger.json index 1360d1a..63f90ee 100644 --- a/biblio-ref/swagger.json +++ b/biblio-ref/swagger.json @@ -3,7 +3,7 @@ "info": { "title": "biblio-ref - Valide une référence bibliographique", "summary": "Si un DOI est trouvé dans la référence bibliographique, valide la référence et indique si elle est rétractée", - "version": "1.0.0", + "version": "1.1.0", "termsOfService": "https://services.istex.fr/", "contact": { "name": "Inist-CNRS", diff --git a/chem-ner/README.md b/chem-ner/README.md index 6a3192b..4ff4314 100644 --- a/chem-ner/README.md +++ b/chem-ner/README.md @@ -1,35 +1,4 @@ # chem-ner -Cette instance propose un outil de reconnaissance d'entités nommées en chimie. - -## Configuration - -L'application à utiliser est XXX. - -## Utilisation - -- [v1/chem/tagger](#v1) - -### v1 - -Ce web-service renvoie la liste des entités nommées en chimie présentes dans le texte. - -Il prend en entrée du JSON avec deux champs, `id` et `value`, et renvoie un JSON avec le la liste des entités en fonction de leur étiquettes dans le champ `value`. - -#### Exemple de v1/first-name/gender - -Entrée - -```bash -$ cat < Le code source de ces services est désormais hébergé [sur +> GitHub](https://github.com/Inist-CNRS/web-services/tree/main/services/chem-ner#readme) \ No newline at end of file diff --git a/chem-ner/examples.http b/chem-ner/examples.http index 18305e9..967806c 100644 --- a/chem-ner/examples.http +++ b/chem-ner/examples.http @@ -1,18 +1,18 @@ -# Ces exemples peuvent être exécutés directement dans VSCode, en utilisant l'extension REST Client (humao.rest-client) +# These examples can be used directly in VSCode, using the REST Client extension (humao.rest-client) +# They are important, because they are used to generate the tests.hurl file. 
+ # Décommenter/commenter les lignes voulues pour tester localement -# @baseUrl=http://localhost:31976 -@baseUrl=http://chem-ner.tdmservices.intra.inist.fr/ -# @baseUrl=https://chem-ner.services.istex.fr/ +@host=http://localhost:31976 +# @host=https://chem-ner.services.istex.fr ### -# @name v1ChemNer -# Reconnaissance d'entités nommées de chimie -POST {{baseUrl}}/v1/chem/tagger?indent=true HTTP/1.1 +# @name v1chemTagger +# Trouve les entités nommées de chimie +POST {{host}}/v1/chem/tagger?indent=true HTTP/1.1 Content-Type: application/json [ - { - "id": 1, - "value": "This is three chemical name entity: methanol, 2-propanol, or CO2" - } + { + "id":1, + "value": "This is three chemical name entity: methanol, 2-propanol, or CO2" } ] diff --git a/chem-ner/swagger.json b/chem-ner/swagger.json index 52e48ec..c9ce9fe 100644 --- a/chem-ner/swagger.json +++ b/chem-ner/swagger.json @@ -1,33 +1,33 @@ { - "openapi": "3.0.0", - "info": { - "title": "chem-ner - Reconnaissance d'entités nommées en chimie", - "summary": "Reconnaissance des entités nommées de chimie", - "version": "1.0.6", - "termsOfService": "https://services.istex.fr/", - "contact": { - "name": "Inist-CNRS", - "url": "https://www.inist.fr/nous-contacter/" - } - }, - "servers": [ - { - "x-comment": "Will be automatically completed by the ezs server." 
- }, - { - "url": "http://vptdmservices.intra.inist.fr:49241/", - "description": "Latest version for production", - "x-profil": "Standard" - } - ], - "tags": [ - { - "name": "chem-ner", - "description": "Reconnaissance d'entités nommées en chimie", - "externalDocs": { - "description": "Plus de documentation", - "url": "https://gitbucket.inist.fr/tdm/web-services/chem-ner/README.md" - } - } - ] - } \ No newline at end of file + "openapi": "3.0.0", + "info": { + "title": "chem-ner - Trouve des entités nommées de Chimie dans un texte", + "summary": "Renvoie un JSON comportant un champ \"chemical\" et un champ \"chemical_disambiguisate\"", + "version": "3.0.4", + "termsOfService": "https://services.istex.fr/", + "contact": { + "name": "Inist-CNRS", + "url": "https://www.inist.fr/nous-contacter/" + } + }, + "servers": [ + { + "x-comment": "Will be automatically completed by the ezs server." + }, + { + "url": "http://vptdmservices.intra.inist.fr:49249/", + "description": "Latest version for production", + "x-profil": "Standard" + } + ], + "tags": [ + { + "name": "chem-ner", + "description": "Trouve des entités nommées de Chimie dans un texte", + "externalDocs": { + "description": "Plus de documentation", + "url": "https://github.com/inist-cnrs/web-services/tree/main/services/chem-ner" + } + } + ] +} diff --git a/chem-ner/tests.hurl b/chem-ner/tests.hurl new file mode 100644 index 0000000..9331492 --- /dev/null +++ b/chem-ner/tests.hurl @@ -0,0 +1,24 @@ +POST {{host}}/v1/chem/tagger?indent=true +content-type: application/json +[ + { + "id":1, + "value": "This is three chemical name entity: methanol, 2-propanol, or CO2" } +] + + +HTTP 200 +[{ + "id": 1, + "value": { + "chemical": [ + "methanol", + "2-propanol", + "CO2" + ], + "chemical_disambiguisate": [ + "methanol", + "propan-2-ol" + ] + } +}] \ No newline at end of file diff --git a/chem-ner/v1/chem/models.dvc b/chem-ner/v1/chem/models.dvc deleted file mode 100644 index 024c62a..0000000 --- 
a/chem-ner/v1/chem/models.dvc +++ /dev/null @@ -1,5 +0,0 @@ -outs: -- md5: cf11342b09012680a08f928f8182c597.dir - size: 462540534 - nfiles: 8 - path: models diff --git a/chem-ner/v1/chem/tagger.ini b/chem-ner/v1/chem/tagger.ini deleted file mode 100644 index eab2d85..0000000 --- a/chem-ner/v1/chem/tagger.ini +++ /dev/null @@ -1,45 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -mimeType = application/json - -post.description = Trouve des entités nommées de Chimie dans un texte -post.responses.default.description = Renvoie un Json composé comportant d'un champs "chemical" et d'un champ "chemical_disambiguisate" -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Recherche d'entité nommées de Chimie -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = indent -post.parameters.0.schema.type = boolean -post.parameters.0.description = Indent or not the JSON Result - -# Examples - -post.requestBody.content.application/json.example.0.id = 1 -post.requestBody.content.application/json.example.0.value = This is three chemical name entity: methanol, 2-propanol, or CO2. -post.responses.default.content.application/json.example.0.id = 1 -post.responses.default.content.application/json.example.0.value.chemical.0 = methanol -post.responses.default.content.application/json.example.0.value.chemical.1 = 2-propanol -post.responses.default.content.application/json.example.0.value.chemical.2 = CO2 -post.responses.default.content.application/json.example.0.value.chemical_siambiguisate.0 = methanol -post.responses.default.content.application/json.example.0.value.chemical_siambiguisate.1 = propan-2-ol - - -[use] -# exec -plugin = @ezs/spawn -# JSONParse -plugin = @ezs/basics - -[JSONParse] -separator = * - -[expand] -path = value -size = 100 - -[expand/exec] -# command should be executable ! 
-command = ./v1/chem/tagger.py - -[dump] -indent = env('indent', false) diff --git a/chem-ner/v1/chem/tagger.py b/chem-ner/v1/chem/tagger.py deleted file mode 100644 index 0ba3402..0000000 --- a/chem-ner/v1/chem/tagger.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/opt/bitnami/python/bin/python3.7 -# -*- coding: utf-8 -*- - -import torch -from transformers import AutoTokenizer, AutoModelForTokenClassification -import re -import os -import sys -import json -import logging -import unicodedata -import pickle - -# Remove logs from TF -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' -logging.getLogger('tensorflow').setLevel(logging.ERROR) - -# Normalize -# Normalisation du texte : -def remove_accents(text): - text = unicodedata.normalize("NFD", text) - text = re.sub("[\u0300-\u036f]", "", text) - return text - -def normalizeText(text): - text = text.lower() - text = remove_accents(text).replace(" ","") - return text - - -with open("./v1/chem/models/dic-name-iupac-filtered-enrich.pkl",'rb') as f_dic: - dict_name_iupac = pickle.load(f_dic) - -## Predicts developped formulas -# Load model -tokenizer = AutoTokenizer.from_pretrained('./v1/chem/models') -model = AutoModelForTokenClassification.from_pretrained('./v1/chem/models', config='./v1/chem/models/config.json') - -# predicts text -def predict_formula_ml(input_text): - #tokenizer - tokens = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512) - - # Predicts - with torch.no_grad(): - output = model(**tokens) - - predictions = torch.argmax(output.logits, dim=-1) - - # Get token that contains "CHEMICAL" - tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0]) - chemical_tokens_list = [] - i=0 - - while i < len(predictions[0]): - # prediction [0][i] depends of i : {0 : "B-CHEMICAL" , 1 : "I-CHEMICAL" , 2: "NOT a chemical NE"} - k=0 - if predictions[0][i] < 2: - chemical_tokens_toappend = [] - while predictions[0][i+k] < 2: - chemical_tokens_toappend.append(tokens[i+k]) - k+=1 - 
chemical_tokens_list.append(chemical_tokens_toappend) - i+=k+1 - value = [] - for chemical_tokens in chemical_tokens_list: - value.append(tokenizer.decode(tokenizer.convert_tokens_to_ids(chemical_tokens))) - return value - -# if text too long -def split_text(text): - if len(text)>=512: - text_split = text.split('.') - else: - text_split = [text] - return text_split - -# predicts text after, either it is splitted or not -def predict_formula_ml_list(list): - output = [] - for elt in list: - output+= predict_formula_ml(elt) - return output - -# remove bad space in outputs -def curate_list(input_list): - output_list = [] - for elt in input_list: - if '#' not in elt: - output_list.append( - elt.replace('- ','-').replace(' -','-').replace('( ','(').replace(' (','(').replace(') ',')').replace(' )',')').replace('[ ','[') - .replace(' [','[').replace('] ',']').replace(' ]',']') - ) - return output_list - -#Disambigusate formulas : - -#preprocessing : remove duplicates elements -def remove_duplicates(input_list): - output_list = [] - normalized_list = [] - for elt in input_list: - if normalizeText(elt) not in normalized_list: - output_list.append(elt) - normalized_list.append(normalizeText(elt)) - return output_list - -def disambiguisate_formula(input_list): - output_list = [] - for elt in input_list: - try: - output_list.append(dict_name_iupac[normalizeText(elt)]) - except: - continue - return output_list - - - -# beginning of the ws -for line in sys.stdin: - data = json.loads(line) - # Use the model to find NER - value = remove_duplicates(curate_list(predict_formula_ml_list(split_text(data["value"])))) - # Standardization - data["value"] = {"chemical":value, "chemical_disambiguisate":remove_duplicates(disambiguisate_formula(value))} - json.dump(data, sys.stdout, ensure_ascii=False) - sys.stdout.write("\n") diff --git a/diseases-ner/README.md b/diseases-ner/README.md index a360f01..bf5c6c3 100644 --- a/diseases-ner/README.md +++ b/diseases-ner/README.md @@ -1,35 +1,4 @@ # 
diseases-ner -Cette instance propose un outil de reconnaissance d'entités nommées de maladies. - -## Configuration - -L'application à utiliser est XXX. - -## Utilisation - -- [v1/diseases/tagger](#v1) - -### v1 - -Ce web-service renvoie la liste des entités nommées en chimie présentes dans le texte. - -Il prend en entrée du JSON avec deux champs, `id` et `value`, et renvoie un JSON avec le la liste des entités en fonction de leur étiquettes dans le champ `value`. - -#### Exemple de v1/first-name/gender - -Entrée - -```bash -$ cat < Le code source de ces services est désormais hébergé [sur +> GitHub](https://github.com/Inist-CNRS/web-services/tree/main/services/diseases-ner#readme) \ No newline at end of file diff --git a/diseases-ner/examples.http b/diseases-ner/examples.http index 8cc8a30..0f021ec 100644 --- a/diseases-ner/examples.http +++ b/diseases-ner/examples.http @@ -1,18 +1,19 @@ -# Ces exemples peuvent être exécutés directement dans VSCode, en utilisant l'extension REST Client (humao.rest-client) +# These examples can be used directly in VSCode, using the REST Client extension (humao.rest-client) +# They are important, because they are used to generate the tests.hurl file. + # Décommenter/commenter les lignes voulues pour tester localement -# @baseUrl=http://localhost:31976 -@baseUrl=http://diseases-ner.tdmservices.intra.inist.fr/ -# @baseUrl=https://diseases-ner.services.istex.fr/ +@host=http://localhost:31976 +# @host=https://diseases-ner.services.istex.fr ### -# @name v1DiseasesNer -# Reconnaissance d'entités nommées de maladies -POST {{baseUrl}}/v1/diseases/tagger?indent=true HTTP/1.1 +# @name v1diseasesTagger +# Trouve les entités nommées de maladies +POST {{host}}/v1/diseases/tagger?indent=true HTTP/1.1 Content-Type: application/json [ - { - "id": 1, - "value":"They have been used in the prevention and treatment of malaria and autoimmune diseases, including systemic lupus erythematosus and rheumatoid arthritis." 
- } + { + "id":1, + "value": "They have been used in the prevention and treatment of malaria and autoimmune diseases, including systemic lupus erythematosus and rheumatoid arthritis." + } ] diff --git a/diseases-ner/swagger.json b/diseases-ner/swagger.json index 7c414b6..a3c13e0 100644 --- a/diseases-ner/swagger.json +++ b/diseases-ner/swagger.json @@ -1,33 +1,33 @@ { - "openapi": "3.0.0", - "info": { - "title": "diseases-ner - Reconnaissance d'entités nommées de maladies", - "summary": "Reconnaissance des entités nommées de maladies", - "version": "1.0.6", - "termsOfService": "https://services.istex.fr/", - "contact": { - "name": "Inist-CNRS", - "url": "https://www.inist.fr/nous-contacter/" - } - }, - "servers": [ - { - "x-comment": "Will be automatically completed by the ezs server." - }, - { - "url": "http://vptdmservices.intra.inist.fr:49242/", - "description": "Latest version for production", - "x-profil": "Standard" - } - ], - "tags": [ - { - "name": "diseases-ner", - "description": "Reconnaissance d'entités nommées de maladies", - "externalDocs": { - "description": "Plus de documentation", - "url": "https://gitbucket.inist.fr/tdm/web-services/chem-ner/README.md" - } - } - ] - } \ No newline at end of file + "openapi": "3.0.0", + "info": { + "title": "diseases-ner - Trouve des entités nommées de maladies dans un texte", + "summary": "Renvoie un JSON comportant un champ `diseases` correspondant aux entités nommées de maladies trouvées", + "version": "1.0.9", + "termsOfService": "https://services.istex.fr/", + "contact": { + "name": "Inist-CNRS", + "url": "https://www.inist.fr/nous-contacter/" + } + }, + "servers": [ + { + "x-comment": "Will be automatically completed by the ezs server." 
+ }, + { + "url": "http://vptdmservices.intra.inist.fr:49250/", + "description": "Latest version for production", + "x-profil": "Standard" + } + ], + "tags": [ + { + "name": "diseases-ner", + "description": "Trouve des entités nommées de maladies dans un texte", + "externalDocs": { + "description": "Plus de documentation", + "url": "https://github.com/inist-cnrs/web-services/tree/main/services/diseases-ner" + } + } + ] +} diff --git a/diseases-ner/tests.hurl b/diseases-ner/tests.hurl new file mode 100644 index 0000000..4afa7ac --- /dev/null +++ b/diseases-ner/tests.hurl @@ -0,0 +1,22 @@ +POST {{host}}/v1/diseases/tagger?indent=true +content-type: application/json +[ + { + "id":1, + "value": "They have been used in the prevention and treatment of malaria and autoimmune diseases, including systemic lupus erythematosus and rheumatoid arthritis." + } +] + + +HTTP 200 +[{ + "id": 1, + "value": { + "diseases": [ + "malaria", + "autoimmune diseases", + "systemic lupus erythematosus", + "rheumatoid arthritis" + ] + } +}] \ No newline at end of file diff --git a/diseases-ner/v1/diseases/models.dvc b/diseases-ner/v1/diseases/models.dvc deleted file mode 100644 index b6c0e73..0000000 --- a/diseases-ner/v1/diseases/models.dvc +++ /dev/null @@ -1,6 +0,0 @@ -outs: -- md5: ddf4fda3fe13cd7c131205d4e34b6de9.dir - size: 432104605 - nfiles: 11 - hash: md5 - path: models diff --git a/diseases-ner/v1/diseases/tagger.ini b/diseases-ner/v1/diseases/tagger.ini deleted file mode 100644 index cd5deb4..0000000 --- a/diseases-ner/v1/diseases/tagger.ini +++ /dev/null @@ -1,42 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -mimeType = application/json - -post.description = Trouve des entités nommées de maladies dans un texte -post.responses.default.description = Renvoie un Json composé comportant un champs `value` correspondant aux entités de maladies trouvées -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Recherche 
d'entité nommées de maladies -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = indent -post.parameters.0.schema.type = boolean -post.parameters.0.description = Indent or not the JSON Result - -# Examples - -post.requestBody.content.application/json.example.0.id: 1 -post.requestBody.content.application/json.example.0.value: They have been used in the prevention and treatment of malaria and autoimmune diseases, including systemic lupus erythematosus and rheumatoid arthritis. -post.responses.default.content.application/json.example.0.id: 1 -post.responses.default.content.application/json.example.0.value: -post.responses.default.content.application/json.example.0.value.diseases: ["malaria", "autoimmune diseases", "systemic lupus erythematosus", "rheumatoid arthritis"] - - -[use] -# exec -plugin = @ezs/spawn -# JSONParse -plugin = @ezs/basics - -[JSONParse] -separator = * - -[expand] -path = value -size = 100 - -[expand/exec] -# command should be executable ! 
-command = ./v1/diseases/tagger.py - -[dump] -indent = env('indent', false) diff --git a/diseases-ner/v1/diseases/tagger.py b/diseases-ner/v1/diseases/tagger.py deleted file mode 100644 index beaa50f..0000000 --- a/diseases-ner/v1/diseases/tagger.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/opt/bitnami/python/bin/python3.7 -# -*- coding: utf-8 -*- - -import torch -from transformers import AutoTokenizer, AutoModelForTokenClassification -import sys -import json -import re -import unicodedata - - -# Normalize -# Normalisation du texte : -def remove_accents(text): - text = unicodedata.normalize("NFD", text) - text = re.sub("[\u0300-\u036f]", "", text) - return text - -def normalizeText(text): - text = text.lower() - text = remove_accents(text).replace(" ","") - return text - - -## Predicts developped formulas -# Load model -tokenizer = AutoTokenizer.from_pretrained('./v1/diseases/models') -model = AutoModelForTokenClassification.from_pretrained('./v1/diseases/models', config='./v1/diseases/models/config.json') - - -# predicts text -def predict_formula_ml(input_text): - #tokenizer - tokens = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512) - - # Predicts - with torch.no_grad(): - output = model(**tokens) - - predictions = torch.argmax(output.logits, dim=-1) - - # Get token that contains "disease" - tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0]) - disease_tokens_list = [] - i=0 - - while i < len(predictions[0]): - # prediction [0][i] depends of i : {0 : "B-disease" , 1 : "I-disease" , 2: "NOT a disease NE"} - k=0 - if predictions[0][i] < 2: - disease_tokens_toappend = [] - while predictions[0][i+k] < 2: - disease_tokens_toappend.append(tokens[i+k]) - k+=1 - disease_tokens_list.append(disease_tokens_toappend) - i+=k+1 - value = [] - for disease_tokens in disease_tokens_list: - value.append(tokenizer.decode(tokenizer.convert_tokens_to_ids(disease_tokens))) - return value - -# if text too long -def split_text(text): - if 
len(text)>=512: - text_split = text.split('.') - else: - text_split = [text] - return text_split - -# predicts text after, either it is splitted or not -def predict_formula_ml_list(list): - output = [] - for elt in list: - output+= predict_formula_ml(elt) - return output - -# remove bad space in outputs -def curate_list(input_list): - output_list = [] - for elt in input_list: - if '#' not in elt: - output_list.append( - elt.replace('- ','-').replace(' -','-').replace('( ','(').replace(' (','(').replace(') ',')').replace(' )',')').replace('[ ','[') - .replace(' [','[').replace('] ',']').replace(' ]',']') - ) - return output_list - -#Disambigusate formulas : - -#preprocessing : remove duplicates elements -def remove_duplicates(input_list): - output_list = [] - normalized_list = [] - for elt in input_list: - if normalizeText(elt) not in normalized_list: - output_list.append(elt) - normalized_list.append(normalizeText(elt)) - return output_list - - -# beginning of the ws -for line in sys.stdin: - data = json.loads(line) - # Use the model to find NER - value = remove_duplicates(curate_list(predict_formula_ml_list(split_text(data["value"])))) - # Standardization - data["value"] = {"diseases":value} # remove_duplicates(value) - json.dump(data, sys.stdout, ensure_ascii=False) - sys.stdout.write("\n")