diff --git a/affiliation-rnsr/README.md b/affiliation-rnsr/README.md index 1946baf..556e095 100644 --- a/affiliation-rnsr/README.md +++ b/affiliation-rnsr/README.md @@ -1,3 +1,4 @@ # affiliation-rnsr -Le code de ce web service se trouve sur github à ce lien : https://github.com/Inist-CNRS/web-services/tree/main/services/affiliation-rnsr#readme \ No newline at end of file +> Le code source de ces services est désormais hébergé [sur +> GitHub](https://github.com/Inist-CNRS/web-services/tree/main/services/affiliation-rnsr#readme) \ No newline at end of file diff --git a/biblio-ref/README.md b/biblio-ref/README.md index a7df4c2..1aa01af 100644 --- a/biblio-ref/README.md +++ b/biblio-ref/README.md @@ -1,3 +1,3 @@ -# affiliation-rnsr +# biblio-ref -Le code de ce web service se trouve sur github à ce lien : https://github.com/Inist-CNRS/web-services/tree/main/services/biblio-ref#readme \ No newline at end of file +Le code de ce web service se trouve sur github à ce lien : https://github.com/Inist-CNRS/web-services/tree/main/services/biblio-ref#readme diff --git a/biblio-ref/swagger.json b/biblio-ref/swagger.json index b40d45f..f7993cc 100644 --- a/biblio-ref/swagger.json +++ b/biblio-ref/swagger.json @@ -3,7 +3,7 @@ "info": { "title": "biblio-ref - Valide une référence bibliographique", "summary": "Si un DOI est trouvé dans la référence bibliographique, valide la référence et indique si elle est rétractée", - "version": "1.0.0", + "version": "1.1.0", "termsOfService": "https://services.istex.fr/", "contact": { "name": "Inist-CNRS", @@ -15,7 +15,7 @@ "x-comment": "Will be automatically completed by the ezs server." }, { - "url": "http://vptdmservices.intra.inist.fr:49247/", + "url": "http://vptdmservices.intra.inist.fr:49251/", "description": "Latest version for production", "x-profil": "Standard" } @@ -30,4 +30,4 @@ } } ] -} \ No newline at end of file +} diff --git a/chem-ner/README.md b/chem-ner/README.md index 6a3192b..4ff4314 100644 --- a/chem-ner/README.md +++ b/chem-ner/README.md @@ -1,35 +1,4 @@ # chem-ner -Cette instance propose un outil de reconnaissance d'entités nommées en chimie. - -## Configuration - -L'application à utiliser est XXX. - -## Utilisation - -- [v1/chem/tagger](#v1) - -### v1 - -Ce web-service renvoie la liste des entités nommées en chimie présentes dans le texte. - -Il prend en entrée du JSON avec deux champs, `id` et `value`, et renvoie un JSON avec le la liste des entités en fonction de leur étiquettes dans le champ `value`. - -#### Exemple de v1/first-name/gender - -Entrée - -```bash -$ cat < Le code source de ces services est désormais hébergé [sur +> GitHub](https://github.com/Inist-CNRS/web-services/tree/main/services/chem-ner#readme) \ No newline at end of file diff --git a/chem-ner/examples.http b/chem-ner/examples.http index 18305e9..967806c 100644 --- a/chem-ner/examples.http +++ b/chem-ner/examples.http @@ -1,18 +1,18 @@ -# Ces exemples peuvent être exécutés directement dans VSCode, en utilisant l'extension REST Client (humao.rest-client) +# These examples can be used directly in VSCode, using REST Client extension (humao.rest-client) +# They are important, because used to generate the tests.hurl file. + # Décommenter/commenter les lignes voulues pour tester localement -# @baseUrl=http://localhost:31976 -@baseUrl=http://chem-ner.tdmservices.intra.inist.fr/ -# @baseUrl=https://chem-ner.services.istex.fr/ +@host=http://localhost:31976 +# @host=https://chem-ner.services.istex.fr ### -# @name v1ChemNer -# Reconnaissance d'entités nommées de chimie -POST {{baseUrl}}/v1/chem/tagger?indent=true HTTP/1.1 +# @name v1routeInCamelCase +# Description de la route +POST {{host}}/v1/chem/tagger?indent=true HTTP/1.1 Content-Type: application/json [ - { - "id": 1, - "value": "This is three chemical name entity: methanol, 2-propanol, or CO2" - } + { + "id":1, + "value": "This is three chemical name entity: methanol, 2-propanol, or CO2" } ] diff --git a/chem-ner/swagger.json b/chem-ner/swagger.json index 52e48ec..c9ce9fe 100644 --- a/chem-ner/swagger.json +++ b/chem-ner/swagger.json @@ -1,33 +1,33 @@ { - "openapi": "3.0.0", - "info": { - "title": "chem-ner - Reconnaissance d'entités nommées en chimie", - "summary": "Reconnaissance des entités nommées de chimie", - "version": "1.0.6", - "termsOfService": "https://services.istex.fr/", - "contact": { - "name": "Inist-CNRS", - "url": "https://www.inist.fr/nous-contacter/" - } - }, - "servers": [ - { - "x-comment": "Will be automatically completed by the ezs server." - }, - { - "url": "http://vptdmservices.intra.inist.fr:49241/", - "description": "Latest version for production", - "x-profil": "Standard" - } - ], - "tags": [ - { - "name": "chem-ner", - "description": "Reconnaissance d'entités nommées en chimie", - "externalDocs": { - "description": "Plus de documentation", - "url": "https://gitbucket.inist.fr/tdm/web-services/chem-ner/README.md" - } - } - ] - } \ No newline at end of file + "openapi": "3.0.0", + "info": { + "title": "chem-ner - Trouve des entités nommées de Chimie dans un texte", + "summary": "Renvoie un Json composé comportant d'un champ \"chemical\" et d'un champ \"chemical_disambiguisate\"", + "version": "3.0.4", + "termsOfService": "https://services.istex.fr/", + "contact": { + "name": "Inist-CNRS", + "url": "https://www.inist.fr/nous-contacter/" + } + }, + "servers": [ + { + "x-comment": "Will be automatically completed by the ezs server." + }, + { + "url": "http://vptdmservices.intra.inist.fr:49249/", + "description": "Latest version for production", + "x-profil": "Standard" + } + ], + "tags": [ + { + "name": "chem-ner", + "description": "Trouve des entités nommées de Chimie dans un texte", + "externalDocs": { + "description": "Plus de documentation", + "url": "https://github.com/inist-cnrs/web-services/tree/main/services/chem-ner" + } + } + ] +} diff --git a/chem-ner/tests.hurl b/chem-ner/tests.hurl new file mode 100644 index 0000000..9331492 --- /dev/null +++ b/chem-ner/tests.hurl @@ -0,0 +1,24 @@ +POST {{host}}/v1/chem/tagger?indent=true +content-type: application/json +[ + { + "id":1, + "value": "This is three chemical name entity: methanol, 2-propanol, or CO2" } +] + + +HTTP 200 +[{ + "id": 1, + "value": { + "chemical": [ + "methanol", + "2-propanol", + "CO2" + ], + "chemical_disambiguisate": [ + "methanol", + "propan-2-ol" + ] + } +}] \ No newline at end of file diff --git a/chem-ner/v1/chem/models.dvc b/chem-ner/v1/chem/models.dvc deleted file mode 100644 index 024c62a..0000000 --- a/chem-ner/v1/chem/models.dvc +++ /dev/null @@ -1,5 +0,0 @@ -outs: -- md5: cf11342b09012680a08f928f8182c597.dir - size: 462540534 - nfiles: 8 - path: models diff --git a/chem-ner/v1/chem/tagger.ini b/chem-ner/v1/chem/tagger.ini deleted file mode 100644 index eab2d85..0000000 --- a/chem-ner/v1/chem/tagger.ini +++ /dev/null @@ -1,45 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -mimeType = application/json - -post.description = Trouve des entités nommées de Chimie dans un texte -post.responses.default.description = Renvoie un Json composé comportant d'un champs "chemical" et d'un champ "chemical_disambiguisate" -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Recherche d'entité nommées de Chimie -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = indent -post.parameters.0.schema.type = boolean -post.parameters.0.description = Indent or not the JSON Result - -# Examples - -post.requestBody.content.application/json.example.0.id = 1 -post.requestBody.content.application/json.example.0.value = This is three chemical name entity: methanol, 2-propanol, or CO2. -post.responses.default.content.application/json.example.0.id = 1 -post.responses.default.content.application/json.example.0.value.chemical.0 = methanol -post.responses.default.content.application/json.example.0.value.chemical.1 = 2-propanol -post.responses.default.content.application/json.example.0.value.chemical.2 = CO2 -post.responses.default.content.application/json.example.0.value.chemical_siambiguisate.0 = methanol -post.responses.default.content.application/json.example.0.value.chemical_siambiguisate.1 = propan-2-ol - - -[use] -# exec -plugin = @ezs/spawn -# JSONParse -plugin = @ezs/basics - -[JSONParse] -separator = * - -[expand] -path = value -size = 100 - -[expand/exec] -# command should be executable ! -command = ./v1/chem/tagger.py - -[dump] -indent = env('indent', false) diff --git a/chem-ner/v1/chem/tagger.py b/chem-ner/v1/chem/tagger.py deleted file mode 100644 index 0ba3402..0000000 --- a/chem-ner/v1/chem/tagger.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/opt/bitnami/python/bin/python3.7 -# -*- coding: utf-8 -*- - -import torch -from transformers import AutoTokenizer, AutoModelForTokenClassification -import re -import os -import sys -import json -import logging -import unicodedata -import pickle - -# Remove logs from TF -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' -logging.getLogger('tensorflow').setLevel(logging.ERROR) - -# Normalize -# Normalisation du texte : -def remove_accents(text): - text = unicodedata.normalize("NFD", text) - text = re.sub("[\u0300-\u036f]", "", text) - return text - -def normalizeText(text): - text = text.lower() - text = remove_accents(text).replace(" ","") - return text - - -with open("./v1/chem/models/dic-name-iupac-filtered-enrich.pkl",'rb') as f_dic: - dict_name_iupac = pickle.load(f_dic) - -## Predicts developped formulas -# Load model -tokenizer = AutoTokenizer.from_pretrained('./v1/chem/models') -model = AutoModelForTokenClassification.from_pretrained('./v1/chem/models', config='./v1/chem/models/config.json') - -# predicts text -def predict_formula_ml(input_text): - #tokenizer - tokens = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512) - - # Predicts - with torch.no_grad(): - output = model(**tokens) - - predictions = torch.argmax(output.logits, dim=-1) - - # Get token that contains "CHEMICAL" - tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0]) - chemical_tokens_list = [] - i=0 - - while i < len(predictions[0]): - # prediction [0][i] depends of i : {0 : "B-CHEMICAL" , 1 : "I-CHEMICAL" , 2: "NOT a chemical NE"} - k=0 - if predictions[0][i] < 2: - chemical_tokens_toappend = [] - while predictions[0][i+k] < 2: - chemical_tokens_toappend.append(tokens[i+k]) - k+=1 - chemical_tokens_list.append(chemical_tokens_toappend) - i+=k+1 - value = [] - for chemical_tokens in chemical_tokens_list: - value.append(tokenizer.decode(tokenizer.convert_tokens_to_ids(chemical_tokens))) - return value - -# if text too long -def split_text(text): - if len(text)>=512: - text_split = text.split('.') - else: - text_split = [text] - return text_split - -# predicts text after, either it is splitted or not -def predict_formula_ml_list(list): - output = [] - for elt in list: - output+= predict_formula_ml(elt) - return output - -# remove bad space in outputs -def curate_list(input_list): - output_list = [] - for elt in input_list: - if '#' not in elt: - output_list.append( - elt.replace('- ','-').replace(' -','-').replace('( ','(').replace(' (','(').replace(') ',')').replace(' )',')').replace('[ ','[') - .replace(' [','[').replace('] ',']').replace(' ]',']') - ) - return output_list - -#Disambigusate formulas : - -#preprocessing : remove duplicates elements -def remove_duplicates(input_list): - output_list = [] - normalized_list = [] - for elt in input_list: - if normalizeText(elt) not in normalized_list: - output_list.append(elt) - normalized_list.append(normalizeText(elt)) - return output_list - -def disambiguisate_formula(input_list): - output_list = [] - for elt in input_list: - try: - output_list.append(dict_name_iupac[normalizeText(elt)]) - except: - continue - return output_list - - - -# beginning of the ws -for line in sys.stdin: - data = json.loads(line) - # Use the model to find NER - value = remove_duplicates(curate_list(predict_formula_ml_list(split_text(data["value"])))) - # Standardization - data["value"] = {"chemical":value, "chemical_disambiguisate":remove_duplicates(disambiguisate_formula(value))} - json.dump(data, sys.stdout, ensure_ascii=False) - sys.stdout.write("\n") diff --git a/data-wrapper/README.md b/data-wrapper/README.md index 8fe1d88..c91c0ef 100644 --- a/data-wrapper/README.md +++ b/data-wrapper/README.md @@ -1,7 +1,10 @@ # data-wrapper -Ces services permettent de convertir divers fichiers dans un format corpus (tar.gz) compatible avec tous les webservices "asynchnrones" +Ces services permettent de convertir divers fichiers dans un format corpus (tar.gz) compatible avec tous les services web "asynchrones" +> Le code source de ces services est désormais hébergé [sur +> GitHub](https://github.com/Inist-CNRS/web-services/tree/main/services/data-wrapper#readme) +> ## v1/csv Convertir un fichier csv en fichier corpus. @@ -10,7 +13,7 @@ ```bash curl -X 'POST' \ -'http://data-wrapper.tdmservices.intra.inist.fr/v1/csv' \ +'http://data-wrapper.services.istex.fr/v1/csv' \ -H 'accept: application/x-tar' \ -H 'Content-Type: text/csv' \ -d 'title,year,director/firstNane,director/LastNane,actors,rating,imdb @@ -27,15 +30,15 @@ ### Exemple ```bash -cat ./example-tei.tar.gz |curl --data-binary @- "http://data-wrapper.tdmservices.intra.inist.fr/v1/tar-tei2json"> out.tar.gz +cat ./example-tei.tar.gz |curl --data-binary @- "http://data-wrapper.services.istex.fr/v1/tar-tei2json"> out.tar.gz ``` -## v1/tar-tei2jxml +## v1/tar-tei2xml Convertir un fichier tar.gz contenant des fichiers tei. Le corpus produit contiendra les fichiers tei transformés dans un format XML simplifié. ### Exemple ```bash -cat ./example-tei.tar.gz |curl --data-binary @- "http://data-wrapper.tdmservices.intra.inist.fr/v1/tar-tei2xml"> out.tar.gz +cat ./example-tei.tar.gz |curl --data-binary @- "http://data-wrapper.services.istex.fr/v1/tar-tei2xml"> out.tar.gz ``` diff --git a/data-wrapper/examples.http b/data-wrapper/examples.http index 95138bd..931d100 100644 --- a/data-wrapper/examples.http +++ b/data-wrapper/examples.http @@ -1,44 +1,48 @@ -# File Global Variables: Variables defined in Region without name or request -@baseUrl = http://localhost:31976 -#@baseUrl = http://data-wrapper-2.tdmservices.intra.inist.fr/ +# These examples can be used directly in VSCode, using REST Client extension (humao.rest-client) +# They are important, because used to generate the tests.hurl file. + +# Décommenter/commenter les lignes voulues pour tester localement +# @host=http://localhost:31976 +@host=https://data-wrapper.services.istex.fr ### -# @name csv -# @save -POST {{baseUrl}}/v1/csv?compress=false HTTP/1.1 +# @name v1Csv +# Convertit un CSV en .tar +POST {{host}}/v1/csv?id=title&value=rating HTTP/1.1 Content-Type: text/csv -title,year,director/firstNane,director/LastNane,actors,rating,imdb +title,year,director/firstName,director/LastName,actors,rating,imdb Rocky,1976,John G.,Avildsen,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young,"8,1",http://www.imdb.com/title/tt0075148/ Rocky 2,1979,Sylvester,Stallone,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young/Burgess Meredith/Tony Burton/Frank Stallone/Stu Nahan,"7,2",http://www.imdb.com/title/tt0079817/ Rocky 3,1982,Sylvester,Stallone,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young/Burgess Meredith/Mister T./Hulk Hogan/Tony Burton,"6,7",http://www.imdb.com/title/tt0084602/ Last Action Hero,1993,John,McTierman,Arnold Schwarzenegger/Austin O'Brien/Charles Dance/Bridget Wilson-Sampras,"6,2",http://www.imdb.com/title/tt0107362/ ### -# @name fields/csv -POST {{baseUrl}}/v1/fields/csv HTTP/1.1 +# @name v1FieldsCsv +# Récupère les noms des colonnes du CSV +POST {{host}}/v1/fields/csv HTTP/1.1 Content-Type: text/csv -title,year,director/firstNane,director/LastNane,actors,rating,imdb +title,year,director/firstName,director/LastName,actors,rating,imdb Rocky,1976,John G.,Avildsen,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young,"8,1",http://www.imdb.com/title/tt0075148/ Rocky 2,1979,Sylvester,Stallone,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young/Burgess Meredith/Tony Burton/Frank Stallone/Stu Nahan,"7,2",http://www.imdb.com/title/tt0079817/ Rocky 3,1982,Sylvester,Stallone,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young/Burgess Meredith/Mister T./Hulk Hogan/Tony Burton,"6,7",http://www.imdb.com/title/tt0084602/ Last Action Hero,1993,John,McTierman,Arnold Schwarzenegger/Austin O'Brien/Charles Dance/Bridget Wilson-Sampras,"6,2",http://www.imdb.com/title/tt0107362/ ### -# @name tarTei2json -# @save +# @name v1TarTei2json +# Convertit un fichier .tar.gz contenant des fichiers TEI en fichier .tar.gz contenant des fichiers JSON # @extension tar.gz -POST {{baseUrl}}/v1/tar-tei2json.ini HTTP/1.1 +POST {{host}}/v1/tar-tei2json HTTP/1.1 Content-Type: application/x-tar < ./example-tei.tar.gz ### -# @name tarTei2xml -# @save +# @name v1TarTei2xml +# Convertit un fichier .tar.gz contenant des fichiers TEI en fichier .tar.gz contenant des fichiers XML simplifiés # @extension tar.gz -POST {{baseUrl}}/v1/tar-tei2xml.ini HTTP/1.1 +POST {{host}}/v1/tar-tei2xml HTTP/1.1 Content-Type: application/x-tar < ./example-tei.tar.gz diff --git a/data-wrapper/swagger.json b/data-wrapper/swagger.json index 82805a0..fa88430 100644 --- a/data-wrapper/swagger.json +++ b/data-wrapper/swagger.json @@ -15,8 +15,8 @@ "x-comment": "Will be automatically completed by the ezs server." }, { - "url": "http://vptdmjobs.intra.inist.fr:49155/", - "description": "production release", + "url": "http://vptdmjobs.intra.inist.fr:49161/", + "description": "Latest version for production", "x-profil": "Standard" }, { diff --git a/data-wrapper/tests.hurl b/data-wrapper/tests.hurl new file mode 100644 index 0000000..8d1622c --- /dev/null +++ b/data-wrapper/tests.hurl @@ -0,0 +1,65 @@ +POST https://data-wrapper.services.istex.fr/v1/csv?id=title&value=rating +content-type: text/csv +``` +title,year,director/firstName,director/LastName,actors,rating,imdb +Rocky,1976,John G.,Avildsen,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young,"8,1",http://www.imdb.com/title/tt0075148/ +Rocky 2,1979,Sylvester,Stallone,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young/Burgess Meredith/Tony Burton/Frank Stallone/Stu Nahan,"7,2",http://www.imdb.com/title/tt0079817/ +Rocky 3,1982,Sylvester,Stallone,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young/Burgess Meredith/Mister T./Hulk Hogan/Tony Burton,"6,7",http://www.imdb.com/title/tt0084602/ +Last Action Hero,1993,John,McTierman,Arnold Schwarzenegger/Austin O'Brien/Charles Dance/Bridget Wilson-Sampras,"6,2",http://www.imdb.com/title/tt0107362/ +``` + +HTTP 200 +Content-type: application/gzip + +########################################## + +POST https://data-wrapper.services.istex.fr/v1/fields/csv?indent=true +content-type: text/csv +``` +title,year,director/firstName,director/LastName,actors,rating,imdb +Rocky,1976,John G.,Avildsen,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young,"8,1",http://www.imdb.com/title/tt0075148/ +Rocky 2,1979,Sylvester,Stallone,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young/Burgess Meredith/Tony Burton/Frank Stallone/Stu Nahan,"7,2",http://www.imdb.com/title/tt0079817/ +Rocky 3,1982,Sylvester,Stallone,Sylvester Stallone/Talia Shire/Carl Weathers/Burt Young/Burgess Meredith/Mister T./Hulk Hogan/Tony Burton,"6,7",http://www.imdb.com/title/tt0084602/ +Last Action Hero,1993,John,McTierman,Arnold Schwarzenegger/Austin O'Brien/Charles Dance/Bridget Wilson-Sampras,"6,2",http://www.imdb.com/title/tt0107362/ +``` + +HTTP 200 +[{ + "value": "title" +}, +{ + "value": "year" +}, +{ + "value": "director/firstName" +}, +{ + "value": "director/LastName" +}, +{ + "value": "actors" +}, +{ + "value": "rating" +}, +{ + "value": "imdb" +}] + +########################################## + +POST https://data-wrapper.services.istex.fr/v1/tar-tei2json +content-type: application/x-tar +file,example-tei.tar.gz; + +HTTP 200 +Content-type: application/gzip + +########################################## + +POST https://data-wrapper.services.istex.fr/v1/tar-tei2xml +content-type: application/x-tar +file,example-tei.tar.gz; + +HTTP 200 +Content-type: application/gzip diff --git a/data-wrapper/v1/csv.ini b/data-wrapper/v1/csv.ini deleted file mode 100644 index 069b718..0000000 --- a/data-wrapper/v1/csv.ini +++ /dev/null @@ -1,67 +0,0 @@ -# Entrypoint output format -mimeType = application/x-gzip -extension = tar.gz - -# OpenAPI Documentation - JSON format (dot notation) -post.operationId = post-v1-csv -post.summary = Transformation d'un fichier CSV en fichier corpus -post.description = Le fichier est transformé en fichier corpus exploitable par un web service asynchrone -post.tags.0 = data-wrapper -post.requestBody.content.text/csv.schema.type = string -post.requestBody.content.text/csv.schema.format = binary -post.requestBody.required = true -post.responses.default.description = Fichier corpus au format tar.gz -post.responses.default.content.application/x-gzip.schema.type = string -post.responses.default.content.application/x-gzip.schema.format = binary -post.parameters.0.description = Nom du champ à exploiter comme identifiant de colonne -post.parameters.0.in = query -post.parameters.0.name = id -post.parameters.0.schema.type = string -post.parameters.0.schema.default = id -post.parameters.0.required = false -post.parameters.1.description = Nom du champ à exploiter comme identifiant de ligne -post.parameters.1.in = query -post.parameters.1.name = value -post.parameters.1.schema.type = string -post.parameters.1.schema.default = value -post.parameters.1.required = false -post.parameters.2.description = chaque ligne est réduite à un object contenant 2 champs (id, value) -post.parameters.2.in = query -post.parameters.2.name = slim -post.parameters.2.schema.type = boolean -post.parameters.2.schema.default = true -post.parameters.2.required = false - -[env] -path = slim -value = env('slim').thru(x => (x === 'false' ? false : true)) - -[use] -plugin = basics - -[CSVParse] - -[CSVObject] - -[assign] -path = id -value = get(env('id', 'id')) -path = value -value = get(env('value', 'value')) - -[exchange] -value = self().thru(x => _.env(null, 'slim') ? _.pick(x, ['id', 'value']) : x) - -# in the hope that all the lines look the same -[singleton] -[singleton/validate] -path = value -rule = required - -[TARDump] -compress = true -manifest = fix({version: '1'}) -manifest = fix({generator: 'v1/csv'}) -manifest = fix({parameters: _.omit(_.env(), 'headers')}) -manifest = fix({hostAgent: _.get(_.env(), 'headers.host')}) -manifest = fix({userAgent: _.get(_.env(), 'headers.user-agent')}) diff --git a/data-wrapper/v1/fields/csv.ini b/data-wrapper/v1/fields/csv.ini deleted file mode 100644 index aa29059..0000000 --- a/data-wrapper/v1/fields/csv.ini +++ /dev/null @@ -1,41 +0,0 @@ -# Entrypoint output format -mimeType = application/json -extension = json - -# OpenAPI Documentation - JSON format (dot notation) -post.operationId = post-v1-fields-csv -post.description = Récupération des colonnes d'un fichier CSV -post.summary = Le fichier est analysé pour lister les colonnes utilisées -post.tags.0 = data-wrapper -post.requestBody.content.text/csv.schema.type = string -post.requestBody.content.text/csv.schema.format = binary -post.requestBody.required = true -post.responses.default.description = Liste des colonnes trouvées -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.responses.default.content.application/json.example.0.value = Title -post.responses.default.content.application/json.example.1.value = Keywords -post.parameters.0.description = Indenter le JSON résultant -post.parameters.0.in = query -post.parameters.0.name = indent -post.parameters.0.schema.type = boolean - -[use] -plugin = basics - -[CSVParse] - -[CSVObject] - -[shift] - -[exchange] -value = self().keys() - -[ungroup] - -[replace] -path = value -value = self() - -[dump] -indent = env('indent', false) diff --git a/data-wrapper/v1/istex-tar-gz.ini b/data-wrapper/v1/istex-tar-gz.ini deleted file mode 100644 index 0c01461..0000000 --- a/data-wrapper/v1/istex-tar-gz.ini +++ /dev/null @@ -1,64 +0,0 @@ -# Entrypoint output format -mimeType = application/x-gzip -extension = tar.gz - -# OpenAPI Documentation - JSON format (dot notation) -post.operationId = post-v1-tar-gz -post.summary = Transformation d'un fichier ISTEX (format tar.gz) en fichier corpus -post.description = Le fichier est transformé en fichier corpus exploitable par un web service asynchrone -post.tags.0 = data-wrapper -post.requestBody.content.application/x-gzip.schema.type = string -post.requestBody.content.application/x-gzip.schema.format = binary -post.requestBody.content.application/x-tar.schema.type = string -post.requestBody.content.application/x-tar.schema.format = binary -post.requestBody.required = true -post.responses.default.description = Fichier corpus au format tar.gz -post.responses.default.content.application/x-gzip.schema.type = string -post.responses.default.content.application/x-gzip.schema.format = binary -post.parameters.0.description = Nom du champ à exploiter contenant l'identifiant -post.parameters.0.in = query -post.parameters.0.name = id -post.parameters.0.schema.type = string -post.parameters.0.schema.default = ark.0 -post.parameters.0.required = false -post.parameters.1.description = Nom du champ à exploiter contenant la valeur -post.parameters.1.in = query -post.parameters.1.name = value -post.parameters.1.schema.type = string -post.parameters.1.schema.default = abstract -post.parameters.1.required = false -post.parameters.2.description = chaque objet est réduit à un object contenant 2 champs (id, value) -post.parameters.2.in = query -post.parameters.2.name = slim -post.parameters.2.schema.type = boolean -post.parameters.2.schema.default = true -post.parameters.2.required = false - -[env] -path = slim -value = env('slim').thru(x => (x === 'false' ? false : true)) - -[use] -plugin = basics - -[TARExtract] -compress = true -path = */*.json - -[assign] -path = id -value = get(env('id', 'ark.0')) - -path = value -value = get(env('value', 'abstract')) - -[exchange] -value = self().thru(x => _.env(null, 'slim') ? _.pick(x, ['id', 'value']) : x) - -[TARDump] -compress = true -manifest = fix({version: '1'}) -manifest = fix({generator: 'v1/istex-tar-gz'}) -manifest = fix({parameters: _.omit(_.env(), 'headers')}) -manifest = fix({hostAgent: _.get(_.env(), 'headers.host')}) -manifest = fix({userAgent: _.get(_.env(), 'headers.user-agent')}) diff --git a/data-wrapper/v1/new-persee-tei.xsl b/data-wrapper/v1/new-persee-tei.xsl deleted file mode 100644 index 69e9523..0000000 --- a/data-wrapper/v1/new-persee-tei.xsl +++ /dev/null @@ -1,57 +0,0 @@ - - - - - - - - - - - - - - - - - - <xsl:value-of - select="//tei:fileDesc/tei:titleStmt/tei:title"/> - - - - - - - - - - - - - - - - - - -

- -

-
- - - - - - - - - - - -
diff --git a/data-wrapper/v1/query-conditor.ini b/data-wrapper/v1/query-conditor.ini deleted file mode 100644 index a44ebe0..0000000 --- a/data-wrapper/v1/query-conditor.ini +++ /dev/null @@ -1,73 +0,0 @@ -# Entrypoint output format -mimeType = application/x-gzip -extension = tar.gz - -# OpenAPI Documentation - JSON format (dot notation) -post.operationId = post-v1-query-conditor -post.summary = Téléchargement des documents Conditor répondant à une requête -post.description = Le fichier en entrée contient une requête dont le résultat produira un fichier corpus exploitable par un web service asynchrone -post.tags.0 = data-wrapper -post.requestBody.content.text/plain.schema.type = string -post.requestBody.content.text/plain.schema.format = binary -post.requestBody.required = true -post.responses.default.description = Fichier corpus au format tar.gz -post.responses.default.content.application/x-gzip.schema.type = string -post.responses.default.content.application/x-gzip.schema.format = binary -post.parameters.0.description = Nom du champ à exploiter comme identifiant de colonne -post.parameters.0.in = query -post.parameters.0.name = id -post.parameters.0.schema.type = string -post.parameters.0.schema.default = business.sourceUidChain -post.parameters.0.required = false -post.parameters.1.description = Nom du champ à exploiter comme identifiant de ligne -post.parameters.1.in = query -post.parameters.1.name = value -post.parameters.1.schema.type = string -post.parameters.1.schema.default = title.en -post.parameters.1.required = false -post.parameters.2.description = chaque ligne est réduite à un object contenant 2 champs (id, value) -post.parameters.2.in = query -post.parameters.2.name = slim -post.parameters.2.schema.type = boolean -post.parameters.2.schema.default = true -post.parameters.2.required = false - -[env] -path = slim -value = env('slim').thru(x => (x === 'false' ? false : true)) - -[use] -plugin = basics -plugin = conditor - -[TXTConcat] - -[replace] -path = q -value = self().trim() - -[CORHALFetch] -url = https://corhal-api.inist.fr -retries = 3 -timeout = 60000 - -[assign] -path = id -value = get(env('id', 'business.sourceUidChain')) -path = value -value = get(env('value', 'title.en')) -value = self() - -[exchange] -value = self().thru(x => _.env(null, 'slim') ? _.pick(x, ['id', 'value']) : x) - -[TARDump] -compress = true -manifest = fix({version: '1'}) -manifest = fix({generator: 'v1/query-conditor'}) -manifest = fix({parameters: _.omit(_.env(), 'headers')}) -manifest = fix({hostAgent: _.get(_.env(), 'headers.host')}) -manifest = fix({userAgent: _.get(_.env(), 'headers.user-agent')}) - - - diff --git a/data-wrapper/v1/query-istex.ini b/data-wrapper/v1/query-istex.ini deleted file mode 100644 index 7ee7546..0000000 --- a/data-wrapper/v1/query-istex.ini +++ /dev/null @@ -1,74 +0,0 @@ -# Entrypoint output format -mimeType = application/x-gzip -extension = tar.gz - -# OpenAPI Documentation - JSON format (dot notation) -post.operationId = post-v1-query-istex -post.summary = Téléchargement des documents ISTEX répondant à une requete -post.description = Le fichier en entrée contient une requête dont le résultat produira un fichier corpus exploitable par un web service asynchrone -post.tags.0 = data-wrapper -post.requestBody.content.text/plain.schema.type = string -post.requestBody.content.text/plain.schema.format = binary -post.requestBody.required = true -post.responses.default.description = Fichier corpus au format tar.gz -post.responses.default.content.application/x-gzip.schema.type = string -post.responses.default.content.application/x-gzip.schema.format = binary -post.parameters.0.description = Nom du champ à exploiter comme identifiant de colonne -post.parameters.0.in = query -post.parameters.0.name = id -post.parameters.0.schema.type = string -post.parameters.0.schema.default = arkIstex -post.parameters.0.required = false -post.parameters.1.description = Nom du champ à exploiter comme identifiant de ligne -post.parameters.1.in = query -post.parameters.1.name = value -post.parameters.1.schema.type = string -post.parameters.1.schema.default = title -post.parameters.1.required = false -post.parameters.2.description = chaque ligne est réduite à un object contenant 2 champs (id, value) -post.parameters.2.in = query -post.parameters.2.name = slim -post.parameters.2.schema.type = boolean -post.parameters.2.schema.default = true -post.parameters.2.required = false - -[env] -path = slim -value = env('slim').thru(x => (x === 'false' ? false : true)) - -[use] -plugin = basics -plugin = istex - -[TXTConcat] - -[replace] -path = q -value = self().trim() - -[ISTEXScroll] -query = get('q') -field = * - -[ISTEXResult] - -[assign] -path = id -value = get(env('id', 'arkIstex')) -path = value -value = get(env('value', 'title')) -value = self() - -[exchange] -value = self().thru(x => _.env(null, 'slim') ? _.pick(x, ['id', 'value']) : x) - -[TARDump] -compress = true -manifest = fix({version: '1'}) -manifest = fix({generator: 'v1/query-istex'}) -manifest = fix({parameters: _.omit(_.env(), 'headers')}) -manifest = fix({hostAgent: _.get(_.env(), 'headers.host')}) -manifest = fix({userAgent: _.get(_.env(), 'headers.user-agent')}) - - - diff --git a/data-wrapper/v1/tar-tei2json.ini b/data-wrapper/v1/tar-tei2json.ini deleted file mode 100644 index 83489ea..0000000 --- a/data-wrapper/v1/tar-tei2json.ini +++ /dev/null @@ -1,43 +0,0 @@ -# Entrypoint output format -mimeType = application/x-gzip -extension = tar.gz - -# OpenAPI Documentation - JSON format (dot notation) -post.operationId = post-v1-tei2json -post.summary = Transformation d'un fichier TEI en fichier corpus -post.description = Le fichier est transformé en fichier corpus exploitable par un web service asynchrone, chaque document TEI est préalablement transformé en JSON -post.tags.0 = data-wrapper -post.requestBody.content.application/x-gzip.schema.type = string -post.requestBody.content.application/x-gzip.schema.format = binary -post.requestBody.content.application/x-tar.schema.type = string -post.requestBody.content.application/x-tar.schema.format = binary -post.requestBody.required = true -post.responses.default.description = Fichier corpus au format tar.gz -post.responses.default.content.application/x-gzip.schema.type = string -post.responses.default.content.application/x-gzip.schema.format = binary - -[use] -plugin = basics - -[TARExtract] -path = **/*.xml -compress = true -json = false - -[map] -path = value -[map/XMLParse] -separator = /tei:TEI -separator = /tei - -[replace] -path = fileName -value = get('id') - -path = fileContent -value = get('value.0') - -[TARDump] -compress = true -manifest = fix({version: '1'}) -manifest = fix({generator: 'v1/tar-tei2json'}) diff --git a/data-wrapper/v1/tar-tei2xml.ini b/data-wrapper/v1/tar-tei2xml.ini deleted file mode 100644 index 624bce9..0000000 --- a/data-wrapper/v1/tar-tei2xml.ini +++ /dev/null @@ -1,41 +0,0 @@ -# Entrypoint output format -mimeType = application/x-gzip -extension = tar.gz - -# OpenAPI Documentation - JSON format (dot notation) -post.operationId = post-v1-tar-tei2xml -post.summary = Transformation d'un fichier TEI en fichier corpus -post.description = Le fichier est transformé en fichier corpus exploitable par un web service asynchrone, chaque document TEI est préalablement simplifié en fichier XML minimal -post.tags.0 = data-wrapper -post.requestBody.content.application/x-gzip.schema.type = string -post.requestBody.content.application/x-gzip.schema.format = binary -post.requestBody.content.application/x-tar.schema.type = string -post.requestBody.content.application/x-tar.schema.format = binary -post.requestBody.required = true -post.responses.default.description = Fichier corpus au format tar.gz -post.responses.default.content.application/x-gzip.schema.type = string -post.responses.default.content.application/x-gzip.schema.format = binary - -[use] -plugin = basics -plugin = xslt - -[TARExtract] -path = **/*.xml -compress = true -json = false - -[map] -path = value -[map/xslt] -stylesheet = ./v1/new-persee-tei.xsl - -[exchange] -value = get('value.0') - -[TARDump] -compress = true -extension = xml -json = false -manifest = fix({version: '1'}) -manifest = fix({generator: 'v1/tar-tei2xml'}) diff --git a/diseases-ner/README.md b/diseases-ner/README.md index a360f01..bf5c6c3 100644 --- a/diseases-ner/README.md +++ b/diseases-ner/README.md @@ -1,35 +1,4 @@ # diseases-ner -Cette instance propose un outil de reconnaissance d'entités nommées de maladies. - -## Configuration - -L'application à utiliser est XXX. - -## Utilisation - -- [v1/diseases/tagger](#v1) - -### v1 - -Ce web-service renvoie la liste des entités nommées en chimie présentes dans le texte. - -Il prend en entrée du JSON avec deux champs, `id` et `value`, et renvoie un JSON avec le la liste des entités en fonction de leur étiquettes dans le champ `value`. - -#### Exemple de v1/first-name/gender - -Entrée - -```bash -$ cat < Le code source de ces services est désormais hébergé [sur +> GitHub](https://github.com/Inist-CNRS/web-services/tree/main/services/diseases-ner#readme) \ No newline at end of file diff --git a/diseases-ner/examples.http b/diseases-ner/examples.http index 8cc8a30..0f021ec 100644 --- a/diseases-ner/examples.http +++ b/diseases-ner/examples.http @@ -1,18 +1,19 @@ -# Ces exemples peuvent être exécutés directement dans VSCode, en utilisant l'extension REST Client (humao.rest-client) +# These examples can be used directly in VSCode, using REST Client extension (humao.rest-client) +# They are important, because used to generate the tests.hurl file. + # Décommenter/commenter les lignes voulues pour tester localement -# @baseUrl=http://localhost:31976 -@baseUrl=http://diseases-ner.tdmservices.intra.inist.fr/ -# @baseUrl=https://diseases-ner.services.istex.fr/ +@host=http://localhost:31976 +# @host=https://diseases-ner.services.istex.fr ### -# @name v1DiseasesNer -# Reconnaissance d'entités nommées de maladies -POST {{baseUrl}}/v1/diseases/tagger?indent=true HTTP/1.1 +# @name v1diseasesTagger +# Trouve les entités nommées de maladies +POST {{host}}/v1/diseases/tagger?indent=true HTTP/1.1 Content-Type: application/json [ - { - "id": 1, - "value":"They have been used in the prevention and treatment of malaria and autoimmune diseases, including systemic lupus erythematosus and rheumatoid arthritis." - } + { + "id":1, + "value": "They have been used in the prevention and treatment of malaria and autoimmune diseases, including systemic lupus erythematosus and rheumatoid arthritis." + } ] diff --git a/diseases-ner/swagger.json b/diseases-ner/swagger.json index 7c414b6..a3c13e0 100644 --- a/diseases-ner/swagger.json +++ b/diseases-ner/swagger.json @@ -1,33 +1,33 @@ { - "openapi": "3.0.0", - "info": { - "title": "diseases-ner - Reconnaissance d'entités nommées de maladies", - "summary": "Reconnaissance des entités nommées de maladies", - "version": "1.0.6", - "termsOfService": "https://services.istex.fr/", - "contact": { - "name": "Inist-CNRS", - "url": "https://www.inist.fr/nous-contacter/" - } - }, - "servers": [ - { - "x-comment": "Will be automatically completed by the ezs server." - }, - { - "url": "http://vptdmservices.intra.inist.fr:49242/", - "description": "Latest version for production", - "x-profil": "Standard" - } - ], - "tags": [ - { - "name": "diseases-ner", - "description": "Reconnaissance d'entités nommées de maladies", - "externalDocs": { - "description": "Plus de documentation", - "url": "https://gitbucket.inist.fr/tdm/web-services/chem-ner/README.md" - } - } - ] - } \ No newline at end of file + "openapi": "3.0.0", + "info": { + "title": "diseases-ner - Trouve des entités nommées de maladies dans un texte", + "summary": "Renvoie un Json composé comportant un champs `diseases` correspondant aux entités nommées de maladies trouvées", + "version": "1.0.9", + "termsOfService": "https://services.istex.fr/", + "contact": { + "name": "Inist-CNRS", + "url": "https://www.inist.fr/nous-contacter/" + } + }, + "servers": [ + { + "x-comment": "Will be automatically completed by the ezs server." + }, + { + "url": "http://vptdmservices.intra.inist.fr:49250/", + "description": "Latest version for production", + "x-profil": "Standard" + } + ], + "tags": [ + { + "name": "diseases-ner", + "description": "Trouve des entités nommées de maladies dans un texte", + "externalDocs": { + "description": "Plus de documentation", + "url": "https://github.com/inist-cnrs/web-services/tree/main/services/diseases-ner" + } + } + ] +} diff --git a/diseases-ner/tests.hurl b/diseases-ner/tests.hurl new file mode 100644 index 0000000..4afa7ac --- /dev/null +++ b/diseases-ner/tests.hurl @@ -0,0 +1,22 @@ +POST {{host}}/v1/diseases/tagger?indent=true +content-type: application/json +[ + { + "id":1, + "value": "They have been used in the prevention and treatment of malaria and autoimmune diseases, including systemic lupus erythematosus and rheumatoid arthritis." + } +] + + +HTTP 200 +[{ + "id": 1, + "value": { + "diseases": [ + "malaria", + "autoimmune diseases", + "systemic lupus erythematosus", + "rheumatoid arthritis" + ] + } +}] \ No newline at end of file diff --git a/diseases-ner/v1/diseases/models.dvc b/diseases-ner/v1/diseases/models.dvc deleted file mode 100644 index b6c0e73..0000000 --- a/diseases-ner/v1/diseases/models.dvc +++ /dev/null @@ -1,6 +0,0 @@ -outs: -- md5: ddf4fda3fe13cd7c131205d4e34b6de9.dir - size: 432104605 - nfiles: 11 - hash: md5 - path: models diff --git a/diseases-ner/v1/diseases/tagger.ini b/diseases-ner/v1/diseases/tagger.ini deleted file mode 100644 index cd5deb4..0000000 --- a/diseases-ner/v1/diseases/tagger.ini +++ /dev/null @@ -1,42 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -mimeType = application/json - -post.description = Trouve des entités nommées de maladies dans un texte -post.responses.default.description = Renvoie un Json composé comportant un champs `value` correspondant aux entités de maladies trouvées -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Recherche d'entité nommées de maladies -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = indent -post.parameters.0.schema.type = boolean -post.parameters.0.description = Indent or not the JSON Result - -# Examples - -post.requestBody.content.application/json.example.0.id: 1 -post.requestBody.content.application/json.example.0.value: They have been used in the prevention and treatment of malaria and autoimmune diseases, including systemic lupus erythematosus and rheumatoid arthritis. -post.responses.default.content.application/json.example.0.id: 1 -post.responses.default.content.application/json.example.0.value: -post.responses.default.content.application/json.example.0.value.diseases: ["malaria", "autoimmune diseases", "systemic lupus erythematosus", "rheumatoid arthritis"] - - -[use] -# exec -plugin = @ezs/spawn -# JSONParse -plugin = @ezs/basics - -[JSONParse] -separator = * - -[expand] -path = value -size = 100 - -[expand/exec] -# command should be executable ! -command = ./v1/diseases/tagger.py - -[dump] -indent = env('indent', false) diff --git a/diseases-ner/v1/diseases/tagger.py b/diseases-ner/v1/diseases/tagger.py deleted file mode 100644 index beaa50f..0000000 --- a/diseases-ner/v1/diseases/tagger.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/opt/bitnami/python/bin/python3.7 -# -*- coding: utf-8 -*- - -import torch -from transformers import AutoTokenizer, AutoModelForTokenClassification -import sys -import json -import re -import unicodedata - - -# Normalize -# Normalisation du texte : -def remove_accents(text): - text = unicodedata.normalize("NFD", text) - text = re.sub("[\u0300-\u036f]", "", text) - return text - -def normalizeText(text): - text = text.lower() - text = remove_accents(text).replace(" ","") - return text - - -## Predicts developped formulas -# Load model -tokenizer = AutoTokenizer.from_pretrained('./v1/diseases/models') -model = AutoModelForTokenClassification.from_pretrained('./v1/diseases/models', config='./v1/diseases/models/config.json') - - -# predicts text -def predict_formula_ml(input_text): - #tokenizer - tokens = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512) - - # Predicts - with torch.no_grad(): - output = model(**tokens) - - predictions = torch.argmax(output.logits, dim=-1) - - # Get token that contains "disease" - tokens = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0]) - disease_tokens_list = [] - i=0 - - while i < len(predictions[0]): - # prediction [0][i] depends of i : {0 : "B-disease" , 1 : "I-disease" , 2: "NOT a disease NE"} - k=0 - if predictions[0][i] < 2: - disease_tokens_toappend = [] - while predictions[0][i+k] < 2: - disease_tokens_toappend.append(tokens[i+k]) - k+=1 - disease_tokens_list.append(disease_tokens_toappend) - i+=k+1 - value = [] - for disease_tokens in disease_tokens_list: - value.append(tokenizer.decode(tokenizer.convert_tokens_to_ids(disease_tokens))) - return value - -# if text too long -def split_text(text): - if len(text)>=512: - text_split = text.split('.') - else: - text_split = [text] - return text_split - -# predicts text after, either it is splitted or not -def predict_formula_ml_list(list): - output = [] - for elt in list: - output+= predict_formula_ml(elt) - return output - -# remove bad space in outputs -def curate_list(input_list): - output_list = [] - for elt in input_list: - if '#' not in elt: - output_list.append( - elt.replace('- ','-').replace(' -','-').replace('( ','(').replace(' (','(').replace(') ',')').replace(' )',')').replace('[ ','[') - .replace(' [','[').replace('] ',']').replace(' ]',']') - ) - return output_list - -#Disambigusate formulas : - -#preprocessing : remove duplicates elements -def remove_duplicates(input_list): - output_list = [] - normalized_list = [] - for elt in input_list: - if normalizeText(elt) not in normalized_list: - output_list.append(elt) - normalized_list.append(normalizeText(elt)) - return output_list - - -# beginning of the ws -for line in sys.stdin: - data = json.loads(line) - # Use the model to find NER - value = remove_duplicates(curate_list(predict_formula_ml_list(split_text(data["value"])))) - # Standardization - data["value"] = {"diseases":value} # remove_duplicates(value) - json.dump(data, sys.stdout, ensure_ascii=False) - sys.stdout.write("\n")