diff --git a/address-kit-conf.json b/address-kit-conf.json deleted file mode 100644 index a005851..0000000 --- a/address-kit-conf.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "environnement": { - "EZS_TITLE": "address-kit", - "EZS_DESCRIPTION": "Web service for affiliations", - "EZS_METRICS": true, - "EZS_CONCURRENCY": 2, - "EZS_NSHARDS": 32, - "EZS_CACHE": true, - "NODE_OPTIONS": "--max_old_space_size=1024", - "NODE_ENV": "production", - "PIP_NO_DEPENDENCIES": 0, - "PIP_DISABLE_PIP_VERSION_CHECK": 1, - "PIP_NO_CACHE_DIR": 0 - }, - "packages": [ - "@ezs/core@3.0.6", - "@ezs/spawn@1.4.5", - "@ezs/basics@2.5.4" - ], - "files": { - "zip": "https://gitbucket.inist.fr/tdm/web-services/archive/address-kit/address-kit@2.0.3.zip" - } -} \ No newline at end of file diff --git a/address-kit/config.json b/address-kit/config.json new file mode 100644 index 0000000..a005851 --- /dev/null +++ b/address-kit/config.json @@ -0,0 +1,23 @@ +{ + "environnement": { + "EZS_TITLE": "address-kit", + "EZS_DESCRIPTION": "Web service for affiliations", + "EZS_METRICS": true, + "EZS_CONCURRENCY": 2, + "EZS_NSHARDS": 32, + "EZS_CACHE": true, + "NODE_OPTIONS": "--max_old_space_size=1024", + "NODE_ENV": "production", + "PIP_NO_DEPENDENCIES": 0, + "PIP_DISABLE_PIP_VERSION_CHECK": 1, + "PIP_NO_CACHE_DIR": 0 + }, + "packages": [ + "@ezs/core@3.0.6", + "@ezs/spawn@1.4.5", + "@ezs/basics@2.5.4" + ], + "files": { + "zip": "https://gitbucket.inist.fr/tdm/web-services/archive/address-kit/address-kit@2.0.3.zip" + } +} \ No newline at end of file diff --git a/data-computer/swagger.json b/data-computer/swagger.json index 983d689..0ea400c 100644 --- a/data-computer/swagger.json +++ b/data-computer/swagger.json @@ -3,7 +3,7 @@ "info": { "title": "data-computer - Calculs sur fichier corpus compressé", "summary": "Algorithmes de calculs sur un corpus compressé", - "version": "2.3.0", + "version": "2.6.0", "termsOfService": "https://services.istex.fr/", "contact": { "name": "Inist-CNRS", @@ -15,12 +15,12 @@ "x-comment": "Will be automatically completed by the ezs server." }, { - "url": "http://vptdmjobs.intra.inist.fr:49157/", + "url": "http://vptdmjobs.intra.inist.fr:49159/", "description": "Production release", "x-profil": "Standard" }, { - "url": "http://vitdmservices.intra.inist.fr:49303/", + "url": "http://vitdmservices.intra.inist.fr:49313/", "description": "For internal tests" } ], diff --git a/data-computer/tests.hurl b/data-computer/tests.hurl index 6b6362a..9f60dc6 100644 --- a/data-computer/tests.hurl +++ b/data-computer/tests.hurl @@ -19,8 +19,13 @@ # Fortunately, as the data is sparse, and the computing time is small, # the need is small. +# Version 4.1.0 of hurl added a delay option, which value is milliseconds. 
+# https://hurl.dev/blog/2023/09/24/announcing-hurl-4.1.0.html#add-delay-between-requests + POST https://data-computer.services.istex.fr/v1/retrieve content-type: application/json +[Options] +delay: 1000 ``` [ { diff --git a/data-computer/v1/base-line.ini b/data-computer/v1/base-line.ini index ecdf3db..a7814a5 100644 --- a/data-computer/v1/base-line.ini +++ b/data-computer/v1/base-line.ini @@ -5,7 +5,7 @@ post.operationId = post-v1-base-line post.description = Chargement et analyse d'un fichier corpus post.summary = Le corpus est analysé et restitué sans modification des données -post.tags.0 = data-computer +post.tags.0 = data-workflow post.requestBody.content.application/x-tar.schema.type = string post.requestBody.content.application/x-tar.schema.format = binary post.requestBody.required = true @@ -31,10 +31,6 @@ path = generator value = base-line -[use] -plugin = basics -plugin = analytics - # Step 1 (générique): Charger le fichier corpus [delegate] file = charger.cfg @@ -44,6 +40,10 @@ standalone = true logger = logger.cfg +# Step 2.0 (optionnel): Accélére le détachement du fork si l'enrichissement est lent +[fork/delegate] +file = buffer.cfg + # Step 2.1 (spécifique): Lancer un calcul sur tous les items reçus [fork/exchange] value = self().omit('uri') diff --git a/data-computer/v1/buffer.cfg b/data-computer/v1/buffer.cfg new file mode 100644 index 0000000..f8b16b5 --- /dev/null +++ b/data-computer/v1/buffer.cfg @@ -0,0 +1,26 @@ +[use] +plugin = basics + +# On sauvegarde sur disque pour accepter rapidement tous les objets en entrée +# et répondre rapidement au client que le traitmenent asynchnrone est lancé. +# +# Le "fork" se détache uniquement quand tous les objets sont "rentrés" dans le fork +# Si le traitement est plus lent que la sauvegarde sur disque +# il est nécessaire de créer un fichier temporaire +[pack] +[FILESave] +identifier = env('identifier') +location = /tmp/upload +compress = true + +[exchange] +value = get('filename') + +[FILELoad] +compress = true +location = /tmp/upload +[unpack] + +[metrics] +bucket = buffer + diff --git a/data-computer/v1/charger.cfg b/data-computer/v1/charger.cfg index c1f8093..7bff09a 100644 --- a/data-computer/v1/charger.cfg +++ b/data-computer/v1/charger.cfg @@ -1,6 +1,5 @@ [use] plugin = basics -plugin = analytics # Step 0 (générique) : Lire le fichier standard tar.gz [TARExtract] @@ -9,8 +8,20 @@ # Step 1 (générique) : Créer un identifiant unique pour le corpus reçu [singleton] + +# Step 1.1 : On évite de récupere un champ uri existant +[singleton/env] +path = pid +value = fix(`PID${Date.now()}`) + +# Step 1.2 : On génére un identifiant unique [singleton/identify] +path = env('pid') + +# Step 1.3: On garde en mémoire l'identifiant généré (en le simplifiant) [singleton/env] path = identifier -value = get('uri').replace('uid:/', '') +value = get(env('pid')).replace('uid:/', '') +[metrics] +bucket = charger diff --git a/data-computer/v1/lda.py b/data-computer/v1/lda.py index b7206a1..426a137 100755 --- a/data-computer/v1/lda.py +++ b/data-computer/v1/lda.py @@ -22,7 +22,6 @@ return text_with_no_accent def uniformize(text): - # del accents, using remove_accents function text = remove_accents(text) # remove punctuation except " ' " @@ -41,12 +40,14 @@ def tokenize(text): tokens = [word for word in text.replace("'"," ").split() if word not in stopwords and len(word)>2] if len(tokens)==0: - return ["n/a"] + return [] return tokens # Max topic def max_topic(dico): - # for a dictionary of topics, return a json with a single key "best topic" and his 
value is the value of the dictionary. + """ + for a dictionary of topics, return a json with a single key "best_topic" and its value is the value of this topic in the dictionary. + """ best_topic = {} best_proba = 0 for topic in dico: @@ -67,11 +68,11 @@ # following parameters depends of the size of the corpus : num_topics and num_iterations -n = len(all_data) -if n< 1001: +len_data = len(all_data) +if len_data< 1001: num_topics = 10 num_iterations=150 -elif n < 20001: +elif len_data < 20001: num_topics = 15 num_iterations=200 else: @@ -81,16 +82,52 @@ # training LDA texts = [] -for line in all_data: - if "value" in line: - texts.append(tokenize(lemmatize(uniformize(line["value"])))) +index_without_value = [] +for i in range(len_data): + line = all_data[i] + if "value" in line and type(line["value"])==str: + tokens = tokenize(lemmatize(uniformize(line["value"]))) + if tokens != []: + texts.append(tokenize(lemmatize(uniformize(line["value"])))) + else: + index_without_value.append(i) else: - texts.append("n/a") + index_without_value.append(i) dictionary = corpora.Dictionary(texts) # Create a tf dictionary, but replace text by an id : [ [(id_token,numb_token),...] , [....] ]. The list represent docs of corpus -dictionary.filter_extremes(no_below=3,no_above=0.6) +dictionary.filter_extremes(no_below=3,no_above=0.8) corpus = [dictionary.doc2bow(text) for text in texts] -lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,iterations=num_iterations,alpha="symmetric", eta = "auto",minimum_probability=0.1) +try: + lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,iterations=num_iterations,alpha="symmetric", eta = "auto",minimum_probability=0.1) +except: + index_without_value = [i for i in range(len_data)] + + +# extract infos +for i in range(len_data): + + #return n/a if docs wasn't in model + if i in index_without_value: + line["value"]="n/a" + sys.stdout.write(json.dumps(line)) + sys.stdout.write("\n") + else: + line = all_data[i] + doc = line["value"] + doc_bow = dictionary.doc2bow(tokenize(uniformize(line["value"]))) + topics = lda_model[doc_bow] + topic_info = {} + for topic_id, topic_weight in topics: + topic_info[f"topic_{topic_id + 1}"] = {} + topic_words = [{"word":word, "word_weight":str(word_weight)} for word, word_weight in lda_model.show_topic(topic_id)] + topic_info[f"topic_{topic_id + 1}"]["words"] = topic_words + topic_info[f"topic_{topic_id + 1}"]["weight"] = str(topic_weight) + + line["value"]={} + line["value"]["topics"]=topic_info + line["value"]["best_topic"]=max_topic(topic_info) + sys.stdout.write(json.dumps(line)) + sys.stdout.write("\n") # #To see topics (to test it with a jsonl file) @@ -100,25 +137,3 @@ # cm = models.coherencemodel.CoherenceModel(model=lda_model, texts=texts, coherence='c_v') # cm.get_coherence() # exit() - - -# extract infos -for line in all_data: - doc = line["value"] - doc_bow = dictionary.doc2bow(tokenize(uniformize(line["value"]))) - topics = lda_model[doc_bow] - topic_info = {} - for topic_id, topic_weight in topics: - topic_info[f"topic_{topic_id + 1}"] = {} - topic_words = [word for word, _ in lda_model.show_topic(topic_id)] - topic_info[f"topic_{topic_id + 1}"]["words"] = topic_words - topic_info[f"topic_{topic_id + 1}"]["weight"] = str(topic_weight) - - line["value"]={} - line["value"]["topics"]=topic_info - line["value"]["best_topic"]=max_topic(topic_info) - -# Write output -for line in all_data: - sys.stdout.write(json.dumps(line)) - sys.stdout.write("\n") diff --git 
a/data-computer/v1/logger.cfg b/data-computer/v1/logger.cfg index b46e88a..6377dd9 100644 --- a/data-computer/v1/logger.cfg +++ b/data-computer/v1/logger.cfg @@ -2,6 +2,9 @@ plugin = basics plugin = analytics +[metrics] +bucket = logger + # On ne garde que la première erreur déclénchée [shift] diff --git a/data-computer/v1/recorder.cfg b/data-computer/v1/recorder.cfg index f768491..7811144 100644 --- a/data-computer/v1/recorder.cfg +++ b/data-computer/v1/recorder.cfg @@ -2,6 +2,9 @@ plugin = basics plugin = analytics +[metrics] +bucket = recorder + # Step 2.2 (générique): Création d'un fichier résulat standard [TARDump] compress = true @@ -34,10 +37,10 @@ # Step 2.4.3 (faculatif) : Ajouter une trace dans log [swing/debug] -text = webhook triggered +text = WebHook triggered # Step 2.5 (faculatif) : Ajouter une trace dans log [debug] -text = process completed +text = Process completed diff --git a/data-workflow/swagger.json b/data-workflow/swagger.json index 6653a59..e58cb39 100644 --- a/data-workflow/swagger.json +++ b/data-workflow/swagger.json @@ -3,7 +3,7 @@ "info": { "title": "data-workflow - Enchainement de traitements asynchrones", "summary": "Les worflows permettent de traiter des fichiers corpus compressés en appelant des webservices d'enrichissement par documents (webservices synchrones)", - "version": "1.1.3", + "version": "1.2.3", "termsOfService": "https://services.istex.fr/", "contact": { "name": "Inist-CNRS", diff --git a/data-workflow/v1/base-line.ini b/data-workflow/v1/base-line.ini index e3bb2ff..a7814a5 100644 --- a/data-workflow/v1/base-line.ini +++ b/data-workflow/v1/base-line.ini @@ -31,10 +31,6 @@ path = generator value = base-line -[use] -plugin = basics -plugin = analytics - # Step 1 (générique): Charger le fichier corpus [delegate] file = charger.cfg @@ -44,6 +40,10 @@ standalone = true logger = logger.cfg +# Step 2.0 (optionnel): Accélére le détachement du fork si l'enrichissement est lent +[fork/delegate] +file = buffer.cfg + # Step 2.1 (spécifique): Lancer un calcul sur tous les items reçus [fork/exchange] value = self().omit('uri') diff --git a/data-workflow/v1/buffer.cfg b/data-workflow/v1/buffer.cfg index a56c958..f8b16b5 100644 --- a/data-workflow/v1/buffer.cfg +++ b/data-workflow/v1/buffer.cfg @@ -21,4 +21,6 @@ location = /tmp/upload [unpack] +[metrics] +bucket = buffer diff --git a/data-workflow/v1/charger.cfg b/data-workflow/v1/charger.cfg index c1f8093..7bff09a 100644 --- a/data-workflow/v1/charger.cfg +++ b/data-workflow/v1/charger.cfg @@ -1,6 +1,5 @@ [use] plugin = basics -plugin = analytics # Step 0 (générique) : Lire le fichier standard tar.gz [TARExtract] @@ -9,8 +8,20 @@ # Step 1 (générique) : Créer un identifiant unique pour le corpus reçu [singleton] + +# Step 1.1 : On évite de récupere un champ uri existant +[singleton/env] +path = pid +value = fix(`PID${Date.now()}`) + +# Step 1.2 : On génére un identifiant unique [singleton/identify] +path = env('pid') + +# Step 1.3: On garde en mémoire l'identifiant généré (en le simplifiant) [singleton/env] path = identifier -value = get('uri').replace('uid:/', '') +value = get(env('pid')).replace('uid:/', '') +[metrics] +bucket = charger diff --git a/data-workflow/v1/conditormetrie.cfg b/data-workflow/v1/conditormetrie.cfg index 1a0d9ac..c89ac5b 100644 --- a/data-workflow/v1/conditormetrie.cfg +++ b/data-workflow/v1/conditormetrie.cfg @@ -6,7 +6,6 @@ path = number2labelDR value = fix({"01": "DR01 Ile-de-France Villejuif","02": "DR02 Paris-Centre","04": "DR04 Ile-de-France Gif-sur-Yvette","05": "DR05 
Ile-de-France Meudon","16": "DR16 Paris-Normandie","06": "DR06 Centre Est","10": "DR10 Alsace","08": "DR08 Centre Limousin Poitou Charente","17": "DR17 Bretagne et Pays de la Loire","18": "DR18 Hauts-de-France","07": "DR07 Rhône Auvergne","11": "DR11 Alpes","12": "DR12 Provence et Corse","20": "DR20 Côte d'Azur","13": "DR13 Occitanie Est","14": "DR14 Occitanie Ouest","15": "DR15 Aquitaine"}) - [assign] # Récupère electronicPublicationDate et publicationDate # Prend la plus ancienne (= la plus petite) @@ -166,7 +165,7 @@ #Transformer des données inconnues de 'HostType' en repository si absence d'un DOI mais présence de Hal dans 'fulltext' [assign] path=ApilOaLocationsHal -value=get("enrichments.openAccess.unpaywall.oaLocations").map("hostType").concat([self.fulltextUrl].map((value)=>value && value.replace(/^((?!hal).)*$/,"@@@@").replace(/.*hal.*/,"repository"))).uniq().filter((value, index, collection)=>{if(!(value === "OA - Inconnu" && collection[index+1] === "repository" )){return true}}).filter(value=>value!=="@@@@").compact() +value=get("enrichments.openAccess.unpaywall.oaLocations").map("hostType").concat([self.fulltextUrl].map((value)=>value && String(value).replace(/^((?!hal).)*$/,"@@@@").replace(/.*hal.*/,"repository"))).uniq().filter((value, index, collection)=>{if(!(value === "OA - Inconnu" && collection[index+1] === "repository" )){return true}}).filter(value=>value!=="@@@@").compact() #Transformer des données inconnues en "green" si absence d'un DOI mais présence de "repository" dans 'ApilOaLocationsHal' [assign] diff --git a/data-workflow/v1/conditormetrie.ini b/data-workflow/v1/conditormetrie.ini index 65fc99c..6f64081 100644 --- a/data-workflow/v1/conditormetrie.ini +++ b/data-workflow/v1/conditormetrie.ini @@ -31,13 +31,6 @@ path = generator value = conditormetrie -path = language -value = en - -[use] -plugin = basics -plugin = analytics - # Step 1 (générique): Charger le fichier corpus [delegate] file = charger.cfg diff --git a/data-workflow/v1/logger.cfg b/data-workflow/v1/logger.cfg index b46e88a..6377dd9 100644 --- a/data-workflow/v1/logger.cfg +++ b/data-workflow/v1/logger.cfg @@ -2,6 +2,9 @@ plugin = basics plugin = analytics +[metrics] +bucket = logger + # On ne garde que la première erreur déclénchée [shift] diff --git a/data-workflow/v1/recorder.cfg b/data-workflow/v1/recorder.cfg index 66e4702..7811144 100644 --- a/data-workflow/v1/recorder.cfg +++ b/data-workflow/v1/recorder.cfg @@ -2,6 +2,9 @@ plugin = basics plugin = analytics +[metrics] +bucket = recorder + # Step 2.2 (générique): Création d'un fichier résulat standard [TARDump] compress = true diff --git a/diseases-ner/README.md b/diseases-ner/README.md index 6607594..a360f01 100644 --- a/diseases-ner/README.md +++ b/diseases-ner/README.md @@ -1,4 +1,4 @@ -# chem-ner +# diseases-ner Cette instance propose un outil de reconnaissance d'entités nommées de maladies. 
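A note on the `ApilOaLocationsHal` assignment changed in data-workflow/v1/conditormetrie.cfg above: the lodash chain is dense, and the only functional change is wrapping `value` in `String(value)` so `.replace` no longer throws when `fulltextUrl` is missing or not a string. The Python sketch below is only a readable paraphrase of that chain, not the production code; the function name and the `record` argument are illustrative, while the field names are taken from the config.

```python
# Sketch (illustrative only): paraphrases the ApilOaLocationsHal chain from
# data-workflow/v1/conditormetrie.cfg, assuming `record` carries the same fields.

def apil_oa_locations_hal(record):
    # hostType of every unpaywall OA location
    host_types = [loc.get("hostType") for loc in
                  record.get("enrichments", {}).get("openAccess", {})
                        .get("unpaywall", {}).get("oaLocations", [])]

    # The fix: coerce fulltextUrl to a string before pattern matching,
    # so a non-string value no longer raises.
    url = record.get("fulltextUrl")
    if url:
        marker = "repository" if "hal" in str(url) else "@@@@"
    else:
        marker = None  # falsy values are dropped later, like .compact()

    merged = host_types + [marker]

    # uniq() preserving order
    seen, uniq = set(), []
    for v in merged:
        if v not in seen:
            seen.add(v)
            uniq.append(v)

    # Drop "OA - Inconnu" when it is immediately followed by "repository",
    # then drop the "@@@@" placeholder and empty values.
    kept = [v for i, v in enumerate(uniq)
            if not (v == "OA - Inconnu"
                    and i + 1 < len(uniq)
                    and uniq[i + 1] == "repository")]
    return [v for v in kept if v and v != "@@@@"]
```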
diff --git a/fund-ner/README.md b/fund-ner/README.md index 40dddd7..1eab109 100644 --- a/fund-ner/README.md +++ b/fund-ner/README.md @@ -8,7 +8,7 @@ ## Utilisation -- [v1/funderTagger/funderTagger](#v1) +- [v1/tagger](#v1) ### v1 diff --git a/fund-ner/examples.http b/fund-ner/examples.http index 6085f08..a150612 100644 --- a/fund-ner/examples.http +++ b/fund-ner/examples.http @@ -5,7 +5,7 @@ @baseUrl=http://fund-ner.tdm-services.intra.inist.fr/ ## WS Affiliation - RNSR - v2 -POST {{baseUrl}}/v1/funderTagger/funderTagger?indent=true HTTP/1.1 +POST {{baseUrl}}/v1/tagger?indent=true HTTP/1.1 Content-Type: application/json [ diff --git a/fund-ner/swagger.json b/fund-ner/swagger.json index 2ecb2c6..d374e1e 100644 --- a/fund-ner/swagger.json +++ b/fund-ner/swagger.json @@ -3,7 +3,7 @@ "info": { "title": "fund-ner - Detection de financeurs", "summary": "Detection de financeurs", - "version": "1.0.0", + "version": "1.0.1", "termsOfService": "https://services.istex.fr/", "contact": { "name": "Inist-CNRS", @@ -15,7 +15,7 @@ "x-comment": "Will be automatically completed by the ezs server." }, { - "url": "http://vptdmservices.intra.inist.fr:49241/", + "url": "http://vptdmservices.intra.inist.fr:49243/", "description": "Latest version for production", "x-profil": "Standard" } diff --git a/fund-ner/v1/best-model.pt.dvc b/fund-ner/v1/best-model.pt.dvc new file mode 100644 index 0000000..a514b78 --- /dev/null +++ b/fund-ner/v1/best-model.pt.dvc @@ -0,0 +1,5 @@ +outs: +- md5: fdcb80f28184d1fb839f0575a9cffd39 + size: 419082833 + hash: md5 + path: best-model.pt diff --git a/fund-ner/v1/funderTagger/best-model.pt.dvc b/fund-ner/v1/funderTagger/best-model.pt.dvc deleted file mode 100644 index a514b78..0000000 --- a/fund-ner/v1/funderTagger/best-model.pt.dvc +++ /dev/null @@ -1,5 +0,0 @@ -outs: -- md5: fdcb80f28184d1fb839f0575a9cffd39 - size: 419082833 - hash: md5 - path: best-model.pt diff --git a/fund-ner/v1/funderTagger/funderTagger.ini b/fund-ner/v1/funderTagger/funderTagger.ini deleted file mode 100644 index 7a5ca20..0000000 --- a/fund-ner/v1/funderTagger/funderTagger.ini +++ /dev/null @@ -1,41 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -mimeType = application/json -post.description = Trouve des financeurs dans un texte -post.responses.default.description = Renvoie un Json id/value, où value est une liste de financeurs -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Recherche de financeurs -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = indent -post.parameters.0.schema.type = boolean -post.parameters.0.description = Indent or not the JSON Result - -# Examples - -post.requestBody.content.application/json.example.0.id = 1 -post.requestBody.content.application/json.example.0.value = This study was funded by the CNRS and INIST. -post.responses.default.content.application/json.example.0.id = 1 -post.responses.default.content.application/json.example.0.value.0 = CNRS -post.responses.default.content.application/json.example.0.value.1 = INIST - - -[use] -plugin = @ezs/spawn -plugin = @ezs/basics -plugin = @ezs/storage -plugin = @ezs/analytics -[JSONParse] -separator = * -[expand] -path = env('path', 'value') -size = 100 -# in production mode, uncomment the following line -# cache = boost -[expand/exec] -# command should be executable ! 
-command = ./v1/funderTagger/funderTagger.py - -#command = ./expand.py -[dump] -indent = env('indent', false) \ No newline at end of file diff --git a/fund-ner/v1/funderTagger/funderTagger.py b/fund-ner/v1/funderTagger/funderTagger.py deleted file mode 100644 index 65e0f1b..0000000 --- a/fund-ner/v1/funderTagger/funderTagger.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/opt/bitnami/python/bin/python3.7 - -from flair.data import Sentence -from flair.models import SequenceTagger -from normalize import normalize -import json -import sys -import logging - -logging.getLogger("flair").handlers[0].stream = sys.stderr -tagger = SequenceTagger.load('./v1/funderTagger/best-model.pt') -for line in sys.stdin: - data = json.loads(line) - lSent = normalize([data["value"]])[0].split() - sentence = Sentence() - for token in lSent: - sentence.add_token(token) - tagger.predict(sentence) - data["value"] = str([entity.text for entity in sentence.get_spans('ner')]) - sys.stdout.write(json.dumps(data)) - sys.stdout.write('\n') diff --git a/fund-ner/v1/funderTagger/normalize.py b/fund-ner/v1/funderTagger/normalize.py deleted file mode 100644 index b4391a2..0000000 --- a/fund-ner/v1/funderTagger/normalize.py +++ /dev/null @@ -1,37 +0,0 @@ -def normalize(words): - L = [] - for word in words: - save = word - finalFile = [] - end = ['.',',',";",":","»",")","’"] - start = ["«","(",".","‘"] - middle = ["l'","d'","j'","L'","D'","J'","l’","d’","j’","L’","D’","J’"] - queue = [] - - File = word.split() - for Word in File: - word = Word - for execp in start: - if word.startswith(execp): - finalFile.append(word[0]) - word = word[1:] - for execp in middle: - if word.startswith(execp): - finalFile.append(word[:2]) - word = word[2:] - for execp in end: - if word.endswith(execp): - queue.insert(0,word[-1]) - word = word[:-1] - - finalFile.append(word) - for i in queue: - finalFile.append(i) - queue = [] - - if finalFile == ["a"]: - #print("word = ",save) - pass - - L.append(" ".join(finalFile)) - return L \ No newline at end of file diff --git a/fund-ner/v1/normalize.py b/fund-ner/v1/normalize.py new file mode 100644 index 0000000..b4391a2 --- /dev/null +++ b/fund-ner/v1/normalize.py @@ -0,0 +1,37 @@ +def normalize(words): + L = [] + for word in words: + save = word + finalFile = [] + end = ['.',',',";",":","»",")","’"] + start = ["«","(",".","‘"] + middle = ["l'","d'","j'","L'","D'","J'","l’","d’","j’","L’","D’","J’"] + queue = [] + + File = word.split() + for Word in File: + word = Word + for execp in start: + if word.startswith(execp): + finalFile.append(word[0]) + word = word[1:] + for execp in middle: + if word.startswith(execp): + finalFile.append(word[:2]) + word = word[2:] + for execp in end: + if word.endswith(execp): + queue.insert(0,word[-1]) + word = word[:-1] + + finalFile.append(word) + for i in queue: + finalFile.append(i) + queue = [] + + if finalFile == ["a"]: + #print("word = ",save) + pass + + L.append(" ".join(finalFile)) + return L \ No newline at end of file diff --git a/fund-ner/v1/tagger.ini b/fund-ner/v1/tagger.ini new file mode 100644 index 0000000..7f761f3 --- /dev/null +++ b/fund-ner/v1/tagger.ini @@ -0,0 +1,41 @@ +# OpenAPI Documentation - JSON format (dot notation) +mimeType = application/json +post.description = Trouve des financeurs dans un texte +post.responses.default.description = Renvoie un Json id/value, où value est une liste de financeurs +post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.summary = Recherche de financeurs +post.requestBody.required 
= true +post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.parameters.0.in = query +post.parameters.0.name = indent +post.parameters.0.schema.type = boolean +post.parameters.0.description = Indent or not the JSON Result + +# Examples + +post.requestBody.content.application/json.example.0.id = 1 +post.requestBody.content.application/json.example.0.value = This study was funded by the CNRS and INIST. +post.responses.default.content.application/json.example.0.id = 1 +post.responses.default.content.application/json.example.0.value.0 = CNRS +post.responses.default.content.application/json.example.0.value.1 = INIST + + +[use] +plugin = @ezs/spawn +plugin = @ezs/basics +plugin = @ezs/storage +plugin = @ezs/analytics +[JSONParse] +separator = * +[expand] +path = env('path', 'value') +size = 100 +# in production mode, uncomment the following line +# cache = boost +[expand/exec] +# command should be executable ! +command = ./v1/tagger.py + +#command = ./expand.py +[dump] +indent = env('indent', false) diff --git a/fund-ner/v1/tagger.py b/fund-ner/v1/tagger.py new file mode 100644 index 0000000..bdd493c --- /dev/null +++ b/fund-ner/v1/tagger.py @@ -0,0 +1,21 @@ +#!/opt/bitnami/python/bin/python3.7 + +from flair.data import Sentence +from flair.models import SequenceTagger +from normalize import normalize +import json +import sys +import logging + +logging.getLogger("flair").handlers[0].stream = sys.stderr +tagger = SequenceTagger.load('./v1/best-model.pt') +for line in sys.stdin: + data = json.loads(line) + lSent = normalize([data["value"]])[0].split() + sentence = Sentence() + for token in lSent: + sentence.add_token(token) + tagger.predict(sentence) + data["value"] = str([entity.text for entity in sentence.get_spans('ner')]) + sys.stdout.write(json.dumps(data)) + sys.stdout.write('\n') diff --git a/loterre-resolvers-conf.json b/loterre-resolvers-conf.json deleted file mode 100644 index 832e0cd..0000000 --- a/loterre-resolvers-conf.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "environnement": { - "EZS_TITLE": "Loterre resolvers", - "EZS_DESCRIPTION": "Résolveurs pour des terminologies Loterre", - "EZS_METRICS": true, - "EZS_CONCURRENCY": 4, - "EZS_NSHARDS": 32, - "EZS_CACHE": true, - "EZS_VERBOSE": false, - "CRON_VERBOSE": false, - "NODE_OPTIONS": "--max_old_space_size=1024", - "NODE_ENV": "production" - }, - "packages": [ - "@ezs/core@3.0.0", - "@ezs/basics@2.1.0", - "@ezs/analytics@2.0.22", - "@ezs/xslt@1.3.2", - "@ezs/storage@3.1.0" - ], - "files" : { - "zip": "https://gitbucket.inist.fr/tdm/web-services/archive/loterre-resolvers/loterre-resolvers@4.4.2.zip" - } -} diff --git a/loterre-resolvers/config.json b/loterre-resolvers/config.json new file mode 100644 index 0000000..832e0cd --- /dev/null +++ b/loterre-resolvers/config.json @@ -0,0 +1,24 @@ +{ + "environnement": { + "EZS_TITLE": "Loterre resolvers", + "EZS_DESCRIPTION": "Résolveurs pour des terminologies Loterre", + "EZS_METRICS": true, + "EZS_CONCURRENCY": 4, + "EZS_NSHARDS": 32, + "EZS_CACHE": true, + "EZS_VERBOSE": false, + "CRON_VERBOSE": false, + "NODE_OPTIONS": "--max_old_space_size=1024", + "NODE_ENV": "production" + }, + "packages": [ + "@ezs/core@3.0.0", + "@ezs/basics@2.1.0", + "@ezs/analytics@2.0.22", + "@ezs/xslt@1.3.2", + "@ezs/storage@3.1.0" + ], + "files" : { + "zip": "https://gitbucket.inist.fr/tdm/web-services/archive/loterre-resolvers/loterre-resolvers@4.4.2.zip" + } +} diff --git a/pdf-tools-conf.json b/pdf-tools-conf.json deleted file mode 100644 index d2c53b0..0000000 --- 
a/pdf-tools-conf.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "environnement": { - "EZS_TITLE": "xp", - "EZS_DESCRIPTION": "Web services XP", - "EZS_METRICS": true, - "EZS_CONCURRENCY": 4, - "EZS_NSHARDS": 32, - "EZS_CACHE": true, - "NODE_OPTIONS": "--max_old_space_size=1024", - "NODE_ENV": "production" - }, - "packages": [ - "@ezs/spawn@1.3.3" - ] -} diff --git a/pdf-tools/config.json b/pdf-tools/config.json new file mode 100644 index 0000000..d2c53b0 --- /dev/null +++ b/pdf-tools/config.json @@ -0,0 +1,15 @@ +{ + "environnement": { + "EZS_TITLE": "xp", + "EZS_DESCRIPTION": "Web services XP", + "EZS_METRICS": true, + "EZS_CONCURRENCY": 4, + "EZS_NSHARDS": 32, + "EZS_CACHE": true, + "NODE_OPTIONS": "--max_old_space_size=1024", + "NODE_ENV": "production" + }, + "packages": [ + "@ezs/spawn@1.3.3" + ] +}
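Since the fund-ner route is renamed from v1/funderTagger/funderTagger to v1/tagger (ini, script, model path, and examples.http all move), existing clients have to update the path. The snippet below is a minimal call sketch, assuming the internal base URL shown in examples.http and the id/value payload from the example in tagger.ini; the expected response shape also follows that example and may differ in your deployment.

```python
# Sketch of a client call to the renamed fund-ner route (/v1/tagger).
# Base URL and payload shape are taken from examples.http and tagger.ini.
import json
import requests

BASE_URL = "http://fund-ner.tdm-services.intra.inist.fr"  # internal host from examples.http

payload = [
    {"id": 1, "value": "This study was funded by the CNRS and INIST."},
]

response = requests.post(
    f"{BASE_URL}/v1/tagger",
    params={"indent": "true"},
    headers={"Content-Type": "application/json"},
    data=json.dumps(payload),
    timeout=60,
)
response.raise_for_status()

# Per the example in tagger.ini, each input item comes back with "value"
# holding the detected funders, e.g. ["CNRS", "INIST"].
print(response.json())
```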