diff --git a/fund-ner/README.md b/fund-ner/README.md index 40dddd7..1eab109 100644 --- a/fund-ner/README.md +++ b/fund-ner/README.md @@ -8,7 +8,7 @@ ## Utilisation -- [v1/funderTagger/funderTagger](#v1) +- [v1/tagger](#v1) ### v1 diff --git a/fund-ner/examples.http b/fund-ner/examples.http index 6085f08..a150612 100644 --- a/fund-ner/examples.http +++ b/fund-ner/examples.http @@ -5,7 +5,7 @@ @baseUrl=http://fund-ner.tdm-services.intra.inist.fr/ ## WS Affiliation - RNSR - v2 -POST {{baseUrl}}/v1/funderTagger/funderTagger?indent=true HTTP/1.1 +POST {{baseUrl}}/v1/tagger?indent=true HTTP/1.1 Content-Type: application/json [ diff --git a/fund-ner/v1/best-model.pt.dvc b/fund-ner/v1/best-model.pt.dvc new file mode 100644 index 0000000..a514b78 --- /dev/null +++ b/fund-ner/v1/best-model.pt.dvc @@ -0,0 +1,5 @@ +outs: +- md5: fdcb80f28184d1fb839f0575a9cffd39 + size: 419082833 + hash: md5 + path: best-model.pt diff --git a/fund-ner/v1/funderTagger/best-model.pt.dvc b/fund-ner/v1/funderTagger/best-model.pt.dvc deleted file mode 100644 index a514b78..0000000 --- a/fund-ner/v1/funderTagger/best-model.pt.dvc +++ /dev/null @@ -1,5 +0,0 @@ -outs: -- md5: fdcb80f28184d1fb839f0575a9cffd39 - size: 419082833 - hash: md5 - path: best-model.pt diff --git a/fund-ner/v1/funderTagger/funderTagger.ini b/fund-ner/v1/funderTagger/funderTagger.ini deleted file mode 100644 index 7a5ca20..0000000 --- a/fund-ner/v1/funderTagger/funderTagger.ini +++ /dev/null @@ -1,41 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -mimeType = application/json -post.description = Trouve des financeurs dans un texte -post.responses.default.description = Renvoie un Json id/value, où value est une liste de financeurs -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Recherche de financeurs -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = indent -post.parameters.0.schema.type = boolean -post.parameters.0.description = Indent or not the JSON Result - -# Examples - -post.requestBody.content.application/json.example.0.id = 1 -post.requestBody.content.application/json.example.0.value = This study was funded by the CNRS and INIST. -post.responses.default.content.application/json.example.0.id = 1 -post.responses.default.content.application/json.example.0.value.0 = CNRS -post.responses.default.content.application/json.example.0.value.1 = INIST - - -[use] -plugin = @ezs/spawn -plugin = @ezs/basics -plugin = @ezs/storage -plugin = @ezs/analytics -[JSONParse] -separator = * -[expand] -path = env('path', 'value') -size = 100 -# in production mode, uncomment the following line -# cache = boost -[expand/exec] -# command should be executable ! -command = ./v1/funderTagger/funderTagger.py - -#command = ./expand.py -[dump] -indent = env('indent', false) \ No newline at end of file diff --git a/fund-ner/v1/funderTagger/funderTagger.py b/fund-ner/v1/funderTagger/funderTagger.py deleted file mode 100644 index 65e0f1b..0000000 --- a/fund-ner/v1/funderTagger/funderTagger.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/opt/bitnami/python/bin/python3.7 - -from flair.data import Sentence -from flair.models import SequenceTagger -from normalize import normalize -import json -import sys -import logging - -logging.getLogger("flair").handlers[0].stream = sys.stderr -tagger = SequenceTagger.load('./v1/funderTagger/best-model.pt') -for line in sys.stdin: - data = json.loads(line) - lSent = normalize([data["value"]])[0].split() - sentence = Sentence() - for token in lSent: - sentence.add_token(token) - tagger.predict(sentence) - data["value"] = str([entity.text for entity in sentence.get_spans('ner')]) - sys.stdout.write(json.dumps(data)) - sys.stdout.write('\n') diff --git a/fund-ner/v1/funderTagger/normalize.py b/fund-ner/v1/funderTagger/normalize.py deleted file mode 100644 index b4391a2..0000000 --- a/fund-ner/v1/funderTagger/normalize.py +++ /dev/null @@ -1,37 +0,0 @@ -def normalize(words): - L = [] - for word in words: - save = word - finalFile = [] - end = ['.',',',";",":","»",")","’"] - start = ["«","(",".","‘"] - middle = ["l'","d'","j'","L'","D'","J'","l’","d’","j’","L’","D’","J’"] - queue = [] - - File = word.split() - for Word in File: - word = Word - for execp in start: - if word.startswith(execp): - finalFile.append(word[0]) - word = word[1:] - for execp in middle: - if word.startswith(execp): - finalFile.append(word[:2]) - word = word[2:] - for execp in end: - if word.endswith(execp): - queue.insert(0,word[-1]) - word = word[:-1] - - finalFile.append(word) - for i in queue: - finalFile.append(i) - queue = [] - - if finalFile == ["a"]: - #print("word = ",save) - pass - - L.append(" ".join(finalFile)) - return L \ No newline at end of file diff --git a/fund-ner/v1/normalize.py b/fund-ner/v1/normalize.py new file mode 100644 index 0000000..b4391a2 --- /dev/null +++ b/fund-ner/v1/normalize.py @@ -0,0 +1,37 @@ +def normalize(words): + L = [] + for word in words: + save = word + finalFile = [] + end = ['.',',',";",":","»",")","’"] + start = ["«","(",".","‘"] + middle = ["l'","d'","j'","L'","D'","J'","l’","d’","j’","L’","D’","J’"] + queue = [] + + File = word.split() + for Word in File: + word = Word + for execp in start: + if word.startswith(execp): + finalFile.append(word[0]) + word = word[1:] + for execp in middle: + if word.startswith(execp): + finalFile.append(word[:2]) + word = word[2:] + for execp in end: + if word.endswith(execp): + queue.insert(0,word[-1]) + word = word[:-1] + + finalFile.append(word) + for i in queue: + finalFile.append(i) + queue = [] + + if finalFile == ["a"]: + #print("word = ",save) + pass + + L.append(" ".join(finalFile)) + return L \ No newline at end of file diff --git a/fund-ner/v1/tagger.ini b/fund-ner/v1/tagger.ini new file mode 100644 index 0000000..7f761f3 --- /dev/null +++ b/fund-ner/v1/tagger.ini @@ -0,0 +1,41 @@ +# OpenAPI Documentation - JSON format (dot notation) +mimeType = application/json +post.description = Trouve des financeurs dans un texte +post.responses.default.description = Renvoie un Json id/value, où value est une liste de financeurs +post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.summary = Recherche de financeurs +post.requestBody.required = true +post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.parameters.0.in = query +post.parameters.0.name = indent +post.parameters.0.schema.type = boolean +post.parameters.0.description = Indent or not the JSON Result + +# Examples + +post.requestBody.content.application/json.example.0.id = 1 +post.requestBody.content.application/json.example.0.value = This study was funded by the CNRS and INIST. +post.responses.default.content.application/json.example.0.id = 1 +post.responses.default.content.application/json.example.0.value.0 = CNRS +post.responses.default.content.application/json.example.0.value.1 = INIST + + +[use] +plugin = @ezs/spawn +plugin = @ezs/basics +plugin = @ezs/storage +plugin = @ezs/analytics +[JSONParse] +separator = * +[expand] +path = env('path', 'value') +size = 100 +# in production mode, uncomment the following line +# cache = boost +[expand/exec] +# command should be executable ! +command = ./v1/tagger.py + +#command = ./expand.py +[dump] +indent = env('indent', false) diff --git a/fund-ner/v1/tagger.py b/fund-ner/v1/tagger.py new file mode 100644 index 0000000..bdd493c --- /dev/null +++ b/fund-ner/v1/tagger.py @@ -0,0 +1,21 @@ +#!/opt/bitnami/python/bin/python3.7 + +from flair.data import Sentence +from flair.models import SequenceTagger +from normalize import normalize +import json +import sys +import logging + +logging.getLogger("flair").handlers[0].stream = sys.stderr +tagger = SequenceTagger.load('./v1/best-model.pt') +for line in sys.stdin: + data = json.loads(line) + lSent = normalize([data["value"]])[0].split() + sentence = Sentence() + for token in lSent: + sentence.add_token(token) + tagger.predict(sentence) + data["value"] = str([entity.text for entity in sentence.get_spans('ner')]) + sys.stdout.write(json.dumps(data)) + sys.stdout.write('\n')