diff --git a/geoEntity-tagger/README.md b/geoEntity-tagger/README.md new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/geoEntity-tagger/README.md diff --git a/geoEntity-tagger/examples.http b/geoEntity-tagger/examples.http new file mode 100644 index 0000000..49f1255 --- /dev/null +++ b/geoEntity-tagger/examples.http @@ -0,0 +1,14 @@ +# Détection d'entité géographique d'un texte +POST https://geoEntity-tagger.services.inist.fr/v1/geoTagger?indent=true HTTP/1.1 +Content-Type: application/json + +[ + { + "id":"1", + "value":"In the southern French Massif Central, the Montagne Noire axial zone is a NE-SW elongated granite-migmatite dome emplaced within Visean south-verging recumbent folds and intruded by syn- to late-migmatization granitoids. The tectonic setting of this dome is still disputed, thus several models have been proposed. In order to better understand the emplacement mechanism of this dome, petrofabric and Anisotropy of Magnetic Susceptibility (AMS) studies have been carried out. In the granites and migmatites that form the dome core, magmatic texture and to a lesser extent weak solid-state texture are dominant. As a paramagnetic mineral, biotite is the main carrier of the magnetic susceptibility. On the basis of 135 AMS sites, the magnetic fabrics appear as independent of the lithology but related to the dome architecture. Coupling our results with previous structural and geochronological studies, allows us to propose a new emplacement model. Between 340-325 Ma, the Palaeozoic series underwent a compressional deformation represented by nappes and recumbent folds involving the thermal event leading to partial melting. Until ~325-310 Ma, the dome emplacement was assisted by diapiric processes. An extensional event took place at 300 Ma, after the emplacement of the late to post-migmatitic granitic plutons. In the northeast side of the dome, a brittle normal-dextral faulting controlled the opening of the Graissessac coal-basin." 
+ }, + { + "id":"2", + "value":"The COVID-19 pandemic, also known as the coronavirus pandemic, is an ongoing global pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus2 (SARS-CoV-2). It was first identified in December 2019 in Wuhan, China. The World Health Organization declared the outbreak a Public Health Emergency of International Concern on 20 January 2020, and later a pandemic on 11 March 2020. As of 2 April 2021, more than 129 million cases have been confirmed, with more than 2.82 million deaths attributed to COVID-19, making it one of the deadliest pandemics in history." + } +] \ No newline at end of file diff --git a/geoEntity-tagger/requirements.txt b/geoEntity-tagger/requirements.txt new file mode 100644 index 0000000..843973f --- /dev/null +++ b/geoEntity-tagger/requirements.txt @@ -0,0 +1 @@ +flair==0.10 \ No newline at end of file diff --git a/geoEntity-tagger/swagger.json b/geoEntity-tagger/swagger.json new file mode 100644 index 0000000..2010e98 --- /dev/null +++ b/geoEntity-tagger/swagger.json @@ -0,0 +1,20 @@ +{ + "servers": [{ + "url": "{scheme}://{hostname}", + "variables": { + "scheme": { + "default": "https" + } + } + }], + "tags": [ + { + "name": "geoTagger", + "description": "Identification d'entité géographique", + "externalDocs": { + "description": "Plus de documentation", + "url": "https://gitbucket.inist.fr/tdm/web-services/tree/master/geoEntity-tagger" + } + } + ] +} \ No newline at end of file diff --git a/geoEntity-tagger/v1/geoTagger.ini b/geoEntity-tagger/v1/geoTagger.ini new file mode 100644 index 0000000..6fbe33d --- /dev/null +++ b/geoEntity-tagger/v1/geoTagger.ini @@ -0,0 +1,37 @@ +# OpenAPI Documentation - JSON format (dot notation) +mimeType = application/json + +post.description = Détecte les entités géographiques d'un texte en anglais +post.responses.default.description = Renvoie un Json composé d'`id`, `value` avec `value` la liste des entités géographiques trouvées 
+post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream
+post.summary = Détection d'entité géographique
+post.requestBody.required = true
+post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream
+post.parameters.0.in = query
+post.parameters.0.name = path
+post.parameters.0.schema.type = string
+post.parameters.0.description = The path in each object to enrich with a Python script
+post.parameters.1.in = query
+post.parameters.1.name = indent
+post.parameters.1.schema.type = boolean
+post.parameters.1.description = Indent or not the JSON Result
+
+[use]
+plugin = @ezs/spawn
+plugin = @ezs/basics
+plugin = @ezs/storage
+plugin = @ezs/analytics
+
+[JSONParse]
+separator = *
+
+[expand]
+path = env('path', 'value')
+size = 100
+
+[expand/exec]
+# command should be executable !
+command = ./v1/geoTagger.py
+
+[dump]
+indent = env('indent', false)
\ No newline at end of file
diff --git a/geoEntity-tagger/v1/geoTagger.py b/geoEntity-tagger/v1/geoTagger.py
new file mode 100755
index 0000000..277d3df
--- /dev/null
+++ b/geoEntity-tagger/v1/geoTagger.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+"""Tag geographic entities in a stream of JSON lines.
+
+Each line read from stdin is a JSON object; the text found under its
+``value`` key is replaced by the list of spans that the flair
+``flair/ner-english`` model tags as ``LOC``, and the object is written
+back to stdout as one JSON line.
+"""
+import sys
+import json
+
+from flair.models import SequenceTagger
+from flair.data import Sentence
+
+# Load the model once, before the read loop, so every input line reuses it.
+tagger = SequenceTagger.load("flair/ner-english")
+
+for line in sys.stdin:
+    data = json.loads(line)
+    text = data.get('value')
+    geo = []
+    # Only real, non-blank strings are tagged; a missing or non-text value
+    # yields an empty list instead of crashing the script.
+    if isinstance(text, str) and text.strip():
+        sentence = Sentence(text)
+        tagger.predict(sentence)
+        geo = [
+            span.text
+            for span in sentence.get_spans('ner')
+            if span.tag == "LOC"
+        ]
+    data['value'] = geo
+    sys.stdout.write(json.dumps(data))
+    sys.stdout.write('\n')