diff --git a/chem-ner/README.md b/chem-ner/README.md new file mode 100644 index 0000000..6a3192b --- /dev/null +++ b/chem-ner/README.md @@ -0,0 +1,35 @@ +# chem-ner + +Cette instance propose un outil de reconnaissance d'entités nommées en chimie. + +## Configuration + +L'application à utiliser est XXX. + +## Utilisation + +- [v1/chem/tagger](#v1) + +### v1 + +Ce web-service renvoie la liste des entités nommées en chimie présentes dans le texte. + +Il prend en entrée du JSON avec deux champs, `id` et `value`, et renvoie un JSON avec la liste des entités en fonction de leurs étiquettes dans le champ `value`. + +#### Exemple de v1/chem/tagger + +Entrée + +```bash +$ cat <=512: + text_split = text.split('.') + else: + text_split = [text] + return text_split + +# predicts text after, whether it is split or not +def predict_formula_ml_list(list): + output = [] + for elt in list: + output+= predict_formula_ml(elt) + return output + +# remove bad spaces in outputs +def curate_list(input_list): + output_list = [] + for elt in input_list: + if '#' not in elt: + output_list.append( + elt.replace('- ','-').replace(' -','-').replace('( ','(').replace(' (','(').replace(') ',')').replace(' )',')').replace('[ ','[') + .replace(' [','[').replace('] ',']').replace(' ]',']') + ) + return output_list + +#Disambiguate formulas : + +#preprocessing : remove duplicate elements +def remove_duplicates(input_list): + output_list = [] + normalized_list = [] + for elt in input_list: + if normalizeText(elt) not in normalized_list: + output_list.append(elt) + normalized_list.append(normalizeText(elt)) + return output_list + +def disambiguisate_formula(input_list): + output_list = [] + for elt in input_list: + try: + output_list.append(dict_name_iupac[normalizeText(elt)]) + except: + continue + return output_list + + + +# beginning of the ws +for line in sys.stdin: + data = json.loads(line) + # Use the model to find NER + value = 
remove_duplicates(curate_list(predict_formula_ml_list(split_text(data["value"])))) + # Standardization + data["value"] = {"chemical":value, "chemical_disambiguisate":remove_duplicates(disambiguisate_formula(value))} + json.dump(data, sys.stdout, ensure_ascii=False) + sys.stdout.write("\n") diff --git a/data-wrapper/v1/csv.ini b/data-wrapper/v1/csv.ini index e9ead95..6cc3eb2 100644 --- a/data-wrapper/v1/csv.ini +++ b/data-wrapper/v1/csv.ini @@ -50,9 +50,12 @@ value = get(env('value', 'value')) [exchange] -value = self().thru(x => _.env('slim') ? _.pick(x, ['id', 'value']) : x) +value = self().thru(x => _.env(null, 'slim') ? _.pick(x, ['id', 'value']) : x) [TARDump] compress = true manifest = fix({version: '1'}) manifest = fix({generator: 'v1/csv'}) +manifest = fix({parameters: _.omit(_.env(), 'headers')}) +manifest = fix({hostAgent: _.get(_.env(), 'headers.host')}) +manifest = fix({userAgent: _.get(_.env(), 'headers.user-agent')}) diff --git a/data-wrapper/v1/istex-tar-gz.ini b/data-wrapper/v1/istex-tar-gz.ini index 95ef476..d72d223 100644 --- a/data-wrapper/v1/istex-tar-gz.ini +++ b/data-wrapper/v1/istex-tar-gz.ini @@ -32,7 +32,6 @@ post.parameters.2.schema.default = true post.parameters.2.required = false - [env] path = slim value = env('slim').thru(x => (x === 'false' ? false : true)) @@ -52,9 +51,12 @@ value = get(env('value', 'abstract')) [exchange] -value = self().thru(x => _.env('slim') ? _.pick(x, ['id', 'value']) : x) +value = self().thru(x => _.env(null, 'slim') ? 
_.pick(x, ['id', 'value']) : x) [TARDump] compress = true manifest = fix({version: '1'}) manifest = fix({generator: 'v1/istex-tar-gz'}) +manifest = fix({parameters: _.omit(_.env(), 'headers')}) +manifest = fix({hostAgent: _.get(_.env(), 'headers.host')}) +manifest = fix({userAgent: _.get(_.env(), 'headers.user-agent')}) diff --git a/diseases-ner/README.md b/diseases-ner/README.md new file mode 100644 index 0000000..6607594 --- /dev/null +++ b/diseases-ner/README.md @@ -0,0 +1,35 @@ +# diseases-ner + +Cette instance propose un outil de reconnaissance d'entités nommées de maladies. + +## Configuration + +L'application à utiliser est XXX. + +## Utilisation + +- [v1/diseases/tagger](#v1) + +### v1 + +Ce web-service renvoie la liste des entités nommées de maladies présentes dans le texte. + +Il prend en entrée du JSON avec deux champs, `id` et `value`, et renvoie un JSON avec la liste des entités en fonction de leurs étiquettes dans le champ `value`. + +#### Exemple de v1/diseases/tagger + +Entrée + +```bash +$ cat <=512: + text_split = text.split('.') + else: + text_split = [text] + return text_split + +# predicts text after, whether it is split or not +def predict_formula_ml_list(list): + output = [] + for elt in list: + output+= predict_formula_ml(elt) + return output + +# remove bad spaces in outputs +def curate_list(input_list): + output_list = [] + for elt in input_list: + if '#' not in elt: + output_list.append( + elt.replace('- ','-').replace(' -','-').replace('( ','(').replace(' (','(').replace(') ',')').replace(' )',')').replace('[ ','[') + .replace(' [','[').replace('] ',']').replace(' ]',']') + ) + return output_list + +#Disambiguate formulas : + +#preprocessing : remove duplicate elements +def remove_duplicates(input_list): + output_list = [] + normalized_list = [] + for elt in input_list: + if normalizeText(elt) not in normalized_list: + output_list.append(elt) + normalized_list.append(normalizeText(elt)) + return output_list + + +# beginning of the ws 
# Web-service entry point: read one JSON document per line on stdin and write
# one JSON document per line on stdout, with the "value" field replaced by the
# entities found in the input text.
for line in sys.stdin:
    # A blank line (for example a trailing newline in the stream) is not a
    # record; json.loads would raise on it and abort the whole stream.
    if not line.strip():
        continue
    data = json.loads(line)
    # Use the model to find the named entities in the (possibly split) text,
    # clean up the predictions, then drop duplicates.
    value = remove_duplicates(
        curate_list(predict_formula_ml_list(split_text(data["value"])))
    )
    # Standardization: the output "value" is a mapping from label to entities.
    data["value"] = {"diseases": value}
    json.dump(data, sys.stdout, ensure_ascii=False)
    sys.stdout.write("\n")