diff --git a/co-deduplicate/conditor.ini b/co-deduplicate/conditor.ini new file mode 100644 index 0000000..47dbf82 --- /dev/null +++ b/co-deduplicate/conditor.ini @@ -0,0 +1,52 @@ +# OpenAPI Documentation - JSON format (dot notation) +post.responses.default.description = Return all objects with enriched fields +post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.summary = Enrich one field of each Object with a Python function +post.requestBody.required = true +post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.parameters.0.in = query +post.parameters.0.name = path +post.parameters.0.schema.type = string +post.parameters.0.description = The path in each object to enrich with a Python script +post.parameters.1.in = query +post.parameters.1.name = indent +post.parameters.1.schema.type = boolean +post.parameters.1.description = Whether to indent the JSON result + + +[use] +plugin = @ezs/local +plugin = @ezs/basics +plugin = @ezs/storage +plugin = @ezs/analytics + +[JSONParse] +separator = * + +[expand] +path = env('path', 'value') +size = 100 + +# in production mode, uncomment the following line +# cache = boost + +[expand/exec] +# command should be executable ! 
+command = ./v1/conditor.py +args = fix('--url') +args = env('--url',"http://vp-conditor-es.intra.inist.fr:9200") +args = fix('--index') +args = env('--index','records-202012') +; args = fix('--login') +; args = env('--login','elastic') +; args = fix('--password') +; args = env('--password','secret') +args = fix('--size') +args = env('--size','100') +; args = fix('--http_proxy') +; args = env('--http_proxy','') +; args = fix('--https_proxy') +; args = env('--https_proxy','') + +[dump] +indent = env('indent', false) diff --git a/co-deduplicate/corhal.ini b/co-deduplicate/corhal.ini new file mode 100644 index 0000000..cf118dd --- /dev/null +++ b/co-deduplicate/corhal.ini @@ -0,0 +1,52 @@ +# OpenAPI Documentation - JSON format (dot notation) +post.responses.default.description = Return all objects with enriched fields +post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.summary = Enrich one field of each Object with a Python function +post.requestBody.required = true +post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.parameters.0.in = query +post.parameters.0.name = path +post.parameters.0.schema.type = string +post.parameters.0.description = The path in each object to enrich with a Python script +post.parameters.1.in = query +post.parameters.1.name = indent +post.parameters.1.schema.type = boolean +post.parameters.1.description = Whether to indent the JSON result + + +[use] +plugin = @ezs/local +plugin = @ezs/basics +plugin = @ezs/storage +plugin = @ezs/analytics + +[JSONParse] +separator = * + +[expand] +path = env('path', 'value') +size = 100 + +# in production mode, uncomment the following line +# cache = boost + +[expand/exec] +# command should be executable ! 
+command = ./v1/corhal.py +args = fix('--url') +args = env('--url',"https://vicorhales1.intra.inist.fr:9200") +args = fix('--index') +args = env('--index','records-fromconditor') +args = fix('--login') +args = env('--login','elastic') +args = fix('--password') +args = env('--password','secret') +args = fix('--size') +args = env('--size','100') +; args = fix('--http_proxy') +; args = env('--http_proxy','') +; args = fix('--https_proxy') +; args = env('--https_proxy','') + +[dump] +indent = env('indent', false) diff --git a/co-deduplicate/dduplicate.ini b/co-deduplicate/dduplicate.ini deleted file mode 100644 index fb5784e..0000000 --- a/co-deduplicate/dduplicate.ini +++ /dev/null @@ -1,48 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -post.responses.default.description = Return all objects with enrich fields -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Enrich one field of each Object with a Python function -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = path -post.parameters.0.schema.type = string -post.parameters.0.description = The path in each object to enrich with an Python script -post.parameters.1.in = query -post.parameters.1.name = indent -post.parameters.1.schema.type = boolean -post.parameters.1.description = Indent or not the JSON Result - - -[use] -plugin = @ezs/local -plugin = @ezs/basics -plugin = @ezs/storage -plugin = @ezs/analytics - -[JSONParse] -separator = * - -[expand] -path = env('path', 'value') -size = 100 - -# in production mode, uncomment the following line -# cache = boost - -[expand/exec] -# command should be executable ! 
-command = ./v1/dduplicate.py -args = fix('--url') -args = env('--url',"http://vp-conditor-es.intra.inist.fr:9200") -args = fix('--index') -args = env('--index','records-202012') -args = fix('--size') -args = env('--size','100') -; args = fix('--http_proxy') -; args = env('--http_proxy','') -; args = fix('--https_proxy') -; args = env('--https_proxy','') - -[dump] -indent = env('indent', false) diff --git a/co-deduplicate/requirements.txt b/co-deduplicate/requirements.txt index 20a95d1..39de335 100644 --- a/co-deduplicate/requirements.txt +++ b/co-deduplicate/requirements.txt @@ -17,5 +17,5 @@ tqdm==4.62.3 Unidecode==1.3.2 urllib3==1.26.7 -co-deduplicate==0.0.3 +co-deduplicate==1.0.1 argparse==1.4.0 diff --git a/co-deduplicate/v1/conditor.py b/co-deduplicate/v1/conditor.py new file mode 100644 index 0000000..e46108b --- /dev/null +++ b/co-deduplicate/v1/conditor.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 + +import sys +import json +from deduplicate.conditor_dduplicate import ConditorRecordDeduplicate +from deduplicate.connect2elastic import ES_request, Connection +import argparse + +parser = argparse.ArgumentParser("Python script deduplicate conditor record") +parser.add_argument("--url", "-u", help= "Elasticsearch database URL where search request") +parser.add_argument("--index", help="Elasticsearch index") +parser.add_argument("--https_proxy", default = None, help="https proxy. Default value in None") +parser.add_argument("--http_proxy", default = None, help ="http proxy. 
Default value in None") +parser.add_argument("--size", "-s", help="output file", default=100) +parser.add_argument("--input", help="A Corpus to deduplicate at json format") +#parser.add_argument("--output", help="A result corpus from algorithm") +args = parser.parse_args() + +URL = args.url +INDEX = args.index +SIZE = args.size +HTTPS_PROXY = args.https_proxy +HTTP_PROXY = args.http_proxy + +proxies = {"https" : HTTPS_PROXY, "http" : HTTP_PROXY} + +# Instanciate elastic client and its request class +es = ES_request( + es_url=URL, + connection_class= Connection, + proxies = proxies, + size = SIZE +) + + +for line in sys.stdin : + data = json.loads(line) + record = ConditorRecordDeduplicate(data['value'], index = INDEX, es = es) + dup = record.deduplicate() + data["value"] = dup + sys.stdout.write(json.dumps(data)) + sys.stdout.write('\n') diff --git a/co-deduplicate/v1/corhal.py b/co-deduplicate/v1/corhal.py new file mode 100644 index 0000000..ba068d2 --- /dev/null +++ b/co-deduplicate/v1/corhal.py @@ -0,0 +1,52 @@ +#!/usr/bin/python3 + +import sys +import json +from deduplicate.corhal_dduplicate import CorhalRecordDeduplicate +from deduplicate.connect2elastic import ES_request, Connection +import argparse +import warnings + +warnings.filterwarnings("ignore") + +parser = argparse.ArgumentParser("Python script deduplicate conditor record") +parser.add_argument("--url", "-u", help= "Elasticsearch database URL where search request") +parser.add_argument("--index", help="Elasticsearch index") +parser.add_argument("--login", "-l", default = None, help="Login to connect to elasticsearch") +parser.add_argument("--password", "-pw", default = None, help="Login to connect to elasticsearch") +parser.add_argument("--https_proxy", default = None, help="https proxy. Default value in None") +parser.add_argument("--http_proxy", default = None, help ="http proxy. 
Default value in None") +parser.add_argument("--size", "-s", help="output file", default=100) +parser.add_argument("--input", help="A Corpus to deduplicate at json format") +#parser.add_argument("--output", help="A result corpus from algorithm") +args = parser.parse_args() + +URL = args.url +INDEX = args.index +LOGIN = args.login +PASSWORD = args.password +SIZE = args.size +HTTPS_PROXY = args.https_proxy +HTTP_PROXY = args.http_proxy + +proxies = {"https" : HTTPS_PROXY, "http" : HTTP_PROXY} + +# Instanciate elastic client and its request class +es = ES_request( + es_url=URL, + connection_class= Connection, + proxies = proxies, + size = SIZE, + http_auth=(LOGIN, PASSWORD), + use_ssl=True, + verify_certs=False +) + + +for line in sys.stdin : + data = json.loads(line) + record = CorhalRecordDeduplicate(data['value'], index = INDEX, es = es) + dup = record.deduplicate() + data["value"] = dup + sys.stdout.write(json.dumps(data)) + sys.stdout.write('\n') diff --git a/co-deduplicate/v1/dduplicate.py b/co-deduplicate/v1/dduplicate.py deleted file mode 100644 index ee609d1..0000000 --- a/co-deduplicate/v1/dduplicate.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/python3 - -import sys -import json -import os -from deduplicate.deduplicate import RecordDeduplicate -from deduplicate.connect2elastic import ES_request, Connection -import argparse - -parser = argparse.ArgumentParser("Python script deduplicate conditor record") -parser.add_argument("--url", "-u", help= "Elasticsearch database URL where search request") -parser.add_argument("--index", help="Elasticsearch index") -parser.add_argument("--https_proxy", default = None, help="https proxy. Default value in None") -parser.add_argument("--http_proxy", default = None, help ="http proxy. 
Default value in None") -parser.add_argument("--size", "-s", help="output file", default=100) -parser.add_argument("--input", help="A Corpus to deduplicate at json format") -#parser.add_argument("--output", help="A result corpus from algorithm") -args = parser.parse_args() - -URL = args.url -INDEX = args.index -SIZE = args.size -HTTPS_PROXY = args.https_proxy -HTTP_PROXY = args.http_proxy - -proxies = {"https" : HTTPS_PROXY, "http" : HTTP_PROXY} - -# Instanciate elastic client and its request class -es = ES_request( - es_url=URL, - connection_class= Connection, - proxies = proxies, - size = SIZE -) - - -for line in sys.stdin : - data = json.loads(line) - record = RecordDeduplicate(data['value'], index = INDEX, es = es) - dup = record.deduplicate() - data["value"] = dup - sys.stdout.write(json.dumps(data)) - sys.stdout.write('\n')