#!/usr/bin/python3
"""Deduplicate records streamed as JSON lines on standard input.

Each non-blank input line must be a JSON object with a ``"value"`` key.
That value is wrapped in a ``RecordDeduplicate`` bound to the configured
Elasticsearch client and index; the result of its ``deduplicate()`` method
replaces ``"value"`` and the object is written back to standard output as
one JSON document per line.
"""

import sys
import json
import os
from deduplicate.deduplicate import RecordDeduplicate
from deduplicate.connect2elastic import ES_request, Connection
import argparse

# Default forwarded as ``size`` to ES_request when --size is not given.
DEFAULT_SIZE = 100


def build_parser():
    """Return the command-line parser for this script."""
    # The text is a description, not a program name: ArgumentParser's first
    # positional parameter is ``prog``, so it must be passed by keyword.
    parser = argparse.ArgumentParser(
        description="Python script deduplicate conditor record"
    )
    parser.add_argument(
        "--url", "-u",
        help="Elasticsearch database URL where search request",
    )
    parser.add_argument("--index", help="Elasticsearch index")
    parser.add_argument(
        "--https_proxy", default=None,
        help="https proxy. Default value in None",
    )
    parser.add_argument(
        "--http_proxy", default=None,
        help="http proxy. Default value in None",
    )
    # type=int keeps a command-line value consistent with the int default.
    # NOTE(review): presumably the number of hits requested per search --
    # confirm against ES_request.
    parser.add_argument(
        "--size", "-s", type=int, default=DEFAULT_SIZE,
        help="size value forwarded to the Elasticsearch request client. "
             "Default value is 100",
    )
    # Accepted for command-line compatibility; this script never reads it
    # and takes its records from standard input instead.
    parser.add_argument("--input", help="A Corpus to deduplicate at json format")
    return parser


def main(argv=None):
    """Run the deduplication filter from standard input to standard output.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use
            ``sys.argv[1:]``.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a decoded object has no ``"value"`` key.
    """
    args = build_parser().parse_args(argv)

    proxies = {"https": args.https_proxy, "http": args.http_proxy}

    # Instantiate the Elasticsearch client and its request class.
    es = ES_request(
        es_url=args.url,
        connection_class=Connection,
        proxies=proxies,
        size=args.size,
    )

    for line in sys.stdin:
        # A blank line (e.g. a trailing newline) is not a JSON document.
        if not line.strip():
            continue
        data = json.loads(line)
        record = RecordDeduplicate(data["value"], index=args.index, es=es)
        data["value"] = record.deduplicate()
        sys.stdout.write(json.dumps(data))
        sys.stdout.write("\n")


if __name__ == "__main__":
    main()