diff --git a/co-deduplicate/deduplicate.py b/co-deduplicate/deduplicate.py
deleted file mode 100644
index ee609d1..0000000
--- a/co-deduplicate/deduplicate.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/python3
-
-import sys
-import json
-import os
-from deduplicate.deduplicate import RecordDeduplicate
-from deduplicate.connect2elastic import ES_request, Connection
-import argparse
-
-parser = argparse.ArgumentParser("Python script deduplicate conditor record")
-parser.add_argument("--url", "-u", help= "Elasticsearch database URL where search request")
-parser.add_argument("--index", help="Elasticsearch index")
-parser.add_argument("--https_proxy", default = None, help="https proxy. Default value in None")
-parser.add_argument("--http_proxy", default = None, help ="http proxy. Default value in None")
-parser.add_argument("--size", "-s", help="output file", default=100)
-parser.add_argument("--input", help="A Corpus to deduplicate at json format")
-#parser.add_argument("--output", help="A result corpus from algorithm")
-args = parser.parse_args()
-
-URL = args.url
-INDEX = args.index
-SIZE = args.size
-HTTPS_PROXY = args.https_proxy
-HTTP_PROXY = args.http_proxy
-
-proxies = {"https" : HTTPS_PROXY, "http" : HTTP_PROXY}
-
-# Instanciate elastic client and its request class
-es = ES_request(
-    es_url=URL,
-    connection_class= Connection,
-    proxies = proxies,
-    size = SIZE
-)
-
-
-for line in sys.stdin :
-    data = json.loads(line)
-    record = RecordDeduplicate(data['value'], index = INDEX, es = es)
-    dup = record.deduplicate()
-    data["value"] = dup
-    sys.stdout.write(json.dumps(data))
-    sys.stdout.write('\n')
diff --git a/co-deduplicate/v1/duplicate.py b/co-deduplicate/v1/duplicate.py
new file mode 100644
--- /dev/null
+++ b/co-deduplicate/v1/duplicate.py
@@ -0,0 +1,55 @@
+#!/usr/bin/python3
+"""Deduplicate Conditor records streamed as JSON lines.
+
+Reads one JSON object per line from standard input, replaces its "value"
+entry with the result of RecordDeduplicate.deduplicate(), and writes the
+object back to standard output as a single JSON line.
+"""
+
+import sys
+import json
+import os
+from deduplicate.deduplicate import RecordDeduplicate
+from deduplicate.connect2elastic import ES_request, Connection
+import argparse
+
+parser = argparse.ArgumentParser("Python script deduplicate conditor record")
+parser.add_argument("--url", "-u", help= "Elasticsearch database URL where search request")
+parser.add_argument("--index", help="Elasticsearch index")
+parser.add_argument("--https_proxy", default = None, help="https proxy. Default value in None")
+parser.add_argument("--http_proxy", default = None, help ="http proxy. Default value in None")
+# type=int keeps a value given on the command line the same type as the integer default.
+parser.add_argument("--size", "-s", type=int, default=100,
+                    help="size value passed to the Elasticsearch request class. Default value is 100")
+# NOTE(review): --input is accepted but never read below; records come from stdin.
+parser.add_argument("--input", help="A Corpus to deduplicate at json format")
+#parser.add_argument("--output", help="A result corpus from algorithm")
+args = parser.parse_args()
+
+URL = args.url
+INDEX = args.index
+SIZE = args.size
+HTTPS_PROXY = args.https_proxy
+HTTP_PROXY = args.http_proxy
+
+proxies = {"https" : HTTPS_PROXY, "http" : HTTP_PROXY}
+
+# Instantiate the elastic client and its request class
+es = ES_request(
+    es_url=URL,
+    connection_class= Connection,
+    proxies = proxies,
+    size = SIZE
+)
+
+
+for line in sys.stdin :
+    # A blank line (e.g. a trailing newline) is not a record; json.loads would raise on it.
+    if not line.strip():
+        continue
+    data = json.loads(line)
+    record = RecordDeduplicate(data['value'], index = INDEX, es = es)
+    dup = record.deduplicate()
+    data["value"] = dup
+    sys.stdout.write(json.dumps(data))
+    sys.stdout.write('\n')