#!/usr/bin/python3
import sys
import json
import os
from deduplicate.deduplicate import RecordDeduplicate
from deduplicate.connect2elastic import ES_request, Connection
import argparse
# Command-line interface of the script.
# The sentence is passed as ``description``: the first positional parameter of
# ArgumentParser is ``prog``, which would otherwise put the whole sentence in
# the usage line in place of the program name.
parser = argparse.ArgumentParser(
    description="Python script deduplicate conditor record"
)
parser.add_argument("--url", "-u", help="Elasticsearch database URL where search request")
parser.add_argument("--index", help="Elasticsearch index")
parser.add_argument("--https_proxy", default=None, help="https proxy. Default value in None")
parser.add_argument("--http_proxy", default=None, help="http proxy. Default value in None")
parser.add_argument(
    "--size",
    "-s",
    # Without an explicit type a value given on the command line would stay a
    # str while the default is an int.
    type=int,
    default=100,
    help="size setting handed to the Elasticsearch request client. Default value is 100",
)
parser.add_argument("--input", help="A Corpus to deduplicate at json format")
# NOTE: an --output option was sketched here but is not enabled.
#parser.add_argument("--output", help="A result corpus from algorithm")
args = parser.parse_args()
# Expose the parsed command-line settings as module-level constants.
URL, INDEX, SIZE = args.url, args.index, args.size
HTTPS_PROXY, HTTP_PROXY = args.https_proxy, args.http_proxy

# Proxy mapping keyed by scheme; a value is None when the option was not given.
proxies = dict(https=HTTPS_PROXY, http=HTTP_PROXY)

# Instantiate the Elasticsearch request helper with the connection class,
# proxies and size configured above.
es = ES_request(es_url=URL, connection_class=Connection, proxies=proxies, size=SIZE)
# Stream processing: read one JSON document per line from stdin and write one
# JSON document per line to stdout.
for line in sys.stdin:
    # A blank line (e.g. a trailing newline in piped input) is not a JSON
    # document; skip it instead of letting json.loads abort the whole stream.
    if not line.strip():
        continue
    data = json.loads(line)
    # Only the "value" member is handed to the deduplication; every other key
    # of the incoming object is written back unchanged.
    record = RecordDeduplicate(data["value"], index=INDEX, es=es)
    data["value"] = record.deduplicate()
    sys.stdout.write(json.dumps(data) + "\n")