Newer
Older
web-services / co-deduplicate / v1 / duplicate.py
#!/usr/bin/python3

import sys
import json
import os
from deduplicate.deduplicate import RecordDeduplicate
from deduplicate.connect2elastic import ES_request, Connection
import argparse

# Command-line interface of the deduplication script.
# The descriptive sentence is given as ``description``: the first positional
# parameter of ArgumentParser is ``prog``, which would otherwise make the
# whole sentence appear as the program name in the usage line.
parser = argparse.ArgumentParser(
    description="Python script deduplicate conditor record"
)
parser.add_argument("--url", "-u", help="Elasticsearch database URL where search request")
parser.add_argument("--index", help="Elasticsearch index")
parser.add_argument("--https_proxy", default=None, help="https proxy. Default value in None")
parser.add_argument("--http_proxy", default=None, help="http proxy. Default value in None")
# ``type=int`` keeps a user-supplied size the same type as the integer
# default; without it a command-line value would be passed along as a string.
parser.add_argument(
    "--size",
    "-s",
    type=int,
    default=100,
    help="size parameter given to the Elasticsearch request client (default: 100)",
)
parser.add_argument("--input", help="A Corpus to deduplicate at json format")
#parser.add_argument("--output", help="A result corpus from algorithm")

# Parsed once at start-up; the values are read by the module-level settings.
args = parser.parse_args()

# Settings taken from the parsed command line, exposed as module-level names.
URL, INDEX, SIZE = args.url, args.index, args.size
HTTPS_PROXY, HTTP_PROXY = args.https_proxy, args.http_proxy

# Proxy mapping handed to the Elasticsearch client; a proxy that was not
# given on the command line stays None.
proxies = dict(https=HTTPS_PROXY, http=HTTP_PROXY)

# Create the Elasticsearch request helper once; it is passed to every
# RecordDeduplicate built in the processing loop.
es = ES_request(es_url=URL, connection_class=Connection, proxies=proxies, size=SIZE)


# Stream processing: each stdin line is one JSON document. Its "value" entry
# is handed to RecordDeduplicate, replaced by the result of deduplicate(),
# and the updated document is written back as one JSON line on stdout.
for line in sys.stdin:
    # Blank lines (e.g. a trailing empty line at the end of the input) are
    # not JSON documents; skip them instead of aborting the whole stream
    # with a JSONDecodeError.
    if not line.strip():
        continue
    data = json.loads(line)
    record = RecordDeduplicate(data["value"], index=INDEX, es=es)
    dup = record.deduplicate()
    data["value"] = dup
    sys.stdout.write(json.dumps(data))
    sys.stdout.write("\n")