#!/usr/bin/python3
"""Deduplicate JSON records read from stdin using an Elasticsearch index.

Each non-blank line of standard input is parsed as one JSON object. Its
``value`` entry is handed to ``CorhalRecordDeduplicate`` together with the
index name and the Elasticsearch request helper; the result of
``deduplicate()`` replaces ``value`` and the object is written back to
standard output as a single JSON line.
"""
import argparse
import json
import sys
import warnings

from deduplicate.connect2elastic import ES_request, Connection
from deduplicate.corhal_dduplicate import CorhalRecordDeduplicate

# Silence every warning emitted from here on.
# NOTE(review): presumably meant to hide the TLS warnings triggered by
# verify_certs=False below -- confirm, and narrow the filter if possible.
warnings.filterwarnings("ignore")

# The text is passed as ``description``: the first positional parameter of
# ArgumentParser is ``prog``, which would print it as the program name.
parser = argparse.ArgumentParser(
    description="Python script deduplicate conditor record"
)
parser.add_argument(
    "--url", "-u", help="Elasticsearch database URL where search request"
)
parser.add_argument("--index", help="Elasticsearch index")
parser.add_argument(
    "--login", "-l", default=None, help="Login to connect to elasticsearch"
)
parser.add_argument(
    "--password",
    "-pw",
    default=None,
    help="Password to connect to elasticsearch",
)
parser.add_argument(
    "--https_proxy", default=None, help="https proxy. Default value in None"
)
parser.add_argument(
    "--http_proxy", default=None, help="http proxy. Default value in None"
)
# type=int keeps a user-supplied value the same type as the int default.
parser.add_argument(
    "--size",
    "-s",
    type=int,
    default=100,
    help="Size passed to the Elasticsearch request. Default value is 100",
)
# Accepted for command-line compatibility; the records themselves are read
# from standard input below and this option is not consulted.
parser.add_argument("--input", help="A Corpus to deduplicate at json format")

args = parser.parse_args()

URL = args.url
INDEX = args.index
LOGIN = args.login
PASSWORD = args.password
SIZE = args.size
HTTPS_PROXY = args.https_proxy
HTTP_PROXY = args.http_proxy

proxies = {"https": HTTPS_PROXY, "http": HTTP_PROXY}

# Instantiate the Elasticsearch client and its request class.
# NOTE(review): certificate verification is disabled (verify_certs=False)
# and http_auth is (None, None) when no login/password is given -- confirm
# both are intended for the target cluster.
es = ES_request(
    es_url=URL,
    connection_class=Connection,
    proxies=proxies,
    size=SIZE,
    http_auth=(LOGIN, PASSWORD),
    use_ssl=True,
    verify_certs=False,
)

for line in sys.stdin:
    # A blank line (e.g. a trailing newline) is not a JSON document and
    # would make json.loads raise; skip it instead of aborting the stream.
    if not line.strip():
        continue
    data = json.loads(line)
    record = CorhalRecordDeduplicate(data["value"], index=INDEX, es=es)
    data["value"] = record.deduplicate()
    sys.stdout.write(json.dumps(data) + "\n")