# Newer
# Older
# web-services / co-deduplicate / v1 / corhal.py
# @Nicolas Thouvenin Nicolas Thouvenin on 10 Dec 2021 1 KB add section for dev
#!/usr/bin/python3

import sys
import json
from deduplicate.corhal_dduplicate import CorhalRecordDeduplicate
from deduplicate.connect2elastic import ES_request, Connection
import argparse
import warnings

# Silence every warning category for the whole process.
# NOTE(review): presumably aimed at the TLS warnings raised by the unverified
# Elasticsearch connection this script opens — confirm before narrowing.
warnings.filterwarnings("ignore")

# Command-line interface of the script.
# The text is passed as `description`: the first positional parameter of
# ArgumentParser is `prog`, so passing it positionally replaced the program
# name in the usage line instead of describing the script.
parser = argparse.ArgumentParser(
    description="Python script deduplicate conditor record"
)
parser.add_argument(
    "--url", "-u",
    help="Elasticsearch database URL where search request",
)
parser.add_argument("--index", help="Elasticsearch index")
parser.add_argument(
    "--login", "-l",
    default=None,
    help="Login to connect to elasticsearch",
)
# Help text fixed: it used to be a copy of the --login description.
parser.add_argument(
    "--password", "-pw",
    default=None,
    help="Password to connect to elasticsearch",
)
parser.add_argument(
    "--https_proxy",
    default=None,
    help="https proxy. Default value in None",
)
parser.add_argument(
    "--http_proxy",
    default=None,
    help="http proxy. Default value in None",
)
# type=int keeps the value an integer whether it comes from the default or
# from the command line (it used to be the str "100" when given explicitly),
# and the help text no longer claims this option is an output file.
parser.add_argument(
    "--size", "-s",
    type=int,
    default=100,
    help="Size setting given to the Elasticsearch request client "
         "(integer). Default value is 100",
)
# NOTE(review): --input is accepted but never read in the visible part of the
# script; records are consumed from standard input instead.
parser.add_argument("--input", help="A Corpus to deduplicate at json format")
#parser.add_argument("--output", help="A result corpus from algorithm")
args = parser.parse_args()

# Module-level aliases for the parsed command-line options.
URL, INDEX = args.url, args.index
LOGIN, PASSWORD = args.login, args.password
SIZE = args.size
HTTPS_PROXY, HTTP_PROXY = args.https_proxy, args.http_proxy

# Proxy mapping keyed by scheme; an entry is None when the option was omitted.
proxies = dict(https=HTTPS_PROXY, http=HTTP_PROXY)

# Build the Elasticsearch request helper shared by every record.
# NOTE(review): credentials are always passed as a tuple, so it is
# (None, None) when no login/password is supplied, and certificate
# verification is disabled — confirm both are intended for ES_request.
es = ES_request(
    es_url=URL,
    connection_class=Connection,
    proxies=proxies,
    size=SIZE,
    http_auth=(LOGIN, PASSWORD),
    use_ssl=True,
    verify_certs=False,
)


# Stream processing: one JSON document per stdin line in, one JSON document
# per stdout line out.
for line in sys.stdin:
    # A blank line (e.g. a trailing newline at the end of the stream) is not
    # a JSON document; skip it instead of letting json.loads abort the run.
    if not line.strip():
        continue
    data = json.loads(line)
    # Only the "value" field is handed to the deduplicator; every other key
    # of the incoming object is passed through untouched.
    record = CorhalRecordDeduplicate(data['value'], index=INDEX, es=es)
    # Overwrite the payload with whatever deduplicate() returns.
    data["value"] = record.deduplicate()
    sys.stdout.write(json.dumps(data))
    sys.stdout.write('\n')