Newer
Older
libconsulte / api.py
#! /usr/bin/python3
"""
Query the ISTEX API (ES: lucene q => json doc)
"""
__author__    = "Romain Loth"
__copyright__ = "Copyright 2014-5 INIST-CNRS (ISTEX project)"
__license__   = "LGPL"
__version__   = "0.1"
__email__     = "romain.loth@inist.fr"
__status__    = "Dev"

from json            import loads
from urllib.parse    import quote
from urllib.request  import urlopen, HTTPBasicAuthHandler, build_opener, install_opener
from urllib.error    import URLError

from os import path
from sys import stderr

# globals
DEFAULT_API_CONF = {
	'host'  : 'api.istex.fr',
	'route' : 'document'
}

class AuthWarning(Exception):
	def __init__(self, msg):
		self.msg = msg
	def __str__(self):
		return repr(self.msg)

# private function
# ----------------
def _get(my_url):
	"""Get remote url *that contains a json* and parse it"""
	
	#~ print("> api._get:%s" % my_url, file=stderr)
	
	try:
		remote_file = urlopen(my_url)
		
	except URLError as url_e:
		# signale 401 Unauthorized ou 404 etc
		print("api: HTTP ERR (%s) sur '%s'" % 
			(url_e.reason, my_url), file=stderr)
		# Plus d'infos: serveur, Content-Type, WWW-Authenticate..
		# print ("ERR.info(): \n %s" % url_e.info(), file=stderr)
		exit(1)
	try:
		response = remote_file.read()
	except httplib.IncompleteRead as ir_e:
		response = ir_e.partial
		print("WARN: IncompleteRead '%s' but 'partial' content has page" 
				% my_url, file=stderr)
	remote_file.close()
	result_str = response.decode('UTF-8')
	json_values = loads(result_str)
	return json_values

def _sget(my_url, mon_user=None, mon_pass=None):
	"""
	Get remote auth-protected url *that contains a file* and pass it
	For instance when retrieving fulltext from ISTEX API
	"""
	
	# print ("REGARD:", mon_user, mon_pass)
	
	auth_handler = HTTPBasicAuthHandler()
	auth_handler.add_password(
			realm  = 'Authentification sur api.istex.fr',
			uri    = 'https://api.istex.fr',
			user   = mon_user,
			passwd = mon_pass)
	install_opener(build_opener(auth_handler))
	
	# contact
	try:
		remote_file = urlopen(my_url)
		
	except URLError as url_e:
		if url_e.getcode() == 401:
			raise AuthWarning("need_auth")
		else:
			# 404 à gérer sans quitter pour les fulltexts
			print("api: HTTP ERR no %i (%s) sur '%s'" % 
				(url_e.getcode(),url_e.msg, my_url), file=stderr)
			print ("ERR.info(): \n %s" % url_e.info(), file=stderr)
			# exit(1)
	
	# lecture
	contents = remote_file.read()
	remote_file.close()
	
	return contents


# public functions
# ----------------
# £TODO: stockage disque sur fichier tempo si liste grande et champx nbx
def search(q, api_conf=DEFAULT_API_CONF, limit=None, outfields=('title','host.issn','fulltext')):
	"""
	Query the API and get a (perhaps long) "hits" array of json metadata.

	args:
	-----
	   q    -- a lucene query
	                  ex: "quantum cat AND publicationDate:[1970 TO *]"

	optional kwargs:
	- - - - - - - - -
	   outfields   -- fieldNames list for the api to return for each hit 
	   limit       -- max returned hits threshold (= int)
	   api_conf    -- an inherited http config dict with these 2 keys:
	                    * api_conf['host']   <- default: "api.istex.fr"
	                    * api_conf['route']  <- default: "document"

	Output format is a parsed json with a total value and a hit list:
	{ 'hits': [ { 'id': '21B88F4EFBA46DC85E863709CA9824DEED7B7BFC',
				  'title': 'Recovering information borne by quanta that '
						   'crossed the black hole event horizon'},
				{ 'id': 'C095E6F0A43EBE3E98E2E6E17DD8775617636034',
				  'title': 'Holographic insights and puzzles'}],
	  'total': 2}
	"""
	
	# préparation requête
	url_encoded_lucene_query = quote(q)
	
	# construction de l'URL
	base_url = 'https:' + '//' + api_conf['host']  + '/' + api_conf['route'] + '/' + '?' + 'q=' + url_encoded_lucene_query + '&output=' + ",".join(outfields)
	
	n_docs = count(q)
	# print('%s documents trouvés' % n_docs)
	
	# limitation éventuelle fournie par le switch --maxi
	if limit is not None:
		n_docs = limit
	
	# la liste des résultats à renvoyer
	all_hits = []
	
	# ensuite 2 cas de figure : 1 requête ou plusieurs
	if n_docs <= 5000:
		# requête simple
		my_url = base_url + '&size=%i' % n_docs
		json_values = _get(my_url)
		all_hits = json_values['hits']
	
	else:
		# requêtes paginées pour les tailles > 5000
		print("Collecting result hits... ", file=stderr)
		for k in range(0, n_docs, 5000):
			print("%i..." % k, file=stderr)
			my_url = base_url + '&size=5000' + "&from=%i" % k
			json_values = _get(my_url)
			all_hits += json_values['hits']
		
		# TODO stocker si > RAM/5
		
		# si on avait une limite par ex 7500 et qu'on est allés jusqu'à 10000
		all_hits = all_hits[0:n_docs]
	
	return(all_hits)


def count(q, api_conf=DEFAULT_API_CONF):
	"""
	Get total hits for a lucene query on ISTEX api.
	"""
	# préparation requête
	url_encoded_lucene_query = quote(q)
	
	# construction de l'URL
	count_url = 'https:' + '//' + api_conf['host']  + '/' + api_conf['route'] + '/' + '?' + 'q=' + url_encoded_lucene_query + '&size=1'
	
	# requête
	json_values = _get(count_url)
	
	
	return int(json_values['total'])
	
	
def write_fulltexts(api_did, api_conf=DEFAULT_API_CONF, tgt_dir='.', login=None, passw=None, base_name=None):
	"""
	Get TEI, PDF and ZIP fulltexts for a given ISTEX document.
	"""
	# préparation requête
	fulltext_url = 'https:' + '//' + api_conf['host']  + '/' + api_conf['route'] + '/' + api_did + '/fulltext/'
	
	# available_filetypes = ['pdf', 'tei', 'zip']
	available_filetypes = ['pdf', 'tei']
	
	# default name is just the ID
	if not base_name:
		base_name = api_did
	
	for filetype in available_filetypes:
		response = _sget(fulltext_url + filetype, 
							mon_user=login, mon_pass=passw)
		tgt_path = path.join(tgt_dir, base_name+'.'+filetype)
		fh = open(tgt_path, 'wb')
		fh.write(response)
		fh.close()
	return None

def terms_facet(facet_name, q="*", api_conf=DEFAULT_API_CONF):
	"""
	Get list of possible values/outcomes for a given field, with their counts (within the perimeter of the query q).
	
	output format {"facet_value_1": count_1, ...}
	"""
	# préparation requête
	url_encoded_lucene_query = quote(q)
	
	# construction de l'URL
	facet_url = 'https:' + '//' + api_conf['host']  + '/' + api_conf['route'] + '/' + '?' + 'q=' + url_encoded_lucene_query + '&facet=' + facet_name
	
	# requête
	json_values = _get(facet_url)
	key_counts = json_values['aggregations'][facet_name]['buckets']
	
	
	# simplification de la structure
	# [
	#  {'docCount': 8059500, 'key': 'en'},
	#  {'docCount': 1138473, 'key': 'de'}
	# ]
	# => sortie + compacte:
	#    {'en': 8059500, 'de': 1138473 }
	simple_dict = {}
	for record in key_counts:
		k = record['key']
		n = record['docCount']
		
		simple_dict[k] = n
	
	return simple_dict


########################################################################
if __name__ == '__main__':
	
	# test de requête simple
	q = input("test d'interrogation API ISTEX (entrez une requête Lucene):")
	
	print(search(q, limit=10))
	
	# test de récupération PDF (avec Auth si nécessaire) puis écriture
	write_fulltexts('5286C468C888B8857D1F8971080594B788D54013')