#! /usr/bin/python3
"""
Query the ISTEX API (ES: lucene q => json doc)
"""
__author__ = "Romain Loth"
__copyright__ = "Copyright 2014-5 INIST-CNRS (ISTEX project)"
__license__ = "LGPL"
__version__ = "0.1"
__email__ = "romain.loth@inist.fr"
__status__ = "Dev"

from getpass import getpass
from http.client import IncompleteRead
from json import loads
from os import path
from sys import stderr
from urllib.error import HTTPError, URLError
from urllib.parse import quote
from urllib.request import (urlopen, HTTPBasicAuthHandler, build_opener,
                            install_opener)

# globals
DEFAULT_API_CONF = {
    'host': 'api.istex.fr',
    'route': 'document',
}

# file types requested by default when downloading a document
DEFAULT_API_TYPES = ('fulltext/pdf', 'metadata/xml')

# every '<category>/<format>' sub-route accepted by write_fulltexts()
_KNOWN_API_TYPES = frozenset((
    'fulltext/pdf',
    'fulltext/tei',
    'fulltext/txt',
    'fulltext/zip',
    'metadata/xml',
    'metadata/mods',
))

# search() switches to paginated requests above this number of hits
_MAX_PAGE_SIZE = 5000


class AuthWarning(Exception):
    """Raised by _bget() when the API answers 401 (credentials needed)."""

    def __init__(self, msg):
        # initialise the base class too so that ``args`` carries the message
        super().__init__(msg)
        self.msg = msg

    def __str__(self):
        return repr(self.msg)


# private functions
# -----------------
def _route_url(api_conf):
    """Return 'https://<host>/<route>' for the given api_conf dict."""
    return 'https://' + api_conf['host'] + '/' + api_conf['route']


def _prompt_credentials():
    """Ask the user for a login and a password, return them as a pair."""
    login = input(' => Nom d\'utilisateur "ia": ')
    passw = getpass(prompt=' => Mot de passe: ')
    return login, passw


def _get(my_url):
    """
    Get remote url *that contains a ~json~* and parse it.

    The body is decoded as UTF-8 and parsed with json.loads.
    Any URLError while opening the url (401, 404, unreachable host...)
    is reported on stderr and terminates the program with status 1.
    """
    try:
        remote_file = urlopen(my_url)
    except URLError as url_e:
        # reports 401 Unauthorized, 404, etc.
        print("api: HTTP ERR (%s) sur '%s'" % (url_e.reason, my_url),
              file=stderr)
        raise SystemExit(1)

    # the context manager closes the response even if the read fails
    with remote_file:
        try:
            response = remote_file.read()
        except IncompleteRead as ir_e:
            # keep whatever was received before the connection was cut
            response = ir_e.partial
            print("WARN: IncompleteRead '%s' but 'partial' content has page"
                  % my_url, file=stderr)

    return loads(response.decode('UTF-8'))


def _bget(my_url, user=None, passw=None):
    """
    Get remote auth-protected url *that contains a ~file~* and pass its
    binary data straight from remote response (for instance when
    retrieving fulltext from ISTEX API).

    Returns the raw bytes, or None when the url could not be fetched for
    any reason other than a 401.
    Raises AuthWarning("need_auth") when the server answers 401.

    NB: installs a process-wide urllib opener carrying the credentials,
        and the password is handled in clear text here.
    """
    auth_handler = HTTPBasicAuthHandler()
    auth_handler.add_password(
        realm='Authentification sur api.istex.fr',
        uri='https://api.istex.fr',
        user=user,
        passwd=passw)
    install_opener(build_opener(auth_handler))

    print("GET bin (user:%s)" % user, file=stderr)

    # contact
    try:
        remote_file = urlopen(my_url)
    except HTTPError as http_e:
        # HTTPError must be caught before URLError (it is a subclass);
        # only HTTPError carries a status code
        if http_e.getcode() == 401:
            raise AuthWarning("need_auth") from http_e
        # e.g. 404: handled *without exiting* so that bulk fulltext
        # downloads can carry on with the next document
        print("api: HTTP ERR no %i (%s) sur '%s'"
              % (http_e.getcode(), http_e.msg, my_url), file=stderr)
        return None
    except URLError as url_e:
        # non-HTTP failure (DNS, refused connection...): no status code
        print("api: HTTP ERR (%s) sur '%s'" % (url_e.reason, my_url),
              file=stderr)
        return None

    # read the payload
    with remote_file:
        return remote_file.read()


# public functions
# ----------------
# TODO: spill to a temporary file on disk when the hit list is large and
#       has many fields
def search(q, api_conf=DEFAULT_API_CONF, limit=None,
           outfields=('title', 'host.issn', 'fulltext')):
    """
    Query the API and get a (perhaps long) "hits" array of json metadata.

    args:
    -----
       q -- a lucene query
            ex: "quantum cat AND publicationDate:[1970 TO *]"

    optional kwargs:
    - - - - - - - -
       outfields -- fieldNames list for the api to return for each hit
       limit     -- max returned hits threshold (= int)
       api_conf  -- an inherited http config dict with these 2 keys:
                     * api_conf['host']   <- default: "api.istex.fr"
                     * api_conf['route']  <- default: "document"

    Returns the list found under the 'hits' key of the API answer(s),
    e.g.:
       [ { 'id': '21B88F4EFBA46DC85E863709CA9824DEED7B7BFC',
           'title': 'Recovering information borne by quanta that '
                    'crossed the black hole event horizon'},
         { 'id': 'C095E6F0A43EBE3E98E2E6E17DD8775617636034',
           'title': 'Holographic insights and puzzles'} ]
    """
    # nothing to fetch: skip the network round-trips altogether
    if limit is not None and limit <= 0:
        return []

    # build the URL
    base_url = (_route_url(api_conf) + '/' + '?' + 'q=' + quote(q)
                + '&output=' + ",".join(outfields))

    # count on the *same* endpoint as the one that will be queried
    n_docs = count(q, api_conf)

    # optional cap; never ask for more hits than actually exist
    if limit is not None:
        n_docs = min(n_docs, limit)

    if n_docs <= 0:
        return []

    # 2 cases: a single request or several paginated ones
    if n_docs <= _MAX_PAGE_SIZE:
        json_values = _get(base_url + '&size=%i' % n_docs)
        return json_values['hits']

    all_hits = []
    print("Collecting result hits... ", file=stderr)
    for k in range(0, n_docs, _MAX_PAGE_SIZE):
        print("%i..." % k, file=stderr)
        my_url = base_url + '&size=%i' % _MAX_PAGE_SIZE + "&from=%i" % k
        all_hits += _get(my_url)['hits']
        # TODO store on disk if > RAM/5

    # the last page may overshoot (e.g. limit 7500 but 10000 fetched)
    return all_hits[0:n_docs]


def count(q, api_conf=DEFAULT_API_CONF):
    """
    Get total hits for a lucene query on ISTEX api.

    Returns the 'total' value of the API answer, as an int.
    """
    count_url = (_route_url(api_conf) + '/' + '?' + 'q=' + quote(q)
                 + '&size=1')
    json_values = _get(count_url)
    return int(json_values['total'])


def write_fulltexts(DID, base_name=None, api_conf=DEFAULT_API_CONF,
                    tgt_dir='.', login=None, passw=None,
                    api_types=DEFAULT_API_TYPES):
    """
    Get XML metas, TEI, PDF, ZIP fulltexts etc. for a given ISTEX-API
    document and write each one to <tgt_dir>/<base_name>.<ext>.

    args:
       DID       -- the document identifier
       base_name -- file name without extension (default: DID)
       api_conf  -- http config dict ('host' and 'route' keys)
       tgt_dir   -- directory where the files are written
       login     -- optional user name forwarded to _bget()
       passw     -- optional password forwarded to _bget()
       api_types -- iterable of '<category>/<format>' sub-routes

    Raises KeyError for an unknown api type (before any request is made)
    and lets AuthWarning from _bget() propagate.
    """
    # check every requested type first
    for at in api_types:
        if at not in _KNOWN_API_TYPES:
            raise KeyError("Unknown filetype %s" % at)

    # default name is just the ID and the file extension
    if not base_name:
        base_name = DID

    da_url = _route_url(api_conf) + '/' + DID

    for at in api_types:
        response = _bget(da_url + '/' + at, user=login, passw=passw)

        # _bget returns None when the file could not be fetched
        # (e.g. the rare 404s, like asking for a tei that does not exist)
        if response is None:
            continue

        # extension: right-hand part of the api sub-route
        ext = at.split('/')[1]
        tgt_path = path.join(tgt_dir, base_name + '.' + ext)
        with open(tgt_path, 'wb') as fh:
            fh.write(response)


def write_fulltexts_loop_interact(list_of_ids, list_of_basenames=None,
                                  api_conf=DEFAULT_API_CONF, tgt_dir='.',
                                  api_types=DEFAULT_API_TYPES):
    """
    Calls write_fulltexts() in a loop for an entire list of ids,
    with optional interactive authentification step:
       - the first document is tried without credentials
       - IF _bget raises AuthWarning THEN ask user for login/password
       - a document whose credentials are refused is retried after
         prompting again (interrupt the prompt to give up)

    args:
       list_of_ids       -- document identifiers
       list_of_basenames -- optional file base names, same order as ids
       api_conf, tgt_dir, api_types -- forwarded to write_fulltexts()
    """
    # nothing to do (also avoids an IndexError on the first-doc probe)
    if not list_of_ids:
        return

    # a list keeps the "%s" log output stable whatever sequence type
    # the caller provided (a tuple would break a bare "%s" % value)
    api_types = list(api_types)

    def basename_at(idx):
        return list_of_basenames[idx] if list_of_basenames else None

    my_login = None
    my_passw = None

    # probe on the first document: is authentification needed?
    try:
        write_fulltexts(
            list_of_ids[0],
            basename_at(0),
            api_conf=api_conf,
            tgt_dir=tgt_dir,
            api_types=api_types,
        )
    except AuthWarning:
        print("NB: l'API veut une authentification pour les fulltexts SVP...",
              file=stderr)
        my_login, my_passw = _prompt_credentials()
        # the first document failed: it has to be fetched again
        start = 0
    else:
        print("API:retrieving doc no 1 from %s" % api_types)
        # the first document worked: do not fetch it twice
        start = 1

    for i, did in enumerate(list_of_ids[start:], start=start):
        print("API:retrieving doc no %s from %s" % (str(i + 1), api_types))
        while True:
            try:
                write_fulltexts(
                    did,
                    base_name=basename_at(i),
                    api_conf=api_conf,
                    tgt_dir=tgt_dir,
                    login=my_login,
                    passw=my_passw,
                    api_types=api_types,
                )
            except AuthWarning:
                print("authentification refusée :(")
                my_login, my_passw = _prompt_credentials()
            else:
                break


def terms_facet(facet_name, q="*", api_conf=DEFAULT_API_CONF):
    """
    Get list of possible values/outcomes for a given field, with their
    counts (within the perimeter of the query q).

    output format {"facet_value_1": count_1, ...}
    """
    facet_url = (_route_url(api_conf) + '/' + '?' + 'q=' + quote(q)
                 + '&facet=' + facet_name)

    json_values = _get(facet_url)
    key_counts = json_values['aggregations'][facet_name]['buckets']

    # flatten the structure:
    #   [ {'docCount': 8059500, 'key': 'en'},
    #     {'docCount': 1138473, 'key': 'de'} ]
    # => more compact output:
    #   {'en': 8059500, 'de': 1138473}
    return {record['key']: record['docCount'] for record in key_counts}


########################################################################
if __name__ == '__main__':

    # simple query test
    q = input("test d'interrogation API ISTEX (entrez une requête Lucene):")
    print(search(q, limit=10))

    # PDF retrieval test (with auth prompt if necessary), then writing
    write_fulltexts_loop_interact(
        ['5286C468C888B8857D1F8971080594B788D54013'])