def write_fulltexts_loop_interact(list_of_ids,
                                  list_of_basenames=None,
                                  api_conf=DEFAULT_API_CONF,
                                  tgt_dir='.',
                                  api_types=['fulltext/pdf', 'metadata/xml']):
    """
    Call write_fulltexts() for every document of a list, with an optional
    interactive authentication step.

    The first document is first requested without credentials. If that
    raises AuthWarning, the user is asked for a login and a password and
    the whole list (first document included) is retrieved with them;
    otherwise the remaining documents are retrieved anonymously.

    Parameters:
      - list_of_ids:       sequence of document ids (an empty one is a no-op)
      - list_of_basenames: optional sequence "parallel" to list_of_ids giving
                           the base_name passed to write_fulltexts for each
                           document (None => base_name=None for all of them)
      - api_conf:          forwarded to write_fulltexts
      - tgt_dir:           forwarded to write_fulltexts
      - api_types:         forwarded to write_fulltexts (never mutated here)

    Returns None.

    Raises whatever write_fulltexts raises, except AuthWarning during the
    probe and during the authenticated loop, which triggers a prompt.
    """
    # function-scope imports: keeps this block usable on its own
    from getpass import getpass
    from sys import stderr

    # nothing to retrieve: avoid an IndexError on list_of_ids[0]
    if not list_of_ids:
        return

    # how many times credentials are tried for one document before we
    # give up on it (prevents prompting forever on a doc we can't access)
    max_attempts = 3

    def _basename_at(idx):
        """Base name for document number idx, or None if none were given."""
        return list_of_basenames[idx] if list_of_basenames else None

    def _ask_credentials():
        """Prompt the user for a (login, password) pair."""
        login = input(' => Nom d\'utilisateur "ia": ')
        passw = getpass(prompt=' => Mot de passe: ')
        return login, passw

    # probe on the first document: is authentication necessary?
    try:
        write_fulltexts(
            list_of_ids[0],
            base_name=_basename_at(0),
            api_conf=api_conf,
            tgt_dir=tgt_dir,
            api_types=api_types
        )
    except AuthWarning:
        print("NB: l'API veut une authentification pour les fulltexts SVP...",
              file=stderr)
        need_auth = True
    else:
        # (api_types,) so that a tuple argument is formatted, not unpacked
        print("API:retrieving doc no 1 from %s" % (api_types,))
        need_auth = False

    if not need_auth:
        for i, did in enumerate(list_of_ids):
            # the first one is not redone: the probe already wrote it
            if i == 0:
                continue
            print("API:retrieving doc no %s from %s" % (str(i+1), api_types))
            write_fulltexts(
                did,
                base_name=_basename_at(i),
                api_conf=api_conf,
                tgt_dir=tgt_dir,
                api_types=api_types
            )
        return

    # authenticated retrieval (restarts at doc 0 because the probe failed)
    my_login, my_passw = _ask_credentials()
    for i, did in enumerate(list_of_ids):
        print("API:retrieving doc no %s from %s" % (str(i+1), api_types))
        for _attempt in range(max_attempts):
            try:
                write_fulltexts(
                    did,
                    base_name=_basename_at(i),
                    api_conf=api_conf,
                    tgt_dir=tgt_dir,
                    login=my_login,
                    passw=my_passw,
                    api_types=api_types
                )
            except AuthWarning:
                print("authentification refusée :(")
                # ask again, then retry the SAME document instead of
                # silently moving on to the next one
                my_login, my_passw = _ask_credentials()
            else:
                break
        else:
            # every attempt was refused: report the gap and go on
            print("API:skipping doc no %s (authentication refused %i times)"
                  % (str(i+1), max_attempts), file=stderr)
diff --git a/pool_cache/corpusName.pool.json b/pool_cache/corpusName.pool.json index c33ef20..98f9483 100644 --- a/pool_cache/corpusName.pool.json +++ b/pool_cache/corpusName.pool.json @@ -1 +1 @@ -{"f": {"corpusName:elsevier": 6002882, "corpusName:bmj": 722426, "corpusName:nature": 376199, "corpusName:springer": 2304877, "corpusName:wiley": 4655758, "corpusName:oup": 1444253, "corpusName:eebo": 124410, "corpusName:ecco": 207613}, "nd": 15838418, "nr": 15838418, "totd": 0} \ No newline at end of file +{"totd": 0, "nd": 15838418, "nr": 15838418, "f": {"corpusName:nature": 376199, "corpusName:elsevier": 6002882, "corpusName:oup": 1444253, "corpusName:ecco": 207613, "corpusName:bmj": 722426, "corpusName:wiley": 4655758, "corpusName:springer": 2304877, "corpusName:eebo": 124410}} \ No newline at end of file diff --git a/sampler.py b/sampler.py index e7587b9..f36e0f7 100755 --- a/sampler.py +++ b/sampler.py @@ -32,7 +32,6 @@ # imports standard from sys import argv, stderr -from getpass import getpass from re import sub, search, escape from random import shuffle from itertools import product @@ -44,8 +43,8 @@ # imports locaux try: - import api - import field_value_lists + from libconsulte import api + from libconsulte import field_value_lists # =<< target_language_values, target_scat_values, # target_genre_values, target_date_ranges except ImportError: @@ -84,7 +83,6 @@ ] # binned listing via date ranges (also in field_value_lists.py) -# £todo ajouter ici pdfCharCount RANGEFACET_FIELDS = [ 'publicationDate', 'copyrightDate' @@ -330,8 +328,6 @@ N_reponses = pool_info['nr'] N_workdocs = pool_info['nd'] doc_grand_total = pool_info['totd'] - cache.close() - print('...ok cache (%i workdocs)' % N_workdocs,file=stderr) else: print('...no cache found',file=stderr) @@ -377,7 +373,7 @@ # do the counting for each combo for i, combi in enumerate(sorted(combinations)): if i % 100 == 0: - print("pool %i/%i" % (i,n_combos), file=stderr) + print("pool %i/%i" % (i,n_combos)) query = " 
AND ".join(combi) @@ -605,7 +601,7 @@ # do we need to change smoothing ? if args.smoothing_init and float(args.smoothing_init) > 0: - print("Setting initial smoothing to %.2f" % args.smoothing_init, file=stderr) + print("Setting initial smoothing to %.2f" % args.smoothing_init) # global var change in main LISSAGE = args.smoothing_init @@ -751,58 +747,17 @@ my_dir = path.join(getcwd(),my_name) mkdir(my_dir) + # two "parallel" lists ids = list(got_ids_idx.keys()) + basenames = [std_filename(one_id, got_ids_idx[one_id]) for one_id in ids] - # test sur le premier fichier: authentification nécessaire ? - need_auth = False - try: - bname = std_filename(ids[0], got_ids_idx[ids[0]]) - api.write_fulltexts(ids[0], tgt_dir=my_dir, - base_name=bname, - api_types=['metadata/xml', - 'fulltext/pdf'] - ) - print("retrieving PDF and XML-N for doc no 1") - except api.AuthWarning as e: - print("NB: le système veut une authentification SVP...", - file=stderr) - need_auth = True - - # récupération avec ou sans authentification - if need_auth: - my_login = input(' => Nom d\'utilisateur "ia": ') - my_passw = getpass(prompt=' => Mot de passe: ') - for i, did in enumerate(ids): - my_bname = std_filename(did, got_ids_idx[did]) - # got_ids_idx[did].to_filename() <-- todo from STD_MAP - print("retrieving PDF and XML-N for doc no " + str(i+1)) - try: - api.write_fulltexts(did, - tgt_dir=my_dir, - login=my_login, - passw=my_passw, - base_name = my_bname, - api_types=['metadata/xml', - 'fulltext/pdf'] - ) - except api.AuthWarning as e: - print("authentification refusée :(") - my_login = input(' => Nom d\'utilisateur "ia": ') - my_passw = getpass(prompt=' => Mot de passe: ') - - else: - for i, did in enumerate(ids): - # on ne refait pas le 1er car il a marché - if i == 0: - continue - my_bname = std_filename(did, got_ids_idx[did]) - print("retrieving PDF and XML-N for doc no " + str(i+1)) - api.write_fulltexts(did, - tgt_dir=my_dir, - base_name=my_bname, - api_types=['metadata/xml', - 
'fulltext/pdf'] - ) + # loop with interactive authentification prompt if needed + api.write_fulltexts_loop_interact( + ids, basenames, + tgt_dir=my_dir, + api_types=['metadata/xml', + 'fulltext/pdf'] + ) LOG.append("SAVE: saved docs in %s/" % my_dir)