#! /usr/bin/python3
"""
Prepare a metadata table from a list of ISTEX document identifiers.

Reads IDs (one per line), queries https://api.istex.fr/document/<id> for
each valid ID and prints one tab-separated row of selected metadata per
document (header row first).
"""
__author__    = "Romain Loth"
__copyright__ = "Copyright 2015 INIST-CNRS (ISTEX project)"
__license__   = "LGPL"
__version__   = "0.1"
__email__     = "romain.loth@inist.fr"
__status__    = "Dev"

# standard imports
from sys import argv, stderr
from re import match, sub          # FIX: 'sub' was used by safe_str but never imported
from json import loads
from argparse import ArgumentParser
from urllib.request import urlopen
from urllib.error import URLError
from http.client import IncompleteRead   # FIX: code referenced py2-style 'httplib'

# ----------------------------------------------------------------------
# CONFIG: candidate field map (API name -> local column name), kept as a
# reference for the £TODO notes below (cf. commented STD_MAP in history):
#   id -> istex_id, doi -> doi, corpusName -> istex_lot (3 first letters),
#   publicationDate -> pub_year, author.name -> authors_, genre -> genres_,
#   title -> title, language -> lang, categories.wos -> cats_,
#   serie.issn / host.issn -> in_issn,
#   qualityIndicators.{pdfVersion,pdfWordCount,refBibsNative}
# ----------------------------------------------------------------------


# private function
# ----------------
def _get(my_url):
    """Fetch a remote URL *that contains JSON* and return the parsed object.

    On HTTP errors the URLError is logged to stderr then re-raised.
    On IncompleteRead, the partial payload is used (best effort).
    """
    try:
        remote_file = urlopen(my_url)
    except URLError as url_e:
        # signals 401 Unauthorized, 404, etc.
        print("api: HTTP ERR (%s) sur '%s'" % (url_e.reason, my_url), file=stderr)
        # More info available: server, Content-Type, WWW-Authenticate...
        # print("ERR.info(): \n %s" % url_e.info(), file=stderr)
        raise
    try:
        response = remote_file.read()
    except IncompleteRead as ir_e:
        # keep whatever part of the page did arrive
        response = ir_e.partial
        print("WARN: IncompleteRead '%s' but 'partial' content has page" % my_url,
              file=stderr)
    remote_file.close()
    return loads(response.decode('UTF-8'))


# £TODO move to a separate lib
def safe_str(a_string=""):
    """Collapse every run of chars outside [A-Za-z0-9 + common accents] to '_'."""
    return sub("[^A-Za-z0-9àäçéèïîøöôüùαβγ]+", "_", a_string)


if __name__ == "__main__":
    parser = ArgumentParser(
        description="IDS => interro API => table des métadonnées",
        usage="ids_to_table.py -l liste_ids.txt",
        epilog="- © 2015 Inist-CNRS (ISTEX) romain.loth at inist.fr -"
        )
    parser.add_argument(
        '-l', '--list_in',
        metavar='ID_list.txt',
        help="an alternative input: a list of IDs of the pdfs to be retrieved from api.istex.fr and processed",
        type=str,
        required=False,
        action='store')
    args = parser.parse_args(argv[1:])

    # 'with' guarantees the handle is closed even on read errors
    with open(args.list_in) as filehandle:
        ids_ok = [line.rstrip() for line in filehandle]

    # header line — £TODO derive from STD_MAP
    # FIX: 'pub_period' removed: no matching value is ever printed in the
    # data rows, so the extra header cell left every column misaligned.
    print("\t".join(['istex_id', 'corpus', 'pub_year',
                     'pdfver', 'pdfwc', 'bibnat',
                     'author_1', 'lang', 'doctype_1',
                     'cat_sci', 'title']))

    # one row per valid input line
    for i, the_id in enumerate(ids_ok):
        # ISTEX ids are exactly 40 hex chars ('$' anchors the end too)
        if not match(r'[0-9A-Fa-f]{40}$', the_id):
            print("(skip ligne %i) L'identifiant '%s' n'est pas au format istex"
                  % (i + 1, the_id), file=stderr)
            continue

        # API request for this document
        hit = _get('https://api.istex.fr/document/' + the_id)

        # A) intermediate index: filtered values with fallbacks
        # -----------------------------------------------------
        # £TODO: check conventions for null values
        # £TODO: fold all of this into a 'for key in STD_MAP' loop (cf. sampler l 433)
        index_temp = {'co': hit['corpusName'][0:3]}

        if hit.get('publicationDate'):
            index_temp['yr'] = hit['publicationDate'][0:4]
        else:
            index_temp['yr'] = 'XXXX'

        if hit.get('title'):
            index_temp['ti'] = hit['title']
        else:
            index_temp['ti'] = "UNTITLED"

        # guard on non-empty author list too (original crashed on 'author': [])
        if hit.get('author') and hit['author'][0].get('name'):
            # keep only the last token of the first author's name
            index_temp['au'] = hit['author'][0]['name'].split()[-1]
        else:
            index_temp['au'] = "UNKNOWN"

        if hit.get('language'):
            # NB: 'UNKOWN_*' typos preserved — downstream consumers may match them
            index_temp['lg'] = hit['language'][0]
        else:
            index_temp['lg'] = "UNKOWN_LANG"

        if hit.get('genre'):
            index_temp['typ'] = hit['genre'][0]
        else:
            index_temp['typ'] = "UNKOWN_GENRE"

        if hit.get('categories', {}).get('wos'):
            index_temp['cat'] = "/".join(hit['categories']['wos'])
        else:
            index_temp['cat'] = "UNKOWN_SCI_CAT"

        # quality indicators: flat .get() replaces the nested if/else ladder
        quality = hit.get('qualityIndicators', {})
        index_temp['ver']    = quality.get('pdfVersion',    "UNKNOWN_PDFVER")
        index_temp['wcp']    = quality.get('pdfWordCount',  "UNKNOWN_PDFWORDCOUNT")
        index_temp['bibnat'] = quality.get('refBibsNative', "UNKNOWN_REFBIBSNATIVE")

        # B) print as tab-separated columns (cf. sampler l 712)
        # -----------------------------------------------------
        print("\t".join([the_id,
                         index_temp['co'],
                         index_temp['yr'],
                         # period,  £TODO
                         index_temp['ver'],
                         str(index_temp['wcp']),
                         str(index_temp['bibnat']),
                         index_temp['au'],
                         index_temp['lg'],
                         index_temp['typ'],
                         index_temp['cat'],
                         index_temp['ti'],
                         # index_temp['_q'],
                         ]))