libconsulte/ids_to_table.py at 279a56ec94a0b43c80b58f0a1ac021bd1d9b1370

Fork: 0
istex / libconsulte
Find file
Newer
Older
libconsulte / ids_to_table.py
Romain Loth on 18 Nov 2015 6 KB petit script ids => interro API => table de métas
Raw Blame History
#! /usr/bin/python3
"""
Prépare une table de métadonnées à partir d'une liste d'identifiants
"""
__author__    = "Romain Loth"
__copyright__ = "Copyright 2015 INIST-CNRS (ISTEX project)"
__license__   = "LGPL"
__version__   = "0.1"
__email__     = "romain.loth@inist.fr"
__status__    = "Dev"

# imports standard
from sys       import argv, stderr
from re        import match
from json      import loads
from argparse  import ArgumentParser
from urllib.request  import urlopen
from urllib.error    import URLError


# ----------------------------------------------------------------------
# CONFIG: list of the fields to output as columns
# (key: API name, val: local column name)
# 
#~ STD_MAP = {
	#~ 'id'              : 'istex_id',  # 40 caractères [0-9A-F]
	#~ 'doi'             : 'doi',
	#~ 'corpusName'      : 'istex_lot', # que les trois premières lettres
	#~ 'publicationDate' : 'pub_year',  # le premier match à /(1\d|20)\d\d/
	#~ 'author.name'     : 'authors_',
	#~ 'genre'           : 'genres_',   # sans recodage (cf. istex-data)
	#~ 'title'           : 'title',
	#~ 'language'        : 'lang',      # avec recodage
	#~ 'categories.wos'  : 'cats_',     # à étendre
	#~ 'serie.issn'      : 'in_issn',   # en distri. compl. avec host.issn
	#~ 'host.issn'       : 'in_issn',
	#~ #'volume'          : 'in_vol',   # todo
	#~ #'firstPage'       : 'in_fpg'    # todo
	#~ 'qualityIndicators.pdfVersion' : 'pdfver',
	#~ 'qualityIndicators.pdfWordCount' : 'pdfwc',
	#~ 'qualityIndicators.refBibsNative' : 'bibnat',
#~ }

# private function
# ----------------
def _get(my_url):
	"""
	Fetch a remote URL whose body is JSON and return the parsed value.

	Args:
		my_url: the URL to request; handed unchanged to urlopen().

	Returns:
		Whatever json.loads() builds from the UTF-8 decoded response body.

	Raises:
		URLError: re-raised after a message is printed on stderr when the
			request itself fails (HTTPError, e.g. 401/404, is a subclass).
		ValueError: if the body is not valid UTF-8 or not valid JSON
			(this can happen after a truncated, partial read).
	"""
	# Imported here so the block is self-sufficient: the exception lives in
	# http.client under Python 3 (the former `httplib` name does not exist
	# and would raise NameError as soon as read() failed).
	from http.client import IncompleteRead

	try:
		remote_file = urlopen(my_url)
	except URLError as url_e:
		# reports 401 Unauthorized, 404, etc.
		print("api: HTTP ERR (%s) sur '%s'" %
			(url_e.reason, my_url), file=stderr)
		raise

	try:
		response = remote_file.read()
	except IncompleteRead as ir_e:
		# best effort: keep the bytes received before the connection dropped
		response = ir_e.partial
		print("WARN: IncompleteRead '%s' but 'partial' content has page"
				% my_url, file=stderr)
	finally:
		# always release the connection, even if read() raised something else
		remote_file.close()

	result_str = response.decode('UTF-8')
	return loads(result_str)

# TODO: move this helper into a shared library
def safe_str(a_string=""):
	"""
	Return a_string with every run of "unsafe" characters replaced by '_'.

	A character is kept when it is an ASCII letter or digit, or one of the
	accented / Greek letters listed in the pattern below; each maximal run
	of any other characters collapses into a single underscore.

	Args:
		a_string: the text to sanitize (defaults to the empty string).

	Returns:
		The sanitized string.
	"""
	# Function-scope import: the module only imports `match` from `re`, so
	# the bare `sub` name used here was undefined and raised NameError.
	from re import sub
	return sub("[^A-Za-z0-9àäçéèïîøöôüùαβγ]+", "_", a_string)

# Output columns, in the exact order produced by _doc_to_row().
# The former 'pub_period' header was dropped: no value was ever printed for
# it, so every column after 'pub_year' was shifted by one in the table.
# TODO: derive this from STD_MAP
_COLUMNS = ['istex_id', 'corpus',
            'pub_year',
            'pdfver', 'pdfwc', 'bibnat',
            'author_1', 'lang', 'doctype_1',
            'cat_sci', 'title']

# Characters that would break a tab-separated line are turned into spaces.
_TSV_UNSAFE = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})


def _is_istex_id(candidate):
	"""Tell whether candidate is exactly 40 hexadecimal characters."""
	# \Z anchors the end: match() alone only anchors the start, which let
	# over-long identifiers (or identifiers with trailing junk) through.
	return match(r'[0-9A-Fa-f]{40}\Z', candidate) is not None


def _first_author_lastname(hit):
	"""Return the last whitespace-separated token of the first author's name, or "UNKNOWN"."""
	authors = hit.get('author') or []
	if authors and 'name' in authors[0]:
		name_tokens = (authors[0]['name'] or '').split()
		if name_tokens:
			return name_tokens[-1]
	return "UNKNOWN"


def _doc_to_row(the_id, hit):
	"""
	Flatten one API record into the list of cell strings for one table line.

	Args:
		the_id: the document identifier, printed as first column.
		hit: the parsed JSON record returned by the API for that document.

	Returns:
		A list of str with one entry per name in _COLUMNS (same order).
		Missing or empty fields are replaced by the placeholder strings
		below; their spelling (including "UNKOWN_...") is kept unchanged
		so that existing consumers of the table keep working.
	"""
	# TODO: check conventions for null values
	# TODO: drive all of this from STD_MAP with a `for key in STD_MAP` loop
	corpus = hit.get('corpusName')
	co = corpus[0:3] if corpus else "UNKNOWN_CORPUS"

	pub_date = hit.get('publicationDate')
	yr = pub_date[0:4] if pub_date else 'XXXX'

	ti = hit.get('title') or "UNTITLED"

	au = _first_author_lastname(hit)

	languages = hit.get('language')
	lg = languages[0] if languages else "UNKOWN_LANG"

	genres = hit.get('genre')
	typ = genres[0] if genres else "UNKOWN_GENRE"

	categories = hit.get('categories') or {}
	if 'wos' in categories and categories['wos']:
		cat = "/".join(categories['wos'])
	else:
		cat = "UNKOWN_SCI_CAT"

	# Presence test (not truthiness): 0 or False are legitimate values here.
	quality = hit.get('qualityIndicators') or {}
	ver = quality.get('pdfVersion', "UNKNOWN_PDFVER")
	wcp = quality.get('pdfWordCount', "UNKNOWN_PDFWORDCOUNT")
	bibnat = quality.get('refBibsNative', "UNKNOWN_REFBIBSNATIVE")

	cells = [the_id, co, yr, ver, wcp, bibnat, au, lg, typ, cat, ti]
	# str() because the API may return numbers/booleans; the translation
	# keeps embedded tabs or newlines from corrupting the table layout.
	return [str(cell).translate(_TSV_UNSAFE) for cell in cells]


def main(cli_args=None):
	"""
	Read a list of ISTEX identifiers and print a tab-separated metadata table.

	One header line is printed on stdout, then one line per valid
	identifier, each built from the API record fetched with _get().
	Lines of the input file that are not valid identifiers are reported
	on stderr and skipped. Errors raised by _get() are not caught here.

	Args:
		cli_args: list of command-line arguments (without the program
			name); defaults to sys.argv[1:].
	"""
	parser = ArgumentParser(
			description="IDS => interro API => table des métadonnées",
			usage="ids_to_table.py -l liste_ids.txt",
			epilog="- © 2015 Inist-CNRS (ISTEX) romain.loth at inist.fr -"
			)

	# required: without it the script used to crash on open(None)
	parser.add_argument('-l','--list_in',
		metavar='ID_list.txt',
		help="an alternative input: a list of IDs of the pdfs to be retrieved from api.istex.fr and processed",
		type=str,
		required=True,
		action='store')

	args = parser.parse_args(argv[1:] if cli_args is None else cli_args)

	with open(args.list_in) as filehandle:
		ids_ok = [line.rstrip() for line in filehandle]

	# header line
	print("\t".join(_COLUMNS))

	# one output line per valid input line
	for i, the_id in enumerate(ids_ok):
		if not _is_istex_id(the_id):
			print("(skip ligne %i) L'identifiant '%s' n'est pas au format istex" % (i+1, the_id), file=stderr)
			continue

		# API request for this document
		hit = _get('https://api.istex.fr/document/' + the_id)

		print("\t".join(_doc_to_row(the_id, hit)))


if __name__ == "__main__":
	main()