#! /usr/bin/python3
"""
Pooling isolé de sampler.py, renvoie une combinaison d'aggrégations

(c'est-à-dire les décomptes par documents pour toute combinaison de critères)

Exemple de sortie:

{
 "f": {
  "corpusName:bmj AND publicationDate:[* TO 1979]": 368369,
  "corpusName:bmj AND publicationDate:[1980 TO 1999]": 163926,
  "corpusName:bmj AND publicationDate:[2000 TO *]": 172940,
  (...)
  "corpusName:oup AND publicationDate:[* TO 1979]": 734369,
  "corpusName:oup AND publicationDate:[1980 TO 1999]": 414020,
  "corpusName:oup AND publicationDate:[2000 TO *]": 287814,
  "corpusName:springer AND publicationDate:[* TO 1979]": 490971,
  "corpusName:springer AND publicationDate:[1980 TO 1999]": 547932,
  "corpusName:springer AND publicationDate:[2000 TO *]": 9,
  "corpusName:wiley AND publicationDate:[* TO 1979]": 1185039,
  "corpusName:wiley AND publicationDate:[1980 TO 1999]": 1621624,
  "corpusName:wiley AND publicationDate:[2000 TO *]": 1849095
 },
 "nd": 14675130,
 "nr": 14804223,
 "totd": 15968740
}
"""
__author__    = "Romain Loth"
__copyright__ = "Copyright 2014-2015 INIST-CNRS (ISTEX project)"
__license__   = "LGPL"
__version__   = "0.1"
__email__     = "romain.loth@inist.fr"
__status__    = "Dev"

# standard imports
from sys       import argv, stderr
from re        import sub
from itertools import product
from json      import dumps
from argparse  import ArgumentParser


# local imports
try:
	# PATH 1: the folder is used as a library
	#         within a larger package (example: bib-adapt-corpus)
	from libconsulte import api
	from libconsulte import field_value_lists
	# =<< target_language_values, target_scat_values,
	#     target_genre_values, target_date_ranges
except ImportError:
	try:
		# PATH 2: the script is called from the current folder
		#         (example: we just want to run the sampler on its own)
		import api
		import field_value_lists
		
	# last resort: the modules are nowhere to be found
	except ImportError:
		print("""ERR: Les modules 'api.py' et 'field_value_lists.py' doivent être
		 placés à côté du script sampler.py ou dans un dossier du PYTHONPATH, pour sa bonne execution...""", file=stderr)
		exit(1)


def my_parse_args(arglist=None):
	"""Arguments ligne de commande pour main()"""
	
	parser = ArgumentParser(
		description="--- Returns API doc counts for combined facets as json \"pools\" ---",
		usage="\n------\n  field_combo_count.py --crit luceneField1 luceneField2 ...",
		epilog="-- © 2014-2015 :: romain.loth at inist.fr :: Inist-CNRS (ISTEX) --"
		)
	
	parser.add_argument('-c', '--crit',
		dest="criteria_list",
		#~ metavar=('"corpusName"', '"publicationDate"'),
		metavar="",
		help="""API field(s) to count (exemple: corpusName publicationDate) (space-separated)""",
		nargs='+',
		required=True,
		action='store')
	
	parser.add_argument('-v', '--verbose',
		help="verbose switch",
		default=False,
		required=False,
		action='store_true')
	
	args = parser.parse_args(arglist)
	
	# --- checks and pre-propagation --------
	#     are all the requested criteria known?
	flag_ok = True
	for field_name in args.criteria_list:
		if field_name not in field_value_lists.KNOWN_FIELDS:
			flag_ok = False
			print("Unknown field in -c args: '%s'" % field_name, 
			      file=stderr)
	
	if not flag_ok:
		exit(1)
	# ----------------------------------------
	
	return args
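
# Quick usage sketch (assumes 'corpusName' and 'publicationDate' are present
# in field_value_lists.KNOWN_FIELDS):
#   args = my_parse_args(['-c', 'corpusName', 'publicationDate'])
#   args.criteria_list   # => ['corpusName', 'publicationDate']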

def facet_vals(field_name):
	"""
	For each field, returns the list of possible outcomes
	
	ex: > facet_vals('corpusName')
	    > ['elsevier','wiley', 'nature', 'sage', ...]
	"""
	
	if field_name in field_value_lists.TERMFACET_FIELDS_auto:
		# keep the part after the dot if the field is a "sub.type"
		facet_name = sub(r'^[^.]+\.', '', field_name)
		return api.terms_facet(facet_name).keys()
	
	elif field_name in field_value_lists.TERMFACET_FIELDS_local:
		# we keep 3 ad hoc lists of values locally
		if field_name == 'language':
			return field_value_lists.LANG
		elif field_name == 'genre':
			return field_value_lists.GENRE
		elif field_name == 'categories.wos':
			return field_value_lists.SCAT
		else:
			raise NotImplementedError()
	
	elif field_name in field_value_lists.RANGEFACET_FIELDS:
		applicable_bins = {}
		
		# retrieve the relevant interval lists
		# TODO: use a lookup table instead
		if field_name == 'publicationDate' or field_name == 'copyrightDate':
			applicable_bins = field_value_lists.DATE
		elif field_name == 'qualityIndicators.pdfCharCount':
			applicable_bins = field_value_lists.NBC
		luc_ranges = []
		
		# convert each (min, max) pair into the Lucene syntax "[min TO max]"
		for interval in applicable_bins:
			a = str(interval[0])
			b = str(interval[1])
			luc_ranges.append('[' + a + ' TO ' + b + ']')
		return luc_ranges
	
	else:
		print ("ERROR: ?? the API doesn't allow a facet query on field '%s' (and I don't have a field_value_lists for this field either :-/ )" % field_name, file=stderr)
		exit(1)
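
# Range example (a sketch, assuming field_value_lists.DATE contains the
# pairs ('*', 1959), (1960, 1999) and (2000, '*')):
#   facet_vals('publicationDate')
#   # => ['[* TO 1959]', '[1960 TO 1999]', '[2000 TO *]']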


def pooling(crit_fields, verbose=False):
	"""
	A sort of cross-tabulation: counts the number of docs for each combination of criteria
	
	Returns a dict with those counts and the overall totals
	
	Example for the criteria corpusName and pdfCharCount
	{
	  "f": {
		"corpusName:bmj AND pdfCharCount:[* TO 1999]": 24524,
		"corpusName:bmj AND pdfCharCount:[2000 TO *]": 662848,
		"corpusName:brill-journals AND pdfCharCount:[* TO 1999]": 10949,
		"corpusName:brill-journals AND pdfCharCount:[2000 TO *]": 119318,
		"corpusName:elsevier AND pdfCharCount:[* TO 1999]": 275461,
		"corpusName:elsevier AND pdfCharCount:[2000 TO *]": 5740132,
		"corpusName:nature AND pdfCharCount:[* TO 1999]": 332156,
		"corpusName:nature AND pdfCharCount:[2000 TO *]": 45139,
		"corpusName:oup AND pdfCharCount:[* TO 1999]": 58662,
		"corpusName:oup AND pdfCharCount:[2000 TO *]": 1385591,
		"corpusName:springer AND pdfCharCount:[* TO 1999]": 61973,
		"corpusName:springer AND pdfCharCount:[2000 TO *]": 2242902,
		"corpusName:wiley AND pdfCharCount:[* TO 1999]": 593998,
		"corpusName:wiley AND pdfCharCount:[2000 TO *]": 4044204
	  },
	  "totd": 15982692         # nombre de docs au total dans la base
	  "nd": 15597857,          # nombre de docs pour l'ensemble des critères
	  "nr": 15597857,          # nombre de réponses pour l'ensemble des critère 
	                           # (intéressant pour les champs "QCM")
	}

	NB: the interval choices and the facet values can be configured
	    in field_value_lists.py
	"""
	####### POOLING ########
	#
	N_reponses = 0
	N_workdocs = 0
	doc_grand_total = 0
	# dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
	abs_freqs = {}
	
	# ---------------------------------------------------------------
	# (1) PARTITIONING THE SEARCH SPACE IN POSSIBLE OUTCOMES --------
	print("Sending count queries for criteria pools...",file=stderr)
	## build all "field:values" pairs per criterion field
	## (list of list of strings: future lucene query chunks)
	all_possibilities = []
	
	# a small tribute to our colleague Nourdine Combo!
	n_combos = 1
	
	for my_criterion in crit_fields:
		# print("CRIT",my_criterion)
		field_outcomes = facet_vals(my_criterion)
		# print("field_outcomes",field_outcomes)
		n_combos = n_combos * len(field_outcomes)
		# lucene query chunks
		all_possibilities.append(
			[my_criterion + ':' + val for val in field_outcomes]
		)
	
	# e.g. 2 criteria will yield 2 lists in all_possibilities
	# [
	#  ['qualityIndicators.refBibsNative:T', 'qualityIndicators.refBibsNative:F'], 
	#
	#  ['corpusName:brill', 'corpusName:bmj', 'corpusName:wiley', 'corpusName:elsevier',
	#   'corpusName:ecco', 'corpusName:eebo', 'corpusName:springer', 'corpusName:nature', 
	#   'corpusName:oup', 'corpusName:journals']
	# ]
	
	## list combos (cartesian product of field_outcomes)
	# we unpack it directly as *args into itertools.product()
	# (=> we get an iterator over tuples of combinable query chunks)
	combinations = product(*all_possibilities)
	
	
	# example for -c corpusName, publicationDate
	#	[
	#	('corpusName:ecco', 'publicationDate:[* TO 1959]'),
	#	('corpusName:ecco', 'publicationDate:[1960 TO 1999]'),
	#	('corpusName:ecco', 'publicationDate:[2000 TO *]'),
	#	('corpusName:elsevier', 'publicationDate:[* TO 1959]'),
	#	('corpusName:elsevier', 'publicationDate:[1960 TO 1999]'),
	#	('corpusName:elsevier', 'publicationDate:[2000 TO *]'),
	#	(...)
	#	]
	
	# ---------------------------------------------------------------
	# (2) getting total counts for each combination of criteria -----
	
	# number of counted answers
	#  (1 doc can give several hits if a criterion was multivalued)
	N_reponses = 0
	
	# do the counting for each combo
	for i, combi in enumerate(sorted(combinations)):
		if i % 100 == 0:
			print("pool %i/%i" % (i,n_combos), file=stderr)
		
		query = " AND ".join(combi)
		
		# counting requests ++++
		freq = api.count(query)
		
		# print(freq)
		
		if verbose:
			print("pool:'% -30s': % 8i" %(query,freq),file=stderr)
		
		# storing and aggregation
		N_reponses += freq
		abs_freqs[query] = freq
	
	# number of documents sending answers (hence normalizing constant N)
	N_workdocs = api.count(" AND ".join([k+":*" for k in crit_fields]))
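	# (e.g. for --crit corpusName publicationDate the normalization query
	#  would be "corpusName:* AND publicationDate:*")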
	
	# grand total of docs in the base (needed for the "totd" output key,
	# so it is computed even without --verbose)
	# for comparison: all_docs = N + api.count(q="NOT(criterion:*)")
	doc_grand_total = api.count(q='*')
	
	if verbose:
		print("--------- pool totals -----------", file=stderr)
		print("#answered hits :   % 12s" % N_reponses, file=stderr)
		print("#workdocs (N) :    % 12s" % N_workdocs, file=stderr)
		print("#all API docs fyi: % 12s" % doc_grand_total, file=stderr)
		print("---------------------------------", file=stderr)
	
	# resulting pool info in f + various totals
	return {"f":abs_freqs, "nr":N_reponses, "nd":N_workdocs, "totd":doc_grand_total}

if __name__ == "__main__":
	# CLI arguments
	args = my_parse_args(argv[1:])
	
	# run
	result = pooling(args.criteria_list, verbose=args.verbose)
	
	# output: valid and "pretty" JSON
	print(dumps(result, indent=2, sort_keys=True))