#! /usr/bin/python3
"""
Pooling isolated out of sampler.py: returns a combination of aggregations
(i.e. the document counts for every combination of criteria).

Example output:
{
  "f": {
	"corpusName:bmj AND publicationDate:[* TO 1979]": 368369,
	"corpusName:bmj AND publicationDate:[1980 TO 1999]": 163926,
	"corpusName:bmj AND publicationDate:[2000 TO *]": 172940,
	(...)
	"corpusName:oup AND publicationDate:[* TO 1979]": 734369,
	"corpusName:oup AND publicationDate:[1980 TO 1999]": 414020,
	"corpusName:oup AND publicationDate:[2000 TO *]": 287814,
	"corpusName:springer AND publicationDate:[* TO 1979]": 490971,
	"corpusName:springer AND publicationDate:[1980 TO 1999]": 547932,
	"corpusName:springer AND publicationDate:[2000 TO *]": 9,
	"corpusName:wiley AND publicationDate:[* TO 1979]": 1185039,
	"corpusName:wiley AND publicationDate:[1980 TO 1999]": 1621624,
	"corpusName:wiley AND publicationDate:[2000 TO *]": 1849095
  },
  "nd": 14675130,
  "nr": 14804223,
  "totd": 15968740
}
"""
__author__ = "Romain Loth"
__copyright__ = "Copyright 2014-5 INIST-CNRS (ISTEX project)"
__license__ = "LGPL"
__version__ = "0.1"
__email__ = "romain.loth@inist.fr"
__status__ = "Dev"

# standard imports
# NOTE: `exit` imported explicitly from sys — the bare builtin `exit`
# is only injected by the `site` module and is not guaranteed to exist
from sys import argv, stderr, exit
from re import sub
from itertools import product
from json import dumps
from argparse import ArgumentParser
from os import path
from configparser import ConfigParser

# local imports
try:
    # PATH 1: the folder is used as a library inside a larger
    # package (example: bib-adapt-corpus)
    from libconsulte import api
    from libconsulte import field_value_lists
    # =<< target_language_values, target_scat_values,
    #     target_genre_values, target_date_ranges
except ImportError:
    try:
        # PATH 2: direct call from the current folder
        # (example: running the sampler on its own)
        import api
        import field_value_lists
    # the case where nothing is available at all
    except ImportError:
        print("""ERR: Les modules 'api.py' et 'field_value_lists.py' doivent
être placés à côté du script sampler.py ou dans un dossier du PYTHONPATH, pour sa bonne execution...""", file=stderr)
        exit(1)

# --------------------------------------------------------------------
# read the local config file (must sit next to this script)
script_dir = path.dirname(path.realpath(__file__))
CONF = ConfigParser()
with open(path.join(script_dir, 'libconsulte.conf')) as f:
    CONF.read_file(f)
# --------------------------------------------------------------------


def my_parse_args(arglist=None):
    """
    Command-line arguments for main().

    Args:
        arglist: list of CLI tokens (e.g. argv[1:]); None lets argparse
                 read sys.argv itself.

    Returns:
        the parsed argparse.Namespace (attributes: criteria_list, verbose)

    Exits with status 1 if any requested field is not a known API field.
    """
    parser = ArgumentParser(
        description="--- Returns API doc counts for combined facets as json \"pools\" ---",
        usage="\n------\n field_combo_count.py --crit luceneField1 luceneField2 ...",
        epilog="-- © 2014-2015 :: romain.loth at inist.fr :: Inist-CNRS (ISTEX) --"
        )

    parser.add_argument('-c', '--crit',
        dest="criteria_list",
        metavar="",
        help="""API field(s) to count (exemple: corpusName publicationDate) (space-separated)""",
        nargs='+',
        required=True,
        action='store')

    parser.add_argument('-v', '--verbose',
        help="verbose switch",
        default=False,
        required=False,
        action='store_true')

    args = parser.parse_args(arglist)

    # --- checks and pre-propagation --------
    # are all criteria known fields?
    flag_ok = True
    for field_name in args.criteria_list:
        if field_name not in field_value_lists.KNOWN_FIELDS:
            flag_ok = False
            print("Unknown field in -c args: '%s'" % field_name, file=stderr)
    if not flag_ok:
        exit(1)
    # ----------------------------------------

    return args


def facet_vals(field_name):
    """
    For each field, returns the list of possible outcomes
    ex: > facet_vals('corpusName')
        > ['elsevier','wiley', 'nature', 'sage', ...]

    Args:
        field_name: an API field name (must be facettable, either as a
                    term facet or as one of the known range-facet fields)

    Returns:
        list of strings ready to be glued into lucene query chunks:
        quoted term values (plus one explicit "other" complement facet),
        or "[min TO max]" range expressions.

    Exits with status 1 for fields that cannot be faceted.
    """
    term_fields = CONF['known_fields']['TERM_FIELDS'].split("\n")

    if field_name in term_fields:
        # drop categories that have fewer than MIN_FACET_POOL docs
        min_docs = int(CONF['divers']['MIN_FACET_POOL'])

        key_list = list(
            api.terms_facet(field_name, min_val=min_docs).keys()
            )
        key_list_terms = ["'%s'" % key for key in key_list]

        # the complement: an explicit "other" facet
        # ex: (NOT(eng OR fre OR deu OR unknown))
        other_facet = '(NOT(' + ' OR '.join(key_list_terms) + '))'
        key_list_terms.append(other_facet)

        return key_list_terms

    elif field_name in field_value_lists.RANGEFACET_FIELDS:
        # pick the relevant list of intervals
        # TODO use a proper lookup table
        if field_name == 'publicationDate' or field_name == 'copyrightDate':
            applicable_bins = field_value_lists.DATE
        elif field_name == 'qualityIndicators.pdfCharCount':
            applicable_bins = field_value_lists.NBC
        else:
            # was a silent empty list before: a range field without a bin
            # table would have zeroed out every downstream combination
            print("ERROR: no interval list in field_value_lists for range field '%s'" % field_name,
                  file=stderr)
            exit(1)

        # convert each (min, max) couple to lucene syntax "[min TO max]"
        luc_ranges = ['[' + str(a) + ' TO ' + str(b) + ']'
                      for a, b in applicable_bins]

        return luc_ranges

    else:
        print("ERROR: ??\nthe API doesn't allow a facet query on field '%s' (and I don't have a field_value_lists for this field either :-/ )" % field_name,
              file=stderr)
        exit(1)


def pooling(crit_fields, verbose=False):
    """
    A sort of cross-tabulation: counts the number of docs for every
    combination of criteria and returns a dict with those counts plus
    the totals.

    Example for the criteria corpusName and pdfCharCount:
    {
      "f": {
        "corpusName:bmj AND pdfCharCount:[* TO 1999]": 24524,
        "corpusName:bmj AND pdfCharCount:[2000 TO *]": 662848,
        (...)
        "corpusName:wiley AND pdfCharCount:[2000 TO *]": 4044204
      },
      "totd": 15982692   # total number of docs in the base
      "nd": 15597857,    # number of docs matching all the criteria fields
      "nr": 15597857,    # number of answers over all the criteria
                         # (interesting for multivalued fields)
    }

    Args:
        crit_fields: list of API field names to combine
        verbose: if True, log each pool count and the totals to stderr

    Returns:
        dict with keys "f" (per-combination counts), "nr" (answer
        count), "nd" (workdoc count), "totd" (grand total of docs)

    NB: the range choices and facet values are configurable in
        field_value_lists.py
    """
    ####### POOLING ########
    # dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
    abs_freqs = {}

    # ---------------------------------------------------------------
    # (1) PARTITIONING THE SEARCH SPACE IN POSSIBLE OUTCOMES --------
    print("Sending count queries for criteria pools...", file=stderr)

    # build all "field:values" pairs per criterion field
    # (list of list of strings: future lucene query chunks)
    all_possibilities = []

    # hello to our colleague Nourdine Combo, by the way!
    n_combos = 1

    for my_criterion in crit_fields:
        field_outcomes = facet_vals(my_criterion)
        n_combos = n_combos * len(field_outcomes)
        # lucene query chunks (no quotes around boolean operators)
        all_possibilities.append(
            [my_criterion + ':' + val for val in field_outcomes]
            )

    # e.g. 2 criteria give 2 lists in all_possibilities:
    # [
    #  ['qualityIndicators.refBibsNative:T', 'qualityIndicators.refBibsNative:F'],
    #  ['corpusName:brill', 'corpusName:bmj', ... 'corpusName:journals']
    # ]

    # list combos (cartesian product of field_outcomes)
    # we're directly unpacking *args into itertools.product()
    # (=> we get an iterator over tuples of combinable query chunks)
    combinations = product(*all_possibilities)

    # example for -c corpusName, publicationDate: each combination is a
    # tuple like ('corpusName:ecco', 'publicationDate:[* TO 1959]')

    # ---------------------------------------------------------------
    # (2) getting total counts for each criteria --------------------

    # number of counted answers
    # (1 doc can give several hits if a criterion was multivalued)
    N_reponses = 0

    # do the counting for each combo
    for i, combi in enumerate(sorted(combinations)):
        # periodic progress feedback
        if i % 100 == 0:
            print("pool %i/%i" % (i, n_combos), file=stderr)

        query = " AND ".join(combi)

        # counting requests ++++
        freq = api.count(query)

        if verbose:
            print("pool:'% -30s': % 8i" % (query, freq), file=stderr)

        # storing and aggregation
        N_reponses += freq
        abs_freqs[query] = freq

    # number of documents sending answers (hence normalizing constant N)
    N_workdocs = api.count(" AND ".join([k + ":*" for k in crit_fields]))

    # for comparison: all_docs = N + api.count(q="NOT(criterion:*)")
    doc_grand_total = api.count(q='*')

    if verbose:
        print("--------- pool totals -----------", file=stderr)
        print("#answered hits   : % 12s" % N_reponses, file=stderr)
        print("#workdocs (N)    : % 12s" % N_workdocs, file=stderr)
        print("#all API docs fyi: % 12s" % doc_grand_total, file=stderr)
        print("---------------------------------", file=stderr)

    # resulting pool info in f + various totals
    return {"f": abs_freqs,
            "nr": N_reponses,
            "nd": N_workdocs,
            "totd": doc_grand_total}


if __name__ == "__main__":
    # cli arguments
    args = my_parse_args(argv[1:])

    # run
    json_result = pooling(args.criteria_list, verbose=args.verbose)

    # output: valid and pretty-printed json
    print(dumps(json_result, indent=2, sort_keys=True))