diff --git a/api.py b/api.py
index 4f52f80..3afbf7f 100644
--- a/api.py
+++ b/api.py
@@ -12,12 +12,14 @@
 from json import loads
 from urllib.parse import quote
 from urllib.request import urlopen, HTTPBasicAuthHandler, build_opener, install_opener
-from urllib.error import URLError
+from urllib.error import URLError, HTTPError
 from getpass import getpass
 from os import path
 from sys import stderr
 from re import sub
-from json import dumps    # pretty printing if debug or main
+from json import dumps
+from collections import OrderedDict
+from random import sample
 
 # globals
 DEFAULT_API_CONF = {
@@ -31,21 +33,23 @@
     def __str__(self):
         return repr(self.msg)
 
-# private function
-# ----------------
+
+
+# private functions
+# -----------------
+
 def _get(my_url):
     """
     Get remote url *that contains a ~json~*
-    and parse it
+    and parse it into an OrderedDict
     """
     # print("> api._get:%s" % my_url, file=stderr)
 
     try:
         remote_file = urlopen(my_url)
-    except URLError as url_e:
-        # signals 401 Unauthorized or 404 etc
+    except (HTTPError, URLError) as url_e:
+        # signals 401 Unauthorized or 404 or 400 etc
         print("api: HTTP ERR (%s) on '%s'" % (url_e.reason, my_url), file=stderr)
         # more info: server, Content-Type, WWW-Authenticate..
@@ -59,7 +63,7 @@
               % my_url, file=stderr)
     remote_file.close()
     result_str = response.decode('UTF-8')
-    json_values = loads(result_str)
+    json_values = loads(result_str, object_pairs_hook=OrderedDict)
     return json_values
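
A note on the object_pairs_hook switch above: this is the standard json
module's hook for choosing the mapping type. A minimal standalone sketch
(the payload string is made up) showing the only behavioral difference:

    from collections import OrderedDict
    from json import loads

    payload = '{"total": 2, "hits": []}'
    loads(payload)                                  # plain dict
    loads(payload, object_pairs_hook=OrderedDict)   # OrderedDict([('total', 2), ('hits', [])])

Keys now come back in exactly the order the API serialized them, so field
order stays stable when hits are re-dumped or written out as a table.
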
@@ -111,9 +115,7 @@
 
 # public functions
 # ----------------
-# £TODO1: split the i_from mode into a separate random_search_one function
-# £TODO2: default limit=1 to avoid downloading everything in a quick test??
-# £TODO3: disk caching in a temp file if the list is large and the fields numerous
+# £TODO: disk caching in a temp file if the list is large and the fields numerous
 def search(q, api_conf=DEFAULT_API_CONF, limit=None, n_docs=None,
            outfields=('title','host.issn','fulltext'), i_from=0):
     """
     Query the API and get a (perhaps long) "hits" array of json metadata.
@@ -139,13 +141,12 @@
                 so as not to interfere with the pagination logic of
                 more usual queries starting at 0
 
-    Output format is a parsed json with a total value and a hit list:
-    { 'hits': [ { 'id': '21B88F4EFBA46DC85E863709CA9824DEED7B7BFC',
+    Output format is the hit list parsed from the API's json:
+    [ { 'id': '21B88F4EFBA46DC85E863709CA9824DEED7B7BFC',
         'title': 'Recovering information borne by quanta that '
                  'crossed the black hole event horizon'},
       { 'id': 'C095E6F0A43EBE3E98E2E6E17DD8775617636034',
-        'title': 'Holographic insights and puzzles'}],
-      'total': 2}
+        'title': 'Holographic insights and puzzles'}]
     """
 
     # query preparation
@@ -162,15 +163,8 @@
 
     # print("api.search().base_url:", base_url)
 
-    # optional custom starting point (meant to be used with limit = 1 for random draws)
-    if i_from != 0:
-        if limit != 1:
-            raise NotImplementedError("'from' in API queries is only implemented here for sizes of 1 (use case: picking one random doc)")
-        else:
-            base_url += '&from=' + str(i_from)
-
     # optional cap provided by the --maxi switch
-    if limit is not None:
+    if (limit is not None) and isinstance(limit, int) and (0 <= limit < n_docs):
         n_docs = limit
 
     # the list of results to return
@@ -182,7 +176,7 @@
         my_url = base_url + '&size=%i' % n_docs
 
         # debug
-        # print("api.search()._get_url:", my_url0ll)
+        # print("api.search()._get_url:", my_url)
 
         try:
             json_values = _get(my_url)
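
For orientation, a call to the patched search() looks like this
(illustrative sketch only -- it assumes the default api.istex.fr
endpoint is reachable and that the query matches something):

    import api

    hits = api.search('quantum cat AND publicationDate:[1970 TO *]',
                      limit=10,
                      outfields=('title', 'publicationDate'))
    # => a list of at most 10 hit dicts, e.g. hits[0]['title']
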
@@ -207,6 +201,99 @@
 
     return(all_hits)
 
+
+
+
+# random query
+# ------------
+# the former i_from mode of search(), moved into a separate random_search function
+def random_search(q, quota=1, nb_known_docs=None, outfields=('title','host.issn','fulltext'), api_conf=DEFAULT_API_CONF):
+    """
+    Query the API for quota randomly drawn hits (doc => json metadata).
+
+    args:
+    -----
+    q  -- a lucene query
+          ex: "quantum cat AND publicationDate:[1970 TO *]"
+
+    quota -- how many hits to draw
+
+    nb_known_docs -- the total number of docs for this q, if already known
+                     (tells us the size of the pool we draw from)
+                     if None it will be recounted
+
+    optional kwargs:
+    - - - - - - - - -
+    outfields -- fieldNames list for the api to return for each hit
+    api_conf  -- an inherited http config dict with these 2 keys:
+                   * api_conf['host']  <- default: "api.istex.fr"
+                   * api_conf['route'] <- default: "document"
+
+    Output format same as search()
+    """
+
+    # query preparation
+    url_encoded_lucene_query = my_url_quoting(q)
+
+    # separate count
+    if nb_known_docs is None:
+        nb_known_docs = count(url_encoded_lucene_query, already_escaped=True)
+        # print('%s documents found' % nb_known_docs)
+
+
+    # ---------------- random draw ----------------------
+
+    # range [0,1,2,...,nb_known_docs-1]
+    all_indices = range(nb_known_docs)
+
+    # /!\ sample() from the standard "random" module
+    local_tirage = sample(
+        population=all_indices,
+        k=quota
+    )
+    # -----------
+
+    str_local_tirage = str(local_tirage)
+    # ellipsis
+    if len(str_local_tirage) > 40:
+        str_local_tirage = str_local_tirage[0:37] + '...'
+
+    # for information
+    print("  ... drawing among %i docs => %s" % (nb_known_docs, str_local_tirage), file=stderr)
+
+    # we now run several _get() calls one by one,
+    # each time sending the drawn index i
+    # to the API as its "from" parameter
+
+    # => can therefore be a bit slow
+
+    # common base of the URL
+    base_url = 'https:' + '//' + api_conf['host'] + '/' + api_conf['route'] + '/' + '?' + 'q=' + url_encoded_lucene_query + '&output=' + ",".join(outfields)
+
+    # our results
+    random_hits = []
+
+    for mon_indice in sorted(local_tirage):
+
+        my_url = base_url + '&from=' + str(mon_indice) + '&size=1'
+
+        # [1 json hit]
+        new_hits = _get(my_url)['hits']
+
+        if len(new_hits) != 1:
+            raise ValueError("empty answer for q=%s&from=%i ??" % (q, mon_indice))
+        else:
+            # store it
+            random_hits.append(new_hits.pop())
+
+    return(random_hits)
+
+
+
 def count(q, api_conf=DEFAULT_API_CONF, already_escaped=False):
     """
     Get total hits for a lucene query on ISTEX api.
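
The draw-then-fetch pattern above boils down to the following standalone
sketch (the total and the base URL are placeholders, not live values):

    from random import sample

    total = 12345                                # e.g. a count(q) result
    picked = sorted(sample(range(total), k=3))   # 3 indices, no repeats
    for i in picked:
        # one request per drawn index
        my_url = '<base_url>' + '&from=%i&size=1' % i

sample() draws without replacement, so the same document cannot be
fetched twice within one call; sorting merely keeps the requests in
index order for readable logs.
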
@@ -343,17 +430,38 @@
 
-def terms_facet(facet_name, q="*", api_conf=DEFAULT_API_CONF):
+def terms_facet(facet_name, q="*", size_param="*", min_val=None, api_conf=DEFAULT_API_CONF):
     """
     Get list of possible values/outcomes for a given field, with their counts
     (within the perimeter of the query q).
 
     output format {"facet_value_1": count_1, ...}
+
+    NB: simplification of the api's structure
+          api: [{'docCount': 8059500, 'key': 'eng'},
+                {'docCount': 1138473, 'key': 'deu'}]
+          => more compact output here:
+             {'eng': 8059500, 'deu': 1138473}
+
+    careful when interpreting the total
+    -----------------------------------
+    # here we work on a number of answers to ma_facette
+    nb_reps = sum(terms_facet("ma_facette").values())
+
+    # as opposed to a number of docs having the facet defined at all
+    nb_docs = api.count(q="ma_facette:*")
+
+    We always have: (nb_reps >= nb_docs)
     """
+    # size type check
+    if (not isinstance(size_param, int) and size_param != "*") or (isinstance(size_param, int) and size_param < 0):
+        raise TypeError("Facet queries require a size parameter that is an int >= 0 or '*', cf. https://api.istex.fr/documentation/300-search.html#facettes")
+
     # query preparation
     url_encoded_lucene_query = my_url_quoting(q)
 
     # URL construction
-    facet_url = 'https:' + '//' + api_conf['host'] + '/' + api_conf['route'] + '/' + '?' + 'q=' + url_encoded_lucene_query + '&facet=' + facet_name
+    facet_url = 'https:' + '//' + api_conf['host'] + '/' + api_conf['route'] + '/' + '?' + 'q=' + url_encoded_lucene_query + '&facet=' + facet_name + '[' + str(size_param) + ']'
 
     # request
     json_values = _get(facet_url)
@@ -367,14 +475,23 @@
     #    ]
     # => more compact output:
     #    {'eng': 8059500, 'deu': 1138473 }
-    simple_dict = {}
-    for record in key_counts:
-        k = record['key']
-        n = record['docCount']
-
-        simple_dict[k] = n
+    simpler = OrderedDict()
+    if min_val is None:
+        # classic mode: all results
+        for record in key_counts:
+            k = record['key']
+            n = record['docCount']
+            simpler[k] = n
+    else:
+        # filter mode, e.g. keep only values >= 2
+        for record in key_counts:
+            k = record['key']
+            n = record['docCount']
+            if n >= min_val:
+                simpler[k] = n
 
-    return simple_dict
+    return simpler
 
 
 def my_url_quoting(a_query):
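
A usage sketch for the extended terms_facet() (the counts shown are the
illustrative ones from the docstring, not live API values, and the calls
assume api.istex.fr is reachable):

    import api

    # all languages with at least 1000 answering docs, in facet order
    langs = api.terms_facet('language', size_param='*', min_val=1000)
    # e.g. OrderedDict([('eng', 8059500), ('deu', 1138473), ...])

    # the answers-vs-docs distinction from the docstring
    nb_reps = sum(api.terms_facet('language').values())
    nb_docs = api.count(q='language:*')
    assert nb_reps >= nb_docs   # a doc carrying several values answers several times
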
diff --git a/field_combo_count.py b/field_combo_count.py
index 1b14c4c..5209add 100644
--- a/field_combo_count.py
+++ b/field_combo_count.py
@@ -40,6 +40,8 @@
 from itertools import product
 from json import dumps
 from argparse import ArgumentParser
+from os import path
+from configparser import ConfigParser
 
 # local imports
@@ -64,6 +66,14 @@
     exit(1)
 
+# --------------------------------------------------------------------
+# reading the local config file:
+script_dir = path.dirname(path.realpath(__file__))
+CONF = ConfigParser()
+with open(path.join(script_dir, 'libconsulte.conf')) as f:
+    CONF.read_file(f)
+# --------------------------------------------------------------------
+
 
 def my_parse_args(arglist=None):
     """Command-line arguments for main()"""
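
The TERM_FIELDS lookup in the next hunk relies on how ConfigParser
handles multi-line values: continuation lines are dedented and joined
with newlines. A self-contained sketch of just that mechanism (inline
string instead of the real libconsulte.conf):

    from configparser import ConfigParser

    conf = ConfigParser()
    conf.read_string(
        "[known_fields]\n"
        "TERM_FIELDS = corpusName\n"
        "    language\n"
        "    genre\n"
    )
    conf['known_fields']['TERM_FIELDS'].split("\n")
    # => ['corpusName', 'language', 'genre']
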
@@ -113,21 +123,45 @@
        > ['elsevier','wiley', 'nature', 'sage', ...]
     """
-    if field_name in field_value_lists.TERMFACET_FIELDS_auto:
-        # second part if there is a "sub.type"
-        facet_name = sub('^[^.]+\.', '', field_name)
-        return(api.terms_facet(facet_name).keys())
+    term_fields = CONF['known_fields']['TERM_FIELDS'].split("\n")
 
-    elif field_name in field_value_lists.TERMFACET_FIELDS_local:
-        # we have 3 ad hoc lists in stock
-        if field_name == 'language':
-            return(field_value_lists.LANG)
-        elif field_name == 'genre':
-            return(field_value_lists.GENRE)
-        elif field_name == 'categories.wos':
-            return(field_value_lists.SCAT)
-        else:
-            raise UnimplementedError()
+
+    if field_name in term_fields:
+
+        # filter out categories with fewer than MIN_FACET_POOL docs
+        min_docs = int(CONF['divers']['MIN_FACET_POOL'])
+
+        key_list = list(
+            api.terms_facet(
+                field_name,
+                min_val=min_docs
+            ).keys()
+        )
+
+        key_list_terms = ["'%s'" % key for key in key_list]
+
+        # the complement: an explicit "other" facet
+        # ex: (NOT(eng OR fre OR deu OR unknown))
+        other_facet = '(NOT(' + ' OR '.join(key_list_terms) + '))'
+
+        key_list_terms.append(other_facet)
+
+        # print(key_list_terms)
+
+        return(key_list_terms)
+
+
+    #~ elif field_name in field_value_lists.TERMFACET_FIELDS_local:
+        #~ # we have 3 ad hoc lists in stock
+        #~ if field_name == 'language':
+            #~ return(field_value_lists.LANG)
+        #~ elif field_name == 'genre':
+            #~ return(field_value_lists.GENRE)
+        #~ elif field_name == 'categories.wos':
+            #~ return(field_value_lists.SCAT)
+        #~ else:
+            #~ raise NotImplementedError()
 
     elif field_name in field_value_lists.RANGEFACET_FIELDS:
         applicable_bins = {}
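
To make the complement facet concrete, here is what the chunk built
above evaluates to on a toy key list (the three keys are invented):

    key_list_terms = ["'eng'", "'fre'", "'deu'"]
    other_facet = '(NOT(' + ' OR '.join(key_list_terms) + '))'
    # => "(NOT('eng' OR 'fre' OR 'deu'))"
    key_list_terms.append(other_facet)
    # => 4 mutually exclusive lucene chunks covering the whole field
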
SCIENCES", - "MUSIC", - "MYCOLOGY", - "NANOSCIENCE & NANOTECHNOLOGY", - "NEUROIMAGING", - "NEUROSCIENCES", - "NUCLEAR SCIENCE & TECHNOLOGY", - "NURSING", - "NUTRITION & DIETETICS", - "OBSTETRICS & GYNECOLOGY", - "OCEANOGRAPHY", - "ONCOLOGY", - "OPERATIONS RESEARCH & MANAGEMENT SCIENCE", - "OPHTHALMOLOGY", - "OPTICS", - "ORTHOPEDICS", - "OTORHINOLARYNGOLOGY", - "PALEONTOLOGY", - "PARASITOLOGY", - "PATHOLOGY", - "PEDIATRICS", - "PERIPHERAL VASCULAR DISEASE", - "PHARMACOLOGY & PHARMACY", - "PHILOSOPHY", - "PHYSICS", - "PHYSICS, APPLIED", - "PHYSICS, ATOMIC, MOLECULAR & CHEMICAL", - "PHYSICS, CONDENSED MATTER", - "PHYSICS, FLUIDS & PLASMAS", - "PHYSICS, MATHEMATICAL", - "PHYSICS, MULTIDISCIPLINARY", - "PHYSICS, NUCLEAR", - "PHYSICS, PARTICLES & FIELDS", - "PHYSIOLOGY", - "PLANNING & DEVELOPMENT", - "PLANT SCIENCES", - "POLITICAL SCIENCE", - "POLYMER SCIENCE", - "PRIMARY HEALTH CARE", - "PSYCHIATRY", - "PSYCHOLOGY", - "PSYCHOLOGY, APPLIED", - "PSYCHOLOGY, BIOLOGICAL", - "PSYCHOLOGY, CLINICAL", - "PSYCHOLOGY, DEVELOPMENTAL", - "PSYCHOLOGY, EDUCATIONAL", - "PSYCHOLOGY, EXPERIMENTAL", - "PSYCHOLOGY, MULTIDISCIPLINARY", - "PSYCHOLOGY, SOCIAL", - "PUBLIC ADMINISTRATION", - "PUBLIC, ENVIRONMENTAL & OCCUPATIONAL HEALTH", - "RADIOLOGY, NUCLEAR MEDICINE & MEDICAL IMAGING", - "REHABILITATION", - "RELIGION", - "REMOTE SENSING", - "REPRODUCTIVE BIOLOGY", - "RESPIRATORY SYSTEM", - "RHEUMATOLOGY", - "ROBOTICS", - "SOCIAL ISSUES", - "SOCIAL SCIENCES, BIOMEDICAL", - "SOCIAL SCIENCES, INTERDISCIPLINARY", - "SOCIAL SCIENCES, MATHEMATICAL METHODS", - "SOCIAL WORK", - "SOCIOLOGY", - "SOIL SCIENCE", - "SPECTROSCOPY", - "SPORT SCIENCES", - "STATISTICS & PROBABILITY", - "SUBSTANCE ABUSE", - "SURGERY", - "TELECOMMUNICATIONS", - "THERMODYNAMICS", - "TOXICOLOGY", - "TRANSPLANTATION", - "TRANSPORTATION", - "TRANSPORTATION SCIENCE & TECHNOLOGY", - "TROPICAL MEDICINE", - "URBAN STUDIES", - "UROLOGY & NEPHROLOGY", - "VETERINARY SCIENCES", - "VIROLOGY", - "WATER RESOURCES", - "WOMEN'S STUDIES", - "ZOOLOGY" -) - # NBC ---------------------------------------------- 5 # bins again for number of chars <=> NBC <=> qualityIndicators.pdfCharCount diff --git a/libconsulte.conf b/libconsulte.conf new file mode 100644 index 0000000..670de53 --- /dev/null +++ b/libconsulte.conf @@ -0,0 +1,23 @@ +[known_fields] + +# champs qui peuvent faire l'objet d'une facette terme +# ---------------------------------------------------- +# (et qui ont un nb de valeurs possibles raisonnable) +# (par exemple pas 'title' ou 'author.name') +TERM_FIELDS = corpusName + qualityIndicators.pdfVersion + qualityIndicators.refBibsNative + language + genre + host.title + host.issn + host.isbn + serie.title + serie.issn + serie.isbn + categories.wos + +[divers] +MIN_FACET_POOL=20000 + +# DO_COMPLEMENT=1 \ No newline at end of file diff --git a/sampler.py b/sampler.py index 2c24620..d7fdf24 100755 --- a/sampler.py +++ b/sampler.py @@ -70,7 +70,7 @@ # limit on maximum runs before returning a potentially undersized sample MAX_RUNS = 5 # paramètre de lissage +k à chaque quota (aka lissage de Laplace) -LISSAGE = 0.2 +LISSAGE = 0.1 # list of IDs to exclude from the sample result FORBIDDEN_IDS = [] @@ -366,68 +366,39 @@ print("Retrieving random samples in each quota...", file=stderr) + for combi_query in sorted(rel_freqs.keys()): - json_hits = [] + random_json_hits = [] + my_n_got = 0 # how many hits do we need? 
diff --git a/sampler.py b/sampler.py
index 2c24620..d7fdf24 100755
--- a/sampler.py
+++ b/sampler.py
@@ -70,7 +70,7 @@
 # limit on maximum runs before returning a potentially undersized sample
 MAX_RUNS = 5
 # smoothing parameter +k on each quota (aka Laplace smoothing)
-LISSAGE = 0.2
+LISSAGE = 0.1
 # list of IDs to exclude from the sample result
 FORBIDDEN_IDS = []
@@ -366,68 +366,39 @@
     print("Retrieving random samples in each quota...", file=stderr)
+
     for combi_query in sorted(rel_freqs.keys()):
-        json_hits = []
+        random_json_hits = []
+        my_n_got = 0
 
         # how many hits do we need?
         my_quota = rel_freqs[combi_query]
 
-        # adding constraints
+        # now also take the -w constraint into account
         if constraint_query:
             my_query = '('+combi_query+') AND ('+constraint_query+')'
-            # for the available indices, we must recount with the constraint
-            all_indices = [i for i in range(api.count(my_query))]
-
-        # without a constraint the indices available
-        # for the random draw are simply [0:freq]
+            known_nb_docs = None    # hence we no longer know the total
+
+        # without a constraint we already have a total
         else:
             my_query = combi_query
-            all_indices = [i for i in range(abs_freqs[combi_query])]
+            known_nb_docs = abs_freqs[combi_query]
+
+        print("todo %-70s" % my_query[0:64]+"...", file=stderr)
 
-        # run search one by one with the drawn indices as FROM ---------
-
-        # here => random order
-        shuffle(all_indices)
-
-        # keep only the first n (the draw)
-        local_tirage = all_indices[0:my_quota]
-
-        # for information
-        if verbose:
-            print(" ... drawing among %i docs:\n ... picked => %s" % (len(all_indices), local_tirage), file=stderr)
-
-        for indice in local_tirage:
-            # ----------------- api.search(...) ----------------------------
-            new_hit = api.search(my_query,
-                                 limit=1,
-                                 i_from=indice,
-                                 n_docs=abs_freqs[combi_query],
-                                 outfields=STD_MAP.keys())
-            # outfields=('id','author.name','title','publicationDate','corpusName')
-
-            if len(new_hit) != 1:
-                # skip empties
-                # caused by a constraint
-                continue
-            else:
-                # store it
-                json_hits.append(new_hit.pop())
-            # --------------------------------------------------------------
-
-        # NB: 'id' field would be enough for sampling itself, but we get
-        #     more metadatas to be able to provide an info table
-        my_n_answers = len(json_hits)
-
-        my_n_got = 0
-
-        # for debug
-        # print("HITS:", json_hits, file=stderr)
-
+        # --------------- api.random_search(...) -----------------------
+        random_json_hits = api.random_search(
+            my_query,
+            # how many to draw (~ limit)
+            quota=my_quota,
+            # if the total is already known, pass it along
+            nb_known_docs=known_nb_docs,
+            outfields=STD_MAP.keys()
+        )
+
         # check unicity
-        for hit in json_hits:
+        for hit in random_json_hits:
             idi = hit['id']
 
             if idi not in index and idi not in FORBIDDEN_IDS:
@@ -492,7 +463,7 @@
             index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"
             index[idi]['bibnat'] = "UNKNOWN_REFBIBSNATIVE"
 
-        print("done %-70s: %i/%i" % (
+        print("done %-70s: %i/%i\n" % (
             my_query[0:64]+"...",
             my_n_got,
             my_quota
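
End to end, the refactored loop above replaces the shuffle-and-paginate
dance with one api.random_search() call per quota cell. A standalone
approximation of that call path (the query and quota are invented; it
assumes the api.py from this same patch is importable and the API is
reachable):

    import api

    my_query = '(language:eng) AND (qualityIndicators.refBibsNative:true)'
    hits = api.random_search(my_query,
                             quota=5,
                             nb_known_docs=None)   # None => recounted inside
    ids = [h['id'] for h in hits]   # afterwards deduplicated via `index`
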