diff --git a/api.py b/api.py
index 7bf0c48..61f48cd 100755
--- a/api.py
+++ b/api.py
@@ -290,7 +290,7 @@
             my_passw = getpass(prompt=' => Mot de passe: ')
         else:
-            for i, did in enumerate(ids):
+            for i, did in enumerate(list_of_ids):
                 # on ne refait pas le 1er car il a marché
                 if i == 0:
                     continue
diff --git a/corpusdirs.py b/corpusdirs.py
old mode 100644
new mode 100755
index 45a7f8c..85fd368
--- a/corpusdirs.py
+++ b/corpusdirs.py
@@ -21,6 +21,34 @@
 from subprocess import call
 from json import dump, load
 
+# for standalone use
+# corpusdirs.py new_corpus_name -t info_table.tsv
+from argparse import ArgumentParser, RawDescriptionHelpFormatter
+from sys import argv
+# move() is needed by the dtd_repair() fallback below
+# (editor assumption: shutil was not already imported elsewhere in this file)
+from shutil import move
+
+# local imports
+try:
+    # PATH 1: the folder is used as a library
+    # inside a larger package (example: bib-adapt-corpus)
+    from libconsulte import api
+    from libconsulte import field_value_lists
+    # =<< target_language_values, target_scat_values,
+    #     target_genre_values, target_date_ranges
+except ImportError:
+    try:
+        # PATH 2: the script is called from its own folder
+        # (example: we just want to run the sampler on its own)
+        import api
+        import field_value_lists
+
+    # case where there is really nothing to import
+    except ImportError:
+        print("ERR: the modules 'api.py' and 'field_value_lists.py' must be placed next to corpusdirs.py or in a PYTHONPATH folder for it to run properly...")
+        exit(1)
+
+
 # Infos structurelles de corpus par défaut
 BSHELVES = {
     # basic set ----------------------------------------------------------
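A note on the import cascade above: this is the usual two-path import for code that must work both as part of a package (libconsulte) and standalone. A minimal, self-contained sketch of the same pattern, with hypothetical module names (pkg, tools):

    try:
        from pkg import tools        # path 1: imported as part of a package
    except ImportError:
        try:
            import tools             # path 2: run from the source directory
        except ImportError:
            # nothing importable at all: fail early with a clear message
            raise SystemExit("ERR: tools.py must sit next to this script or on PYTHONPATH")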
@@ -53,17 +81,18 @@
     # ------------------------------------------------------------
     # £TODO absolument une dir extraite de s1 sous la forme read_dir
-    def __init__(self, ko_name, new_infos=None,
+    def __init__(self, ko_name, new_infos=None, new_home=None,
                  read_dir=False, corpus_type='gold',
-                 verbose=False, new_home=None,
-                 shelves_struct=None):
+                 verbose=False, shelves_struct=None):
         """
        2 INPUT modes
         -IN: *new_infos* : a metadata table (eg sampler output)
              (no fulltexts yet, no workdir needed)
+             + *new_home* : the container dir (ex: '.')
         -IN: *read_dir* : path to an existing Corpus dir
              (with data/ and meta/ subdirs, etc.)
+             + *new_home* : the container dir (ex: '.')
 
        In both modes:
          *new_home* is THE_CONTAINER DIR (private _home)
+         *shtruct* is A_SHELVES_STRUCTURE (private _shtruct)
@@ -87,15 +116,15 @@
        and also:
          self.meta    self.cols    self.name    self.cdir
          ---------    ---------    ---------    ---------
        """
-        if not path.exists(new_home):
-            # suggérer bako assistant_installation à l'utilisateur ?
+        if not new_home or not path.exists(new_home):
+            print("Please provide an existing container directory to save the corpus directory into (current: %s)" % new_home)
             raise FileNotFoundError(new_home)
 
         # VAR 1: **home** our absolute container address ----------
         # (version absolue du chemin de base indiqué à l'initialisation)
         self._home = path.abspath(new_home)
 
-        # VAR 2: **shtruct** our absolute container address ----------
+        # VAR 2: **shtruct** the structure of our "shelves" (subdirs) --
         # (structure for each possible shelf of this instance)
 
         # 4 possibilités ici :(
@@ -163,14 +192,15 @@
         else:
             self.cdir = read_dir
 
-            # read corresponding infos
+            # find corresponding infos
             infos_path = path.join(self.cdir,'meta','infos.tab')
             try:
                 fi = open(infos_path,'r')    # todo idem pour triggers
             except FileNotFoundError as fnf_err:
                 fnf_err.pi_mon_rel_path = path.join(ko_name, 'meta','infos.tab')
                 raise fnf_err
-            new_infos = fi.readlines()
+            # read in from meta/infos.tab to RAM
+            new_infos = [l.rstrip() for l in fi.readlines()]
             fi.close()
 
             if verbose:
@@ -220,6 +250,10 @@
         # VARS 6 and 7: >> meta << and >> cols << lookup tables --------
         if new_infos:
+            # the source in usual INIT mode: directly the new_infos lines
+            # + we also get here in READ mode, with info lines retrieved from fs
+            # (the only exception is empty inits)
+
             # a simple csv reader (headers as in sampler.STD_MAP)
             records_obj = DictReader(new_infos, delimiter='\t')
 
@@ -265,7 +299,7 @@
         bn_fh.close()
         # SAVE META: shelfs (flags if some fulltexts already present)
         triggrs = open(trig_path,'w')
-        dump(self.shelfs, triggrs)    # json.dump
+        dump(self.shelfs, triggrs, indent=2)    # json.dump
         triggrs.close()
         # £TODO ici tree.json DATE x PUB
@@ -287,23 +321,12 @@
             print(self.ctype+'\n', file=touch_type)
             touch_type.close()
 
-        # print triggers
+        # print triggers (active/passive shelves)
         if verbose:
-            print("\n.shelfs:")
-            triggers_dirs = []
-            for shelf, bol in self.shelfs.items():
-                on_off = ' ON' if bol else 'off'
-                ppdir = self._shtruct[shelf]['d']
-                triggers_dirs.append([ppdir,on_off])
-            for td in sorted(triggers_dirs):
-                print("  > %-3s --- %s" % (td[1], td[0]))
-
-
+            self.print_corpus_info()
 
         # si on a eu un extension
         # (sera différente si et seulement si init objet fille)
         self._save_shelves_map()
-
-        print("\n===( CORPUS SIZE: %i docs )===\n" % self.size)
 
     # ------------------------------------------------------------
     #          C O R P U S    A C C E S S O R S
@@ -412,7 +435,7 @@
         """
         trig_path = path.join(self.cdir,'meta','shelf_triggers.json')
         triggrs = open(trig_path,'w')
-        dump(self.shelfs, triggrs)    # json.dump
+        dump(self.shelfs, triggrs, indent=2)    # json.dump
         triggrs.close()
 
     def _save_shelves_map(self):
@@ -422,7 +445,7 @@
         map_path = path.join(self.cdir,'meta','shelves_map.json')
         # write shtruct to meta/shelves_map.json
         shmap = open(map_path,'w')
-        dump(self._shtruct, shmap)    # json.dump
+        dump(self._shtruct, shmap, indent=2)    # json.dump
         shmap.close()
 
@@ -434,6 +457,20 @@
         got_shelves = [sh for sh in all_sorted if self.shelfs[sh]]
         return got_shelves
 
+    def print_corpus_info(self):
+        """
+        Prints a short list of the possible shelves with ON/off status
+        and basic info: corpus name and size
+        """
+        print("======= CORPUSDIRS [%s] =======" % self.name)
+        triggers_dirs = []
+        for shelf, bol in self.shelfs.items():
+            on_off = ' ON' if bol else 'off'
+            ppdir = self._shtruct[shelf]['d']
+            triggers_dirs.append([ppdir, on_off])
+        for td in sorted(triggers_dirs):
+            print("  > %-3s --- %s" % (td[1], td[0]))
+        print("\n=====( SIZE: %i docs x %i shelfs )=====\n" % (self.size, len(self.shelfs)))
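A note on the DictReader(new_infos, delimiter='\t') call above: csv.DictReader accepts any iterable of strings, which is why the rstripped lines of meta/infos.tab can be fed to it directly. A minimal sketch, with a hypothetical two-column table of the same shape:

    from csv import DictReader

    lines = [
        "istex_id\tlot",        # header row (hypothetical column names)
        "0123ABCD\telsevier",
        "4567EFGH\twiley",
    ]
    for row in DictReader(lines, delimiter='\t'):
        print(row['istex_id'], row['lot'])
    # 0123ABCD elsevier
    # 4567EFGH wiley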
    # ------------------------------------------------------------
    #    C O R P U S   B A S E   C O N V E R T E R S
@@ -451,7 +488,7 @@
         if not dtd_prefix:
             dtd_prefix = path.abspath(path.join(THIS_SCRIPT_DIR,'etc','dtd_mashup'))
         if debug_lvl >= 1:
-            print("DTD REPAIR: new dtd_prefix: '%s'" % dtd_prefix)
+            print("DTD_REPAIR: new dtd_prefix: '%s'" % dtd_prefix)
 
         # corpus home
         if not our_home:
@@ -461,13 +498,33 @@
         if self.shelfs['XMLN']:
             todofiles = self.fileids(my_shelf="XMLN")
 
+            nb_missing = 0
+            nb_uerrors = 0
+            nb_no_dtd_wiley = 0
+            nb_no_dtd_other = 0
+
             # temporary repaired_dir
             repaired_dir = path.join(self.cdir, 'data', 'with_dtd_repaired')
-            mkdir(repaired_dir)
+            if path.exists(repaired_dir):
+                print('DTD_REPAIR: overwriting previous aborted reparation')
+            else:
+                mkdir(repaired_dir)
 
             for fi in todofiles:
-                fh = open(fi, 'r')
-                long_str = fh.read()
+                try:
+                    fh = open(fi, 'r')
+                except FileNotFoundError as fnfe:
+                    nb_missing += 1
+                    print("DTD_REPAIR (skip) missing source file %s" % fi)
+                    # skip this file entirely (fh is undefined here)
+                    continue
+                try:
+                    long_str = fh.read()
+                except UnicodeDecodeError as ue:
+                    nb_uerrors += 1
+                    print("DTD_REPAIR (skip) UTF-8 decode error in input file %s" % fi)
+                    fh.close()
+                    # move the file as is and skip the repair
+                    move(fi, path.join(repaired_dir, path.basename(fi)))
+                    continue
+                # £TODO alternative: add to an error/ignore list on the object
                 fh.close()
 
             # splits a doctype declaration in 3 elements
@@ -497,26 +554,36 @@
                     new_str = sub(escape(original_declaration), new_declaration, long_str)
 
                     # save
-                    filename = path.basename(fi)
-                    outfile = open(repaired_dir+'/'+filename, 'w')
+                    outfile = open(path.join(repaired_dir, path.basename(fi)), 'w')
                     outfile.write(new_str)
                     outfile.close()
                 else:
                     if not search(r'wiley', long_str):
                         # wiley often has no DTD declaration, just ns
-                        print('dtd_repair (skip) no match on %s' % fi)
+                        nb_no_dtd_other += 1
+                        print('DTD_REPAIR (skip) no match on %s' % fi)
+                    else:
+                        nb_no_dtd_wiley += 1
                     # save as is
                     filename = path.basename(fi)
-                    outfile = open(repaired_dir+'/'+filename, 'w')
+                    outfile = open(path.join(repaired_dir, path.basename(fi)), 'w')
                     outfile.write(long_str)
                     outfile.close()
 
             # rename to std dir
             orig_dir = self.shelf_path("XMLN")
             if debug_lvl >= 2:
-                print("dtd_repair: replacing native XMLs in %s by temporary contents from %s" % (orig_dir, repaired_dir))
+                print("DTD_REPAIR: replacing native XMLs in %s by temporary contents from %s" % (orig_dir, repaired_dir))
             rmtree(orig_dir)
             rename(repaired_dir, orig_dir)
+
+            # report
+            print("----------")
+            print("DTD_REPAIR:errors: %i missing source files" % nb_missing)
+            print("DTD_REPAIR:errors: %i unicode error files" % nb_uerrors)
+            print("DTD_REPAIR:warn: %i wiley files with no dtd (normal)" % nb_no_dtd_wiley)
+            print("DTD_REPAIR:warn: %i other files with no dtd (unknown)" % nb_no_dtd_other)
+            print("----------")
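The gist of the repair step above, in one illustrative substitution: repoint the DOCTYPE's SYSTEM identifier to a local copy under dtd_prefix. The regex below is a simplified stand-in for the module's real declaration splitting, not the actual code, and the prefix path is hypothetical:

    from re import sub

    dtd_prefix = '/opt/istex/etc/dtd_mashup'    # hypothetical local prefix
    long_str = '<!DOCTYPE article SYSTEM "http://dtd.example.com/archive/article.dtd"><article/>'

    new_str = sub(
        r'<!DOCTYPE\s+(\w+)\s+SYSTEM\s+"[^"]*/([^"/]+)"',
        lambda m: '<!DOCTYPE %s SYSTEM "%s/%s"' % (m.group(1), dtd_prefix, m.group(2)),
        long_str)
    print(new_str)
    # <!DOCTYPE article SYSTEM "/opt/istex/etc/dtd_mashup/article.dtd"><article/>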
 
     # GOLDTEI
@@ -533,7 +600,7 @@
         if not our_home:
             our_home = self._home
 
-        print("*** XSL: CONVERSION PUB2TEI (NATIF VERS GOLD) ***")
+        print("XSL: PUB2TEI CONVERSION (NATIVE XML TO GOLD)")
 
         if not pub2tei_dir:
             # chemin relatif au point de lancement
             p2t_path = path.join(THIS_SCRIPT_DIR, 'etc', 'Pub2TEI', 'Stylesheets','Publishers.xsl')
@@ -543,6 +610,8 @@
             if not path.exists(p2t_path):
                 print("%s doit au moins contenir Stylesheets/Publishers.xsl" % pub2tei_dir)
 
+        # verification counter (does each output file appear?)
+        nb_errors = 0
 
         # si dossier d'entrée
         if self.shelfs['XMLN']:
@@ -552,8 +621,8 @@
             gtei_dirpath = self.shelf_path("GTEI")
 
             if debug_lvl > 0:
-                print("XSL src dir: %s" % xml_dirpath)
-                print("XSL tgt dir: %s" % gtei_dirpath)
+                print("XSL: src dir=%s" % xml_dirpath)
+                print("XSL: tgt dir=%s" % gtei_dirpath)
 
             # mdkir dossier de sortie
             if not path.exists(gtei_dirpath):
@@ -570,7 +639,7 @@
                 ]
 
             if debug_lvl > 0:
-                print("XSL:dbg: appel=%s" % call_args)
+                print("XSL: (debug) call=%s" % call_args)
 
             try:
                 # subprocess.call -----
@@ -582,13 +651,174 @@
             else:
                 raise
 
-            # renommage en .tei.xml comme attendu par fileids()
+            # check whether each doc actually went through
+            # and rename to .tei.xml as expected by fileids()
             for fid in self.bnames:
-                rename(path.join(gtei_dirpath, fid+'.xml'),
-                        path.join(gtei_dirpath, fid+'.tei.xml'))
+                try:
+                    rename(path.join(gtei_dirpath, fid+'.xml'),
+                            path.join(gtei_dirpath, fid+'.tei.xml'))
+                except FileNotFoundError as fnfe:
+                    # £TODO alternative: keep an error/ignore list on the object
+                    print("XSL (skip) doc %s failed transformation" % fid)
+                    nb_errors += 1
 
             # on ne renvoie pas de valeur de retour, on signale juste le succès ou non
             if retval == 0:
-                print("*** XSL: CONVERSIONS RÉUSSIES ***")
+                print("----------")
+                print("XSL: %i successful transformations (all)" % self.size)
+                print("----------")
             else:
-                print("*** XSL: echec (partiel?) des conversions ***" % len(self.bnames))
+                nb_ok = self.size - nb_errors
+                print("----------")
+                print("XSL: %i successful transformations" % nb_ok)
+                print("XSL: %i failed transformations" % nb_errors)
+                print("----------")
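For context on call_args above: pub2goldtei() delegates the actual transformation to an external XSLT 2.0 processor via subprocess.call. A hedged sketch of such an invocation; the Saxon jar path, the flags and the shelf directory names here are assumptions, not the method's exact arguments:

    from subprocess import call

    call_args = [
        'java', '-jar', '/opt/saxon/saxon9he.jar',        # hypothetical processor path
        '-s:my_corpus/data/A-xmlnatifs',                  # source shelf (native XML), hypothetical name
        '-xsl:etc/Pub2TEI/Stylesheets/Publishers.xsl',
        '-o:my_corpus/data/C-goldxmltei',                 # target shelf (gold TEI)
    ]
    retval = call(call_args)
    if retval != 0:
        print("XSL: processor exited with code %i" % retval)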
 
+
+# standalone usage: create the basic corpus folders from a metadata table
+if __name__ == "__main__":
+    """
+    Initialization of a basic corpus and download of its fulltexts
+     - input: a metadata table infos.tab (fs path)
+
+    Metadata, stored under meta/
+     - basenames.ls
+     - infos.tab
+
+    Data: 3 formats, stored under data/
+     - .pdf,
+     - .xml (native)
+     - and .tei.xml (pub2tei)
+
+    Position in the file system:
+     under ./corpus_name
+    """
+
+    parser = ArgumentParser(
+        formatter_class=RawDescriptionHelpFormatter,
+        description="""
+ ---------------------------------------------
+  ISTEX-RD corpus operator (tool and library)
+ ---------------------------------------------
+""",
+        usage="""
+------
+ corpusdirs.py un_nom_de_corpus --from mes_docs.tsv""",
+        epilog="""
+Actions:
+--------
+ 1) downloads the native PDF+XML from the API
+ 2) repairs the DTDs of the native XMLs
+ 3) runs a Pub2TEI conversion
+
+ => everything in tidy folders
+    under ./un_nom_de_corpus/
+
+ --- © 2015 Inist-CNRS (ISTEX)  romain.loth at inist.fr ---"""
+        )
+
+    # positional (mandatory) argument: the corpus name
+    parser.add_argument(
+        'un_nom_de_corpus',
+        type=str,
+        help="name of the new corpus folder to create"
+    )
+
+    parser.add_argument('--from',
+        dest='from_table',    # 'from' is a python keyword: an explicit dest is required
+        metavar='mes_docs.tsv',
+        help="""input table (any tsv with the istex_id in COL1 and the lot name in COL2...
+                for instance: the detailed output of the sampler.py)""",
+        type=str,
+        default=None,
+        required=True,
+        action='store')
+
+    parser.add_argument('--debug',
+        metavar='1',
+        help="level of verbose/debug infos [default:0]",
+        type=int,
+        default=0,
+        action='store')
+
+    args = parser.parse_args(argv[1:])
+
+    from_table = args.from_table
+    debug = args.debug
+    corpus_name = args.un_nom_de_corpus
+    # =============================================
+
+    if path.exists(corpus_name):
+        print("ERR: the name '%s' is already taken in this folder" % corpus_name)
+        exit(1)
+
+    # (1/4) initial sample (just the table) -------------------------------
+    if path.exists(from_table):
+        fic = open(from_table)
+        my_tab = [l.rstrip() for l in fic.readlines()]
+        fic.close()
+    else:
+        print("ERR bako.make_set: can't find the table '%s' to initialize the corpus" % from_table)
+        exit(1)
+
+    # (2/4) our corpus class ----------------------------------------------
+
+    # Corpus
+    # initialization
+    #  - table-only mode => will make a meta/ folder and an empty data/,
+    #  - corpus_type is hard-coded to 'gold', which simply signals
+    #    that we keep the default shelves
+    cobj = Corpus(corpus_name, new_infos=my_tab, new_home='.', verbose=(debug > 0))
+
+    # (3/4) fulltext download ---------------------------------------------
+
+    my_ids = cobj.cols['istex_id']
+    my_basenames = cobj.bnames
+
+    for the_shelf in ['PDF0', 'XMLN']:
+        the_api_type = cobj.origin(the_shelf)
+        the_ext = cobj.filext(the_shelf)
+        tgt_dir = cobj.shelf_path(the_shelf)
+
+        print("mkdir -p: %s" % tgt_dir)
+        mkdir(tgt_dir)
+
+        api.write_fulltexts_loop_interact(
+            my_ids, my_basenames,
+            tgt_dir=tgt_dir,
+            api_types=[the_api_type]
+        )
+        print("MAKE_SET: saved docs into CORPUS_HOME:%s" % cobj.name)
+        if debug > 0:
+            print("  (=> target dir:%s)" % tgt_dir)
+
+    # NB: cobj.filext(the_shelf) must match the extension used by the API,
+    #     or api.write_fulltexts must be allowed to change (rename) extensions
+
+    cobj.assert_docs('PDF0')
+    cobj.assert_docs('XMLN')
+
+    # persist the status of the 2 folders we just created
+    cobj.save_shelves_status()
+
+    # (4/4) tei conversion (gold biblStruct type) --------------------------
+
+    # copy while switching the dtd pointers
+    print("***DTD LINKING***")
+    cobj.dtd_repair(debug_lvl=debug)
+
+    print("***XML => TEI.XML CONVERSION***")
+
+    # will create the C-goldxmltei folder
+    cobj.pub2goldtei(debug_lvl=debug)    # conversion
+
+    cobj.assert_docs('GTEI')
+
+    # persist the status of the created folder
+    cobj.save_shelves_status()
+
+    # voilà!
+    cobj.print_corpus_info()
+    print("Corpus dirs successfully created in %s" % cobj.cdir)
+
diff --git a/sampler.py b/sampler.py
index 2af1ce0..2970221 100755
--- a/sampler.py
+++ b/sampler.py
@@ -415,7 +415,7 @@
     pool_info = {'f':abs_freqs, 'nr':N_reponses, 'nd':N_workdocs, 'totd':doc_grand_total}
     # json.dump
-    dump(pool_info, cache)
+    dump(pool_info, cache, indent=1)
     cache.close()
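One last note on the dump() changes: all four of them in this patch (indent=2 in corpusdirs.py, indent=1 here) only change how the cached JSON is laid out on disk, not its content. A quick illustration:

    from json import dumps

    shelfs = {'PDF0': True, 'XMLN': True, 'GTEI': False}
    print(dumps(shelfs))
    # {"PDF0": true, "XMLN": true, "GTEI": false}
    print(dumps(shelfs, indent=2))
    # {
    #   "PDF0": true,
    #   "XMLN": true,
    #   "GTEI": false
    # }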