"""Extract metadata from one PDF and emit it as a JSON object on stdout.

The path of the PDF is read from stdin.  The text returned by ``ocr_pdf``
is searched for a DOI and for the text located between the markers
"Item" and "Total"; the result is written to stdout as a single JSON
object with the keys ``nom_complet``, ``doi``, ``unverified_doi``,
``item_editor`` and ``full_text``.
"""
from global_functions import *

# Standard-library imports come after the wildcard import so that the names
# used below are guaranteed to be the real modules.  ``re`` is imported
# explicitly instead of relying on ``global_functions`` re-exporting it.
import json
import re
import sys

filename = sys.stdin.read().strip()
full_text = ocr_pdf(filename)

## Extract infos

# DOI
unverified_doi = find_doi(full_text)
doi = ""
# Only when a DOI candidate was found and it passes verification: report it
# in the "doi" column instead of the "unverified_doi" column.
if unverified_doi and verify_doi(unverified_doi):
    doi = unverified_doi
    unverified_doi = ""

# Article
item_editor = get_caracs_between(full_text, "Item", "Total").replace("\n", " ")
if item_editor != "":
    item_editor = "Item: " + item_editor

# The second "/"-separated component is the reported name (e.g. "dir/file.pdf"
# -> "file.pdf").  A path without any "/" used to raise IndexError; fall back
# to the only component in that case.
# NOTE(review): for deeper paths ("a/b/c.pdf") this yields "b", not the file
# name -- confirm that callers always pass exactly one directory level.
path_parts = filename.split("/")
nom_complet = path_parts[1] if len(path_parts) > 1 else path_parts[0]

# Collapse every run of two or more whitespace characters into one space.
# Raw strings avoid the invalid "\s" escape of the original literals.
# NOTE(review): after this first pass no two whitespace characters are
# adjacent, so the newline-collapsing pass below can never match; it is kept
# to leave the emitted text unchanged -- confirm the intended order.
cleaned_text = re.sub(r"\s\s+", " ", full_text)
cleaned_text = re.sub(r"\n\n+", "\n", cleaned_text)

new_row = {
    "nom_complet": nom_complet,
    "doi": doi,
    "unverified_doi": unverified_doi,
    "item_editor": item_editor,
    "full_text": cleaned_text,
}

sys.stdout.write(json.dumps(new_row))