"""Extract metadata from one PDF and emit it as a single JSON object.

The script reads a file path from standard input, obtains the document text
through ``ocr_pdf``, looks for a DOI with ``find_doi`` and checks it with
``verify_doi``, extracts the text returned by
``get_caracs_between(full_text, "Publication", "Title:")`` and finally writes
one JSON object to standard output with the keys ``nom_complet``, ``doi``,
``unverified_doi``, ``item_editor`` and ``full_text``.

All four helpers come from ``global_functions``; their exact semantics are not
visible from this file.
"""
# The star import stays first so that the explicit standard-library imports
# below take precedence over any same-named objects it might export.
from global_functions import *
import sys
import json
import re  # imported explicitly instead of relying on the star import to leak it

# The path of the PDF to process is passed on stdin.
filename = sys.stdin.read().strip()
full_text = ocr_pdf(filename)

# A DOI candidate is reported either as verified (`doi`) or as unverified
# (`unverified_doi`), never as both: once verified, the candidate is moved.
unverified_doi = find_doi(full_text)
doi = ""
if unverified_doi and verify_doi(unverified_doi):
    doi = unverified_doi
    unverified_doi = ""

# Flatten the extracted span onto one line and label it when non-empty.
item_editor = get_caracs_between(full_text, "Publication", "Title:").replace("\n", " ")
if item_editor:
    item_editor = "Publication + Title: " + item_editor

# The second "/"-separated component of the path is used as the name.
# A path without any "/" used to raise IndexError; fall back to the whole
# string in that case.
# NOTE(review): if the base name is what is wanted, paths with more than one
# directory level yield a directory name here -- confirm with the caller.
path_parts = filename.split("/")
nom_complet = path_parts[1] if len(path_parts) > 1 else filename

# Raw strings avoid the invalid "\s" escape of the original literal.
# NOTE(review): the inner substitution already collapses every run of two or
# more whitespace characters (newlines included) into a single space, so the
# outer newline-collapsing pass has nothing left to match. It is kept to
# preserve the emitted text exactly; swap the order if blank-line collapsing
# was the intent.
collapsed_text = re.sub(r"\n\n+", "\n", re.sub(r"\s\s+", " ", full_text))

new_row = {
    "nom_complet": nom_complet,
    "doi": doi,
    "unverified_doi": unverified_doi,
    "item_editor": item_editor,
    "full_text": collapsed_text,
}
sys.stdout.write(json.dumps(new_row))