Newer
Older
APC-bills / v1 / editorsFunctions / 0001097617.py
@leogail leogail on 1 Aug 813 bytes fix(apc): fix jsons
# The wildcard import stays first so that the explicit standard-library imports
# below take precedence over any same-named objects it may export.
from global_functions import *

import json
import re
import sys
  4. filename = sys.stdin.read().strip()
  5. full_text = ocr_pdf(filename)
  6. ## Extract infos
  7. # DOI
  8. unverified_doi = find_doi(full_text)
  9. doi = ""
  10. #only if a doi is found
  11. if unverified_doi:
  12. if verify_doi(unverified_doi):
  13. #If DOI is found, write it in "doi" column instead of 'unverified_doi' column.
  14. doi=unverified_doi
  15. unverified_doi=""
  16. # Article
  17. item_editor = get_caracs_between(full_text,"Publication","Title:").replace("\n"," ")
  18. if item_editor != "":
  19. item_editor = "Publication + Title: " + item_editor
  20. new_row = {"nom_complet": filename.split("/")[1], "doi": doi, "unverified_doi":unverified_doi ,"item_editor":item_editor,"full_text": re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))}
  21. sys.stdout.write((json.dumps(new_row)))