Newer
Older
APC-bills / v1 / editorsFunctions / 0001189058.py
@leogail leogail on 1 Aug 897 bytes fix(apc): fix jsons
from global_functions import *
import sys
import json


# The script's single input arrives on standard input -- presumably the path
# of the PDF to process (it is handed to ocr_pdf below); confirm with the caller.
raw_stdin = sys.stdin.read()
# Drop surrounding whitespace such as the trailing newline added by `echo`.
filename = raw_stdin.strip()
# ocr_pdf is provided by global_functions; its result is treated as a string
# of document text by the rest of this script.
full_text = ocr_pdf(filename)


## Extract infos
# DOI
# find_doi and verify_doi are provided by global_functions.
# The candidate ends up in exactly one output column: it moves to "doi" once
# verify_doi accepts it; otherwise it stays in "unverified_doi".
unverified_doi = find_doi(full_text)
doi = ""
# Short-circuit: verify_doi is only called when a candidate was actually found.
if unverified_doi and verify_doi(unverified_doi):
    doi, unverified_doi = unverified_doi, ""

# Article
# get_caracs_between is provided by global_functions; presumably it returns
# the text located between the "TITLE" and "REMARKS" markers -- TODO confirm.
item_editor = get_caracs_between(full_text, "TITLE", "REMARKS")
# Remove leftover tokens (presumably table column headers) and newlines.
# The tokens are applied one after another in this exact order.
# NOTE(review): this is plain substring removal, so a token is also stripped
# when it occurs inside a longer word (e.g. "NET" inside "INTERNET").
for unwanted in ("USD", "PRICE", "DISC", "NET", "VALUE", "UNIT", "TAX", "VAT", "\n"):
    item_editor = item_editor.replace(unwanted, "")
    

# Normalise whitespace in the OCR text before emitting it.
# The patterns are raw strings: "\s" inside a plain string literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning since Python 3.12).
# NOTE(review): `re` is not imported in this file; it is presumably made
# available by `from global_functions import *` -- confirm.
normalized_text = re.sub(r"\s\s+", " ", full_text)
# NOTE(review): after the substitution above no two whitespace characters are
# adjacent any more, so this newline-collapsing pass can never match. It is
# kept to preserve the original statement order; if the intent was to keep
# single line breaks, the two passes need to be reworked.
normalized_text = re.sub(r"\n\n+", "\n", normalized_text)

# Output record, serialised as a single JSON object on stdout (no trailing
# newline).
new_row = {
    # NOTE(review): this takes the SECOND "/"-separated component of the input,
    # not the basename, and raises IndexError when the input contains no "/".
    "nom_complet": filename.split("/")[1],
    "doi": doi,
    "unverified_doi": unverified_doi,
    "item_editor": item_editor,
    "full_text": normalized_text,
}
sys.stdout.write(json.dumps(new_row))