from global_functions import *
import os
import PyPDF2
import pandas as pd
# Explicit stdlib imports for names this section uses directly, so it does not
# depend on whatever `from global_functions import *` happens to re-export.
import logging
import re

logger = logging.getLogger(__name__)

# Identifier used both as the input sub-folder name and as the base name of
# the output CSV file.
num_editor = "0001024808"
path = f"./factures-2021-2022-triees/{num_editor}"
path_res = "./results"

# Column order of the output CSV.
columns = ["nom_complet", "doi", "unverified_doi", "item_editor", "full_text"]

# Compiled once instead of on every file.
multi_space_re = re.compile(r"\s\s+")
multi_newline_re = re.compile(r"\n\n+")

# One dict per processed directory entry; turned into a DataFrame once at the
# end instead of growing a DataFrame with pd.concat on every iteration.
res = []

# Sorted so that the row order of the CSV is reproducible between runs
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(path)):
    file = os.path.join(path, filename)

    # Read the PDF as text. This is best-effort: an entry that cannot be
    # parsed still gets a row, with an empty text. Only Exception is caught
    # so that Ctrl-C / SystemExit still stop the script.
    try:
        reader = PyPDF2.PdfReader(file, strict=False)
        full_text = read_all_page(reader)
    except Exception as exc:
        logger.warning("Could not read %s as PDF: %s", file, exc)
        full_text = ""

    ## Extract information
    # DOI: a candidate goes into the "doi" column only when verify_doi accepts
    # it; otherwise it is kept in the "unverified_doi" column.
    candidate_doi = find_doi(full_text)
    if candidate_doi and verify_doi(candidate_doi):
        doi = candidate_doi
        unverified_doi = ""
    else:
        doi = ""
        unverified_doi = candidate_doi

    # Article: text following "Description", up to the first end marker that
    # yields a non-empty result ("Total Units" is tried before "Product ").
    item_editor = ""
    for end_marker in ("Total Units", "Product "):
        description = get_caracs_between(
            full_text, "Description", end_marker
        ).replace("\n", " ")
        if description != "":
            item_editor = "Description: " + description
            break

    # Collapse every run of two or more whitespace characters into one space.
    # NOTE(review): after that substitution no run of consecutive newlines can
    # remain, so the second substitution appears to be a no-op; it is kept to
    # preserve the original processing order.
    cleaned_text = multi_newline_re.sub(
        "\n", multi_space_re.sub(" ", full_text)
    )

    res.append(
        {
            "nom_complet": filename,
            "doi": doi,
            "unverified_doi": unverified_doi,
            "item_editor": item_editor,
            "full_text": cleaned_text,
        }
    )

df_res = pd.DataFrame(res, columns=columns)

# Make sure the output directory exists so the write cannot fail after all
# the files have been processed.
os.makedirs(path_res, exist_ok=True)
df_res.to_csv(
    path_or_buf=f"{path_res}/{num_editor}.csv",
    index=False,
    header=True,
    encoding="utf-8",
    escapechar="\\",
)