"""Scan the PDFs in ``path`` for DOIs and write one CSV row per file.

For every entry of ``path`` the text is extracted, searched for a DOI with
``find_doi`` and, when one is found, checked with ``verify_doi``. The rows are
written to ``<path_res>/autres-sorted.csv`` with the columns ``nom_complet``,
``doi``, ``unverified_doi`` and ``full_text``.

``read_all_page``, ``find_doi`` and ``verify_doi`` are not defined in this
script; they are expected to be provided by the ``global_functions`` star
import.
"""
# Kept first, as in the original, so that the explicit imports below take
# precedence over any same-named symbol the star import might bring in.
from global_functions import *

import logging
import os
import re

import pandas as pd
import PyPDF2

logger = logging.getLogger(__name__)

path = './factures-2021-2022-triees/autres'
path_res = './results'

# One dict per processed file; turned into a DataFrame once, after the loop,
# instead of growing a DataFrame with pd.concat on every iteration.
res = []

# Sorted so the row order of the CSV does not depend on the file system.
for filename in sorted(os.listdir(path)):
    file = os.path.join(path, filename)

    # Read PDF as text. Extraction is best effort: an unreadable entry still
    # gets a row, with an empty text, but the failure is reported.
    try:
        reader = PyPDF2.PdfReader(file, strict=False)
        full_text = read_all_page(reader)
    except Exception as exc:  # deliberately broad, but not a bare except
        logger.warning("Could not read %s (%s); using empty text", file, exc)
        full_text = ""

    # Extract infos
    unverified_doi = find_doi(full_text)
    doi = ""
    # Only when a DOI was found and verify_doi accepts it: report it in the
    # 'doi' column instead of the 'unverified_doi' column.
    if unverified_doi and verify_doi(unverified_doi):
        doi = unverified_doi
        unverified_doi = ""

    res.append({
        'nom_complet': filename,
        'doi': doi,
        'unverified_doi': unverified_doi,
        # Collapse every run of 2+ whitespace characters into one space.
        # NOTE(review): after that step no two consecutive newlines can
        # remain, so the outer substitution looks like a no-op; it is kept
        # to leave the output unchanged -- confirm the intended order.
        'full_text': re.sub("\n\n+", "\n", re.sub(r"\s\s+", " ", full_text)),
    })

df_res = pd.DataFrame(
    res, columns=["nom_complet", "doi", "unverified_doi", "full_text"]
)

# Make sure the output directory exists so the work done above is not lost
# to a missing-directory error at write time.
os.makedirs(path_res, exist_ok=True)
df_res.to_csv(
    path_or_buf="%s/autres-sorted.csv" % path_res,
    index=False,
    header=True,
    encoding='utf-8',
    escapechar='\\',
)