"""Extract DOI and article information from the PDFs of one editor folder.

Every entry of ``path`` is opened with PyPDF2 and converted to text. A DOI
candidate and an "Author/Article" description are then searched in that text,
and one row per entry is written to ``<path_res>/<num_editor>.csv``.
"""
# The star import is kept first, as in the original, so that the explicit
# imports below take precedence over any name it re-exports. It is expected to
# provide read_all_page, find_doi, verify_doi and get_caracs_between, since
# nothing else in this script defines them.
from global_functions import *
import os
import re
import sys

import PyPDF2
import pandas as pd

num_editor = "0001021938"
path = f"./factures-2021-2022-triees/{num_editor}"
path_res = "./results"

# Column order of the exported CSV.
_COLUMNS = ["nom_complet", "doi", "unverified_doi", "item_editor", "full_text"]

# Compiled once instead of on every loop iteration. Raw strings avoid the
# invalid "\s" escape warning raised by the original non-raw literal.
_WHITESPACE_RUN = re.compile(r"\s\s+")
_NEWLINE_RUN = re.compile(r"\n\n+")


def _read_pdf_text(pdf_path):
    """Return the text of *pdf_path*, or "" when it cannot be read.

    Reading stays best-effort: entries that are not valid PDFs (or are
    directories) still produce a row, only with an empty text. Unlike the
    original bare ``except``, KeyboardInterrupt/SystemExit are not swallowed
    and the failure is reported on stderr.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_path, strict=False)
        return read_all_page(reader)
    except Exception as exc:
        print(f"Could not read {pdf_path}: {exc}", file=sys.stderr)
        return ""


def _extract_item_editor(full_text):
    """Return the author/article description found in *full_text*, or ""."""
    if "Author/Article:" in full_text:
        start, end, prefix = "Author/Article:", "Manuscript", "Author/Article: "
    else:
        start, end, prefix = "Author", "Manuscript ID:", "Author"
    item = get_caracs_between(full_text, start, end).replace("\n", " ")
    # The label is only put back when something was actually extracted.
    return prefix + item if item else item


def _clean_text(full_text):
    """Collapse whitespace runs to one space, then newline runs to one newline.

    The second substitution is kept for parity with the original; after the
    first one no two consecutive whitespace characters should remain.
    """
    return _NEWLINE_RUN.sub("\n", _WHITESPACE_RUN.sub(" ", full_text))


# Rows are accumulated in a plain list and turned into a DataFrame once, which
# avoids re-copying the whole frame with pd.concat on every file.
res = []
# sorted() makes the row order of the CSV reproducible between runs.
for filename in sorted(os.listdir(path)):
    full_text = _read_pdf_text(os.path.join(path, filename))

    # A DOI that passes verify_doi goes to the "doi" column; otherwise the
    # candidate stays in "unverified_doi".
    unverified_doi = find_doi(full_text)
    doi = ""
    if unverified_doi and verify_doi(unverified_doi):
        doi, unverified_doi = unverified_doi, ""

    res.append(
        {
            "nom_complet": filename,
            "doi": doi,
            "unverified_doi": unverified_doi,
            "item_editor": _extract_item_editor(full_text),
            "full_text": _clean_text(full_text),
        }
    )

df_res = pd.DataFrame(res, columns=_COLUMNS)

# Create the output folder if needed so the export cannot fail after all the
# PDFs have already been processed.
os.makedirs(path_res, exist_ok=True)
df_res.to_csv(
    path_or_buf=f"{path_res}/{num_editor}.csv",
    index=False,
    header=True,
    encoding="utf-8",
    escapechar="\\",
)