# Repository path: APC-bills / v1 / editorsFunctions / 0001021938.py
# (The "Newer" / "Older" labels that preceded this path were page-navigation residue.)
from global_functions import *
import os
import PyPDF2
import pandas as pd

# Invoice extraction script for the folder identified by `num_editor`.
#
# For every entry found in `path`, the script reads the PDF text, looks for a
# DOI and for the "Author ... Manuscript" section, and stores one row per file.
# All rows are finally written to "<path_res>/<num_editor>.csv".
#
# The helpers `read_all_page`, `find_doi`, `verify_doi` and
# `get_caracs_between` (and the `re` module) are expected to come from
# `from global_functions import *` -- NOTE(review): confirm, they are not
# defined in this file.

num_editor = "0001021938"

# Input folder (one sub-folder per editor number) and output folder.
path = f"./factures-2021-2022-triees/{num_editor}"
path_res = "./results"

# Columns of the resulting CSV, in output order.
csv_columns = ["nom_complet", "doi", "unverified_doi", "item_editor", "full_text"]

# Compiled once instead of on every loop iteration. The first pattern is a
# raw string so that `\s` reaches the regex engine instead of being an
# invalid string escape.
multi_whitespace = re.compile(r"\s\s+")
multi_newline = re.compile("\n\n+")

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop copies it at each step.
rows = []

# Sorted so that the row order of the CSV is reproducible between runs
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(path)):
    file = os.path.join(path, filename)

    # Read the PDF as text. This is deliberately best-effort: an unreadable
    # entry still produces a row, with an empty text. `Exception` (rather
    # than a bare except) lets KeyboardInterrupt / SystemExit stop the script.
    try:
        reader = PyPDF2.PdfReader(file, strict=False)
        full_text = read_all_page(reader)
    except Exception:
        full_text = ""

    ## Extract infos
    # DOI: a candidate that passes `verify_doi` goes to the "doi" column,
    # otherwise it stays in the "unverified_doi" column.
    unverified_doi = find_doi(full_text)
    doi = ""
    if unverified_doi and verify_doi(unverified_doi):
        doi, unverified_doi = unverified_doi, ""

    # Article: the start/end markers depend on whether the text contains
    # "Author/Article:". `get_caracs_between` presumably returns the text
    # found between the two markers -- TODO confirm in global_functions.
    if "Author/Article:" in full_text:
        label = "Author/Article: "
        item_editor = get_caracs_between(full_text, "Author/Article:", "Manuscript")
    else:
        label = "Author"
        item_editor = get_caracs_between(full_text, "Author", "Manuscript ID:")

    item_editor = item_editor.replace("\n", " ")
    # The label is only put back in front when something was extracted.
    if item_editor:
        item_editor = label + item_editor

    # Collapse whitespace in the stored text.
    # NOTE(review): after the first substitution no two whitespace characters
    # are adjacent any more, so the newline substitution has nothing left to
    # match. It is kept so that the output stays identical; confirm whether
    # preserving single newlines was the actual intent.
    cleaned_text = multi_newline.sub("\n", multi_whitespace.sub(" ", full_text))

    rows.append(
        {
            "nom_complet": filename,
            "doi": doi,
            "unverified_doi": unverified_doi,
            "item_editor": item_editor,
            "full_text": cleaned_text,
        }
    )

# Passing `columns` keeps the header even when the input folder is empty.
df_res = pd.DataFrame(rows, columns=csv_columns)

# Make sure the output folder exists so that to_csv does not fail on a fresh
# checkout.
os.makedirs(path_res, exist_ok=True)
df_res.to_csv(
    path_or_buf=f"{path_res}/{num_editor}.csv",
    index=False,
    header=True,
    encoding="utf-8",
    escapechar="\\",
)