# Source: APC-bills / editors-processing / 0001079074.py
# History: initial commit by @Leo-gail on 10 Jan (1 KB).
from global_functions import *
import os
import PyPDF2
import pandas as pd

# Extract invoice metadata from every PDF invoice of one publisher and write
# the result to "<path_res>/<num_editor>.csv", one row per file found in the
# publisher's invoice folder.
#
# NOTE(review): read_all_page, find_doi, verify_doi and get_caracs_between are
# not defined in this file; presumably they come from the wildcard import of
# global_functions -- confirm their exact contracts there.

num_editor = "0001079074"
path = f"./factures-2021-2022-triees/{num_editor}"
path_res = "./results"

# Column order of the output CSV.
COLUMNS = ["nom_complet", "doi", "unverified_doi", "item_editor", "full_text"]

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end: concatenating onto a DataFrame inside the loop copies it every time.
res = []

# os.listdir returns entries in arbitrary order; sorting keeps the CSV row
# order reproducible from one run to the next.
for filename in sorted(os.listdir(path)):
    file = os.path.join(path, filename)

    # Read the PDF as text. This is best effort: an unreadable file still gets
    # a row (with empty text) so that it remains visible in the output.
    try:
        reader = PyPDF2.PdfReader(file, strict=False)
        full_text = read_all_page(reader)
    except Exception as exc:
        # Report the failure instead of hiding it; KeyboardInterrupt and
        # SystemExit are deliberately not caught here.
        print(f"Warning: could not read {file}: {exc}")
        full_text = ""

    # DOI: a candidate found in the text goes to the "doi" column when
    # verify_doi accepts it, otherwise it stays in "unverified_doi".
    unverified_doi = find_doi(full_text)
    doi = ""
    if unverified_doi and verify_doi(unverified_doi):
        doi = unverified_doi
        unverified_doi = ""

    # Publisher-specific field: the text found between "Article:" and "Tax",
    # flattened onto a single line and re-prefixed when something was found.
    item_editor = get_caracs_between(full_text, "Article:", "Tax").replace("\n", " ")
    if item_editor != "":
        item_editor = "Article: " + item_editor

    # Add any extra publisher-specific fields to this row (and to COLUMNS).
    res.append(
        {
            "nom_complet": filename,
            "doi": doi,
            "unverified_doi": unverified_doi,
            "item_editor": item_editor,
            "full_text": full_text,
        }
    )

df_res = pd.DataFrame(res, columns=COLUMNS)

# Make sure the output folder exists before writing into it.
os.makedirs(path_res, exist_ok=True)
df_res.to_csv(
    path_or_buf=f"{path_res}/{num_editor}.csv",
    index=False,
    header=True,
    encoding="utf-8",
    escapechar="\\",
)