# APC-bills / v1 / editorsFunctions / global_functions.py
import re
from requests_ratelimiter import LimiterSession
import os
import PyPDF2

# Contact e-mail used as the default `mail` argument of verify_doi, which
# sends it to the Crossref API as the `mailto` query parameter.
# NOTE(review): the value looks like a placeholder pattern, not a real
# address - confirm it is replaced before running against the API.
mail_adress = "[prefix_mail]@[domaine].[fr|com]"
# Shared HTTP session used for the Crossref calls, created with a limit of
# 5 requests per second.
session = LimiterSession(per_second=5)

def find_doi(text):
    """
    Return the first DOI found in a text, or "" when there is none.

    A DOI is recognised as the literal prefix "10." followed by at least
    four digits, a slash and a run of non-whitespace characters. The match
    always ends on a word character, so trailing punctuation such as "."
    or "," right after the DOI is not included.

    Args:
        text (str): text to search in

    Returns:
        str: the first DOI found, or "" if the text contains none
    """
    # The dot after "10" is escaped on purpose: left unescaped it matches any
    # character, so a plain number such as "1051234/abc" was reported as a DOI.
    doi_regex = r'\b10\.\d{4,}/[^\s]+\b'
    match = re.search(doi_regex, text)
    if match is None:
        return ""
    return match.group()
    
def verify_doi(doi, mail=mail_adress):
    """
    Check with the Crossref API whether a DOI exists.

    The request goes to the "agency" endpoint of the Crossref works API
    through the module-level rate-limited session. Network errors raised by
    the session are not caught here and propagate to the caller.

    Args:
        doi (str): DOI to check
        mail (str): contact address sent as the `mailto` query parameter

    Returns:
        bool: True if the API answers with HTTP status 200, False otherwise
              (an empty DOI returns False without any request being sent)
    """
    from urllib.parse import quote

    # Nothing to look up: skip the network round trip.
    if not doi:
        return False

    # DOIs may contain characters that are special in a URL ("#", "?", "<",
    # ">", ...). Percent-encode them so the whole DOI stays in the path;
    # "/" is kept as is because it separates the DOI prefix from its suffix.
    url = f"https://api.crossref.org/works/{quote(doi, safe='/')}/agency"

    # Passing the address through `params` lets the session encode it properly.
    response = session.get(url, params={"mailto": mail})
    return response.status_code == 200


def read_all_page(reader):
    """
    Return the extracted text of every page of a reader as one string,
    each page being followed by a newline.
    """
    # Extract each page in order and terminate it with a line break.
    return "".join(page.extract_text() + "\n" for page in reader.pages)


def ocr_pdf(filename):
    """
    Return the text of a PDF file.

    The file is opened with PyPDF2 in non-strict mode and the text of all
    its pages is collected with read_all_page.

    If anything goes wrong while opening or reading the file, no exception
    is raised: the message of the exception is returned in place of the text.
    """
    try:
        pdf = PyPDF2.PdfReader(filename, strict=False)
        return read_all_page(pdf)
    except Exception as error:
        # Best effort: hand the error message back instead of failing.
        return str(error)


def get_caracs_between(chaine, debut, fin):
    """Return the text located between two delimiters.

    The first occurrence of "debut" is used, and the text stops at the
    closest following occurrence of "fin". The text may span several lines.

    Args:
        chaine (str): text to search in
        debut (str): delimiter that precedes the wanted text
        fin (str): delimiter that follows the wanted text

    Returns:
        str: the text between the delimiters, or "" if they are not found
    """
    # Delimiters are escaped so that they are matched literally.
    pattern = re.compile(re.escape(debut) + "(.*?)" + re.escape(fin), re.DOTALL)
    found = pattern.search(chaine)
    return found.group(1) if found else ""
    
import re

def get_caracs_between_multiple(chaine, debuts, fins):
    """Return the text between the first matching (begin, end) delimiter pair.

    "debuts" and "fins" are paired position by position: debuts[i] goes with
    fins[i] (extra items of the longer list are ignored). The leftmost match
    in the text wins; when several pairs match at the same position, the
    earliest pair in the lists is used. The text may span several lines.

    Args:
        chaine (str): input text
        debuts (list): possible beginning delimiters
        fins (list): possible ending delimiters, paired with "debuts"

    Returns:
        str: the text between the delimiters of the matching pair, or "" if
             no pair matches or if no pair was given
    """
    # One alternative per pair; delimiters are escaped to be matched literally.
    motifs = [
        re.escape(debut) + "(.*?)" + re.escape(fin)
        for debut, fin in zip(debuts, fins)
    ]

    # Without any pair the joined pattern would be empty: it would match
    # anywhere and expose no capture group at all.
    if not motifs:
        return ""

    resultat = re.search("|".join(motifs), chaine, re.DOTALL)
    if resultat is None:
        return ""

    # Every alternative owns its own capture group and only the group of the
    # alternative that matched is set, so group(1) is None as soon as the
    # second (or a later) pair matches. lastindex points to the group that
    # actually took part in the match.
    return resultat.group(resultat.lastindex)