import re
from requests_ratelimiter import LimiterSession
import os
import PyPDF2
# Contact e-mail that verify_doi sends to the Crossref API as the "mailto"
# query parameter. The bracketed value looks like a template to be filled in
# with a real address -- TODO confirm with the module owner.
mail_adress = "[prefix_mail]@[domaine].[fr|com]"
# Shared HTTP session used by verify_doi for its Crossref requests.
# Created with per_second=5 (presumably caps outgoing requests at five per
# second -- confirm against the requests_ratelimiter documentation).
session = LimiterSession(per_second=5)
def find_doi(text):
    """Return the first DOI found in ``text``.

    A DOI is recognised as ``10.`` followed by at least four digits, a slash
    and a run of non-whitespace characters. The match is trimmed back to the
    last word character, so trailing punctuation such as a final ``.`` or
    ``)`` is not part of the result.

    Args:
        text (str): Text to scan.

    Returns:
        str: The first DOI-looking substring, or "" when ``text`` is empty,
        None, or contains no DOI.
    """
    if not text:
        return ""
    # The dot after "10" is escaped: unescaped it matches any character, so
    # strings such as "1011234/abc" were wrongly reported as DOIs.
    doi_regex = r'\b10\.\d{4,}/[^\s]+\b'
    match = re.search(doi_regex, text)
    if match is None:
        return ""
    return match.group()
def verify_doi(doi, mail=mail_adress):
    """Check with the Crossref API whether a DOI is known.

    Sends a GET request to the ``/works/{doi}/agency`` endpoint through the
    module-level ``session``.

    Args:
        doi (str): DOI to look up.
        mail (str): Contact address sent as the ``mailto`` query parameter.
            Defaults to the module-level ``mail_adress``.

    Returns:
        bool: True when the API answers with HTTP status 200, False for any
        other status code.

    Note:
        Exceptions raised by ``session.get`` (connection failure, timeout)
        are not caught here and propagate to the caller.
    """
    from urllib.parse import quote

    # Percent-encode the DOI before putting it in the URL path: legacy DOIs
    # may contain "#", "?", "<" or ">", which would otherwise be read as
    # URL delimiters and truncate the path. "/" is kept as-is.
    encoded_doi = quote(doi, safe="/")
    url = f"https://api.crossref.org/works/{encoded_doi}/agency"
    # Let the HTTP library encode the query string, and bound the wait so a
    # stalled connection cannot block the caller forever.
    response = session.get(url, params={"mailto": mail}, timeout=10)
    return response.status_code == 200
def read_all_page(reader):
    """Return the text of every page of ``reader`` as one string.

    Each page is converted with its ``extract_text()`` method and followed by
    a newline, so the result ends with "\\n" whenever there is at least one
    page. A reader without pages yields "".
    """
    # One "<page text>\n" chunk per page, glued together in page order.
    return "".join(page.extract_text() + "\n" for page in reader.pages)
def ocr_pdf(filename):
    """Return the text content of the PDF ``filename``.

    The file is opened with ``PyPDF2.PdfReader`` in non-strict mode and its
    pages are converted to text by ``read_all_page``.

    Returns:
        str: The concatenated page text. If opening or reading the PDF raises
        any exception, the exception message is returned in place of the
        text, so this function never raises for a bad file.
    """
    try:
        pdf = PyPDF2.PdfReader(filename, strict=False)
        return read_all_page(pdf)
    except Exception as err:
        # Best-effort: the error message stands in for the document text.
        return f"{err}"
def get_caracs_between(chaine, debut, fin):
    """Return the text located between two delimiters.

    The search is non-greedy (shortest span after the first ``debut``) and
    may cross line breaks.

    Args:
        chaine (str): Text to search in.
        debut (str): Literal delimiter that precedes the wanted text.
        fin (str): Literal delimiter that follows the wanted text.

    Returns:
        str: The text between ``debut`` and ``fin``, or "" when the pair is
        not found.
    """
    # Both delimiters are escaped so they are matched literally.
    pattern = "".join((re.escape(debut), "(.*?)", re.escape(fin)))
    found = re.search(pattern, chaine, flags=re.DOTALL)
    return found.group(1) if found else ""
import re
def get_caracs_between_multiple(chaine, debuts, fins):
    """Return the text between the first matching (begin, end) delimiter pair.

    ``debuts`` and ``fins`` are paired position by position
    (``debuts[0]`` with ``fins[0]``, ``debuts[1]`` with ``fins[1]``, ...);
    extra items in the longer list are ignored. The pair whose match starts
    earliest in ``chaine`` wins. The search is non-greedy and may cross line
    breaks.

    Args:
        chaine (str): Text to search in.
        debuts (list[str]): Candidate literal begin delimiters.
        fins (list[str]): Candidate literal end delimiters, aligned with
            ``debuts``.

    Returns:
        str: The text between the matched delimiters, or "" when no pair
        matches or when no pair was supplied.
    """
    motifs = [
        re.escape(debut) + "(.*?)" + re.escape(fin)
        for debut, fin in zip(debuts, fins)
    ]
    # Without any pair the joined pattern would be empty and match anywhere,
    # with no capture group to read from.
    if not motifs:
        return ""
    resultat = re.search("|".join(motifs), chaine, re.DOTALL)
    if resultat is None:
        return ""
    # Each alternative has its own capture group and only the one that
    # matched is populated; group(1) is None unless the first pair matched.
    # lastindex identifies the group that actually took part in the match.
    return resultat.group(resultat.lastindex)