"""Helpers to extract text from PDF files, locate DOIs and validate them."""

import os
import re
from urllib.parse import quote

import PyPDF2
from requests_ratelimiter import LimiterSession

# Contact address sent to Crossref with every request (placeholder value).
mail_adress = "[prefix_mail]@[domaine].[fr|com]"

# Shared HTTP session, limited to 5 requests per second.
session = LimiterSession(per_second=5)

# "10." followed by at least four digits, a slash, then non-space characters.
# The trailing \b makes the match end on a word character, which drops
# trailing punctuation such as "." or ")".
_DOI_PATTERN = re.compile(r"\b10\.\d{4,}/\S+\b")

# Upper bound (seconds) on a single Crossref request so a stalled
# connection cannot block the caller forever.
_REQUEST_TIMEOUT_SECONDS = 30


def find_doi(text):
    """Return the first DOI found in *text*.

    Args:
        text (str): text to search.

    Returns:
        str: the first DOI-looking substring, or "" when none is found
        (including when *text* is empty or None).
    """
    if not text:
        return ""
    match = _DOI_PATTERN.search(text)
    return match.group() if match else ""


def verify_doi(doi, mail=mail_adress):
    """Check with the Crossref API whether *doi* exists.

    Args:
        doi (str): DOI to look up.
        mail (str): contact address sent as the ``mailto`` query parameter.

    Returns:
        bool: True when Crossref answers with HTTP 200, False otherwise.
        An empty DOI returns False without any network request.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    if not doi:
        return False
    # Percent-encode the DOI: characters such as "#" or "?" would otherwise
    # be parsed as URL fragment/query delimiters and truncate the path.
    url = f"https://api.crossref.org/works/{quote(doi, safe='/')}/agency"
    response = session.get(
        url,
        params={"mailto": mail},
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )
    return response.status_code == 200


def read_all_page(reader):
    """Return the text of every page of *reader*, each followed by "\\n".

    Args:
        reader: object exposing a ``pages`` sequence whose items provide
            ``extract_text()`` (e.g. ``PyPDF2.PdfReader``).

    Returns:
        str: concatenated page texts; "" when there are no pages.
    """
    # A page without extractable text contributes only its newline.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)


def ocr_pdf(filename):
    """Read a PDF file and return its content as text.

    Args:
        filename: path (or stream) given to ``PyPDF2.PdfReader``.

    Returns:
        str: the extracted text. When reading fails, the text of the raised
        exception is returned instead (best-effort: this function does not
        raise for ordinary errors).
    """
    try:
        reader = PyPDF2.PdfReader(filename, strict=False)
        return read_all_page(reader)
    except Exception as e:
        # Deliberate best-effort: callers receive the error message as text.
        return f"{e}"


def get_caracs_between(chaine, debut, fin):
    """Return the text located between the markers *debut* and *fin*.

    Args:
        chaine (str): text to search in.
        debut (str): marker that precedes the wanted text.
        fin (str): marker that follows the wanted text.

    Returns:
        str: the shortest text between the first *debut* and the next *fin*
        (may span several lines), or "" when no such pair exists.
    """
    motif = re.escape(debut) + "(.*?)" + re.escape(fin)
    resultat = re.search(motif, chaine, re.DOTALL)
    return resultat.group(1) if resultat else ""


def get_caracs_between_multiple(chaine, debuts, fins):
    """Return the text between one of several possible marker pairs.

    Markers are paired by position: ``debuts[i]`` goes with ``fins[i]``;
    surplus items of the longer list are ignored. The pair that matches
    earliest in *chaine* wins.

    Args:
        chaine (str): input text.
        debuts (list): possible beginning markers.
        fins (list): possible ending markers.

    Returns:
        str: the text enclosed by the matching pair, or "" when no pair
        matches or when no pair is supplied.
    """
    motifs = [
        re.escape(debut) + "(.*?)" + re.escape(fin)
        for debut, fin in zip(debuts, fins)
    ]
    if not motifs:
        # An empty alternation would match everywhere and has no group.
        return ""

    resultat = re.search("|".join(motifs), chaine, re.DOTALL)
    if resultat is None:
        return ""

    # Each alternative has its own capture group; only the group of the
    # alternative that matched is not None, and it is not always group 1.
    return next(
        (groupe for groupe in resultat.groups() if groupe is not None),
        "",
    )