diff --git a/README.md b/README.md index b484275..e84d2fc 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,7 @@ ## Utilisation Toutes les factures sont initialement mélangées dans le même dossier. -0. (facultatif) metric-data permet d'afficher la répartition des différents codes éditeurs pour cibler les plus importants à traiter. -1. Lancer le code tri factures en modifiant les paramètres `path_bills`, qui correspond au nom du dossier d'entrée et éventuellement `path_bills_output`, correspondant au nom du dossier de sortie. -2. Renseigner son adresse mail comme chaîne de caractère en modifiant le paramètre `mail_adress` dans le fichier __editors-processing/global_function.py__ -2. Lancer un par un les programmes dans editors processing : ils traitent les éditeurs un par un en utilisant de la recherhce de chaîne de caractère. A l'exception de `global_functions.py`qui contient des fonctions utilisées dans chacun des programmes. -3. Les fichiers sont à présents traités dans un dossier __results__. Dans chaque programme, les paramètres `path` et `path_res` doivent être modifiés en conséquent (_path_ : dossier contenant les factures triées + num éditeur) (_path_res_ : results par défaut) -4. On aura en résultat autant de csv que d'éditeurs traités. Les non traités seront dans le fichier "autres". +1. Renseigner son adresse mail comme chaîne de caractères en modifiant le paramètre `mail_adress` dans le fichier __v1/editorsFunctions/global_functions.py__ +2. Mettre toutes les factures dans un dossier. Renseigner ce nom de dossier à la place de `path_bills` dans les programmes __v1/apc.py__ et __metric-input.py__, programme facultatif permettant d'afficher la répartition des différents codes éditeurs pour cibler les plus importants à traiter. +3. Lancer le script __v1/apc.py__. Les résultats sont dans le csv res_facture.csv (séparateur `,`, guillemets `"` doublés à l'intérieur des champs, pas de caractère d'échappement) +4. 
On peut lancer __metric-output.py__ pour obtenir les ratios de factures correctement traitées par éditeur (permet de repérer si un éditeur change d'année en année) diff --git a/metric-data.py b/metric-data.py deleted file mode 100644 index 4337425..0000000 --- a/metric-data.py +++ /dev/null @@ -1,18 +0,0 @@ -from pathlib import Path - -path_bills = "factures-2021-2022" - -#fonction qui classe les PDFs par editeur -def metrics_editors(path): - dic_stats = {} - files = Path('./%s/' %path).glob('*') - for file in files: - code_editor=str(file).split("_")[1] - if code_editor not in dic_stats: - dic_stats[code_editor]=1 - else : - dic_stats[code_editor]+=1 - return dict(sorted(dic_stats.items(), key=lambda item:item[1])) - - -print(metrics_editors(path=path_bills)) \ No newline at end of file diff --git a/metric-input.py b/metric-input.py new file mode 100644 index 0000000..4fdfa59 --- /dev/null +++ b/metric-input.py @@ -0,0 +1,18 @@ +from pathlib import Path + +path_bills = "factures-2019-2023" + +#fonction qui classe les PDFs par editeur +def metrics_editors(path): + dic_stats = {} + files = Path('./%s/' %path).glob('*') + for file in files: + code_editor=str(file).split("_")[1] + if code_editor not in dic_stats: + dic_stats[code_editor]=1 + else : + dic_stats[code_editor]+=1 + return dict(sorted(dic_stats.items(), key=lambda item:item[1])) + + +print(metrics_editors(path=path_bills)) \ No newline at end of file diff --git a/metric-output.py b/metric-output.py new file mode 100644 index 0000000..3996d58 --- /dev/null +++ b/metric-output.py @@ -0,0 +1,30 @@ +import pandas as pd +import json + +df_res = pd.read_csv('res_facture.csv') + +with open('./v1/dictionaries/code_fournisseur2fournisseur.json', 'r') as f_in: + id_editor2editor = json.load(f_in) + +# Fonction pour calculer le ratio des champs "item_editor" non vides (1 - proportion de champs vides) +def calculate_empty_item_editor_ratio(df): + total_items = len(df) + empty_item_editors = df['item_editor'].isna().sum() + if total_items > 0: + ratio = 
1 - empty_item_editors / total_items + else: + ratio = 0 + return ratio + +results = {} + +for code_editor in id_editor2editor.keys(): + # Extraire le code éditeur de la colonne "nom_complet" pour filtrer les lignes + df_res['extracted_code'] = df_res['nom_complet'].apply(lambda x: x.split('_')[1] if len(x.split('_')) > 1 else None) + + # Filtrer le DataFrame pour les lignes où le code extrait est égal au code de l'éditeur + df_editor = df_res[df_res['extracted_code'] == code_editor] + ratio = calculate_empty_item_editor_ratio(df_editor) + results[code_editor] = ratio + +print(json.dumps(results)) \ No newline at end of file diff --git a/tri-factures.py b/tri-factures.py deleted file mode 100644 index 894ad64..0000000 --- a/tri-factures.py +++ /dev/null @@ -1,35 +0,0 @@ -import shutil -from pathlib import Path -import os -import json - -path_bills = "factures-2021-2022" -path_bills_output = "factures-2021-2022-triees" -""" -Notice d'utilisation -ATTENTION : Les dossier d'entrée et de sortie ne doivent PAS CONTENIR de "_" : c'est réservé pour les noms de fichier pdf des factures. - -Ce programme permet de trier un dossier de facture par code fournisseur -Si le code fournisseur n'est pas dans la liste id_editor2editor, ne peut être traité de manière automatique. -""" -#address path to datas (pdf only) -with open('./dictionaries/code_fournisseur2fournisseur.json','r') as f_in: - id_editor2editor = json.load(f_in) - -#fonction utilitaire pour classEditors qui copie le fichier PDF dans un repository dans son code editeur. 
-def copyItIn(file,repository,global_path='./%s' %path_bills_output): - if not os.path.exists('%s/%s' % (global_path,repository) ): - os.makedirs('%s/%s' % (global_path,repository) ) - shutil.copy(file, '%s/%s' % (global_path,repository) ) - - -# classe les PDFs par editeur - -files = Path('./%s/' %path_bills).glob('*') -for file in files: - code_editor=str(file).split("_")[1] - if code_editor in id_editor2editor.keys(): - copyItIn(file,code_editor) - else: - copyItIn(file,'autres') - diff --git a/v1/apc.py b/v1/apc.py new file mode 100644 index 0000000..ed510e5 --- /dev/null +++ b/v1/apc.py @@ -0,0 +1,43 @@ +import json +from pathlib import Path +import subprocess +import pandas as pd +from editorsFunctions.global_functions import * + + +path_bills = "factures-2019-2023" + +with open('./v1/dictionaries/code_fournisseur2fournisseur.json', 'r') as f_in: + id_editor2editor = json.load(f_in) + + +def execute_script(script_name, file_name): + result = subprocess.run(['python3', script_name], input=file_name, capture_output=True, text=True) + if result.returncode == 0: + try: + data = json.loads(result.stdout) + return data + except json.JSONDecodeError as e: + return None + else: + return None + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +files = Path('./%s/' % path_bills).glob('*') +for file in files: + code_editor = str(file).split("_")[1] + if code_editor in id_editor2editor.keys(): + script_name = f"v1/editorsFunctions/{code_editor}.py" + data = execute_script(script_name, str(file)) + if data is not None: + df = pd.DataFrame([data]) + df_res = pd.concat([df_res, df], ignore_index=True) + else: + script_name = f"v1/editorsFunctions/autres.py" + data = execute_script(script_name, str(file)) + if data is not None: + df = pd.DataFrame([data]) + df_res = pd.concat([df_res, df], ignore_index=True) + +df_res.to_csv('res_facture.csv', index=False) diff --git a/v1/dictionaries/code_fournisseur2fournisseur.json 
b/v1/dictionaries/code_fournisseur2fournisseur.json new file mode 100644 index 0000000..f6f751f --- /dev/null +++ b/v1/dictionaries/code_fournisseur2fournisseur.json @@ -0,0 +1,29 @@ +{ + "0001021938": 1, + "0001197861": 2, + "0001200930": 3, + "0001079074": 4, + "0001021664": 5, + "0001024808": 6, + "0001097617": 7, + "0001091455": 8, + "0001217181": 9, + "0001090050": 10, + "0001157725": 11, + "0001062310": 12, + "0001021888": 13, + "0001222959": 14, + "0001234181": 15, + "0001226137": 16, + "0001021626": 17, + "0001247066": 18, + "0001021875": 19, + "0001167626": 20, + "0001271047": 21, + "0001024308": 22, + "0001021793": 23, + "0001189058": 24, + "0001024313": 25, + "0001024524": 26, + "0001129981": 27 +} diff --git a/v1/editorsFunctions/0001021626.py b/v1/editorsFunctions/0001021626.py index 20470e8..68ceb02 100644 --- a/v1/editorsFunctions/0001021626.py +++ b/v1/editorsFunctions/0001021626.py @@ -1,45 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021626" - -path = './factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Order Reference","VAT amount") - if item_editor != "": - item_editor = "Order Reference" + item_editor - +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(full_text,"Order Reference","VAT amount") +if item_editor != "": + item_editor = "Order Reference" + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001021664.py b/v1/editorsFunctions/0001021664.py index 2ed621b..857ef10 100644 --- a/v1/editorsFunctions/0001021664.py +++ b/v1/editorsFunctions/0001021664.py @@ -1,50 +1,34 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021664" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = 
read_all_page(reader) - except: - full_text= "" +full_text = ocr_pdf(filename) - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ").replace("Total Due","") +# Article +item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ").replace("Total Due","") +if item_editor != "": + item_editor = "Product ID + Description: " + item_editor + +else: + item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ").replace("Total Due","") if item_editor != "": item_editor = "Product ID + Description: " + item_editor - - else: - item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ").replace("Total Due","") - if item_editor != "": - item_editor = "Product ID + Description: " + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file 
diff --git a/v1/editorsFunctions/0001021793.py b/v1/editorsFunctions/0001021793.py index fc697e6..b40fde8 100644 --- a/v1/editorsFunctions/0001021793.py +++ b/v1/editorsFunctions/0001021793.py @@ -1,44 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021793" - -path = './factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") - if item_editor != "": - item_editor = "Item: " + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") +if item_editor != "": + item_editor = "Item: " + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001021875.py b/v1/editorsFunctions/0001021875.py index 17c5cf1..63ff218 100644 --- a/v1/editorsFunctions/0001021875.py +++ b/v1/editorsFunctions/0001021875.py @@ -1,45 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021875" - -path = './factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Article","VAT") - if item_editor != "": - item_editor = "Article" + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Article","VAT") +if item_editor != "": + item_editor = "Article" + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001021888.py b/v1/editorsFunctions/0001021888.py index 5552818..d77c3b3 100644 --- a/v1/editorsFunctions/0001021888.py +++ b/v1/editorsFunctions/0001021888.py @@ -1,44 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021888" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - 
except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Description","Payment terms").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(full_text,"Description","Payment terms").replace("\n"," ") +if item_editor != "": + item_editor = "Title: " + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001021938.py b/v1/editorsFunctions/0001021938.py index 581d269..f3b11b1 100644 --- a/v1/editorsFunctions/0001021938.py +++ b/v1/editorsFunctions/0001021938.py @@ -1,52 +1,36 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021938" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = 
pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" + +# Article +if "Author/Article:" in full_text: + item_editor = get_caracs_between(full_text,"Author/Article:","Manuscript").replace("\n"," ") - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" + if item_editor != "": + item_editor = "Author/Article: " + item_editor - # Article - if "Author/Article:" in full_text: - item_editor = get_caracs_between(full_text,"Author/Article:","Manuscript").replace("\n"," ") - - if item_editor != "": - item_editor = "Author/Article: " + item_editor +else: + item_editor = get_caracs_between(full_text,"Author","Manuscript ID:").replace("\n"," ") + + if item_editor != "": + item_editor = "Author" + item_editor - else: - item_editor = get_caracs_between(full_text,"Author","Manuscript ID:").replace("\n"," ") - - if item_editor != "": - item_editor = "Author" + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001024308.py b/v1/editorsFunctions/0001024308.py index 204126b..3486e6b 100644 --- a/v1/editorsFunctions/0001024308.py +++ b/v1/editorsFunctions/0001024308.py @@ -1,45 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001024308" - -path = './factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = 
PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"PUBLICATION CHARGES","PAGE CHARGES").replace("\n","") - if item_editor != "": - item_editor = "PUBLICATION CHARGES" + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"PUBLICATION CHARGES","PAGE CHARGES").replace("\n","") +if item_editor != "": + item_editor = "PUBLICATION CHARGES" + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001024313.py b/v1/editorsFunctions/0001024313.py index bad9076..902dc65 100644 --- a/v1/editorsFunctions/0001024313.py +++ b/v1/editorsFunctions/0001024313.py @@ -1,45 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001024313" - -path = 
'./factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text)),"Title:","Journal:") - if item_editor != "": - item_editor = "Title: " + item_editor - +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text)),"Title:","Journal:") +if item_editor != "": + item_editor = "Title: " + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001024524.py b/v1/editorsFunctions/0001024524.py index 68d6caa..fbeea7a 100644 --- a/v1/editorsFunctions/0001024524.py +++ b/v1/editorsFunctions/0001024524.py @@ -1,45 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001024524" - -path = './factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Article:","Corresponding Author:") - if item_editor != "": - item_editor = "Title: " + item_editor +# Article +item_editor = get_caracs_between(full_text,"Article:","Corresponding Author:") +if item_editor != "": + item_editor = "Title: " + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001024808.py b/v1/editorsFunctions/0001024808.py index 190abea..e524593 100644 --- a/v1/editorsFunctions/0001024808.py +++ b/v1/editorsFunctions/0001024808.py @@ -1,50 +1,33 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001024808" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - 
full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ") +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ") +if item_editor != "": + item_editor = "Description: " + item_editor + +else: + item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ") if item_editor != "": item_editor = "Description: " + item_editor - - else: - item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ") - if item_editor != "": - item_editor = "Description: " + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001062310.py b/v1/editorsFunctions/0001062310.py index 97e7c53..e17c34e 100644 --- 
a/v1/editorsFunctions/0001062310.py +++ b/v1/editorsFunctions/0001062310.py @@ -1,43 +1,26 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001062310" - -path = './factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Total","USD").replace("\n"," ").replace("Article Processing", "|| Article Processing") +# Article +item_editor = get_caracs_between(full_text,"Total","USD").replace("\n"," ").replace("Article Processing", "|| Article Processing") - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001079074.py b/v1/editorsFunctions/0001079074.py index dcfd9a5..685edbc 100644 --- a/v1/editorsFunctions/0001079074.py +++ b/v1/editorsFunctions/0001079074.py @@ -1,42 +1,29 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd +import sys +import json -num_editor = "0001079074" -path = './factures-2021-2022-triees/%s' %num_editor -path_res='./results' -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + - #Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" +#Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - # propre à l'éditeur - item_editor = get_caracs_between(full_text,"Article:","Tax").replace("\n"," ") - if item_editor != "": - item_editor = "Article: " + item_editor +# propre à l'éditeur +item_editor = get_caracs_between(full_text,"Article:","Tax").replace("\n"," ") +if item_editor != "": + item_editor = "Article: " + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,'item_editor':item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} #HERE ADD - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001090050.py b/v1/editorsFunctions/0001090050.py index 37c8a2b..88edaf8 100644 --- a/v1/editorsFunctions/0001090050.py +++ b/v1/editorsFunctions/0001090050.py @@ -1,44 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001090050" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = 
PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Manuscript","\n").replace("\n"," ") - if item_editor != "": - item_editor = "Manuscript: " + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(full_text,"Manuscript","\n").replace("\n"," ") +if item_editor != "": + item_editor = "Manuscript: " + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001091455.py b/v1/editorsFunctions/0001091455.py index c4303a7..b2a4a61 100644 --- a/v1/editorsFunctions/0001091455.py +++ b/v1/editorsFunctions/0001091455.py @@ -1,44 +1,27 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001091455" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' 
+import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") - if item_editor != "": - item_editor = "Item: " + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") +if item_editor != "": + item_editor = "Item: " + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001097617.py b/v1/editorsFunctions/0001097617.py index adf9d34..42886d1 100644 --- a/v1/editorsFunctions/0001097617.py +++ b/v1/editorsFunctions/0001097617.py @@ -1,44 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001097617" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"","").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(full_text,"","").replace("\n"," ") +if item_editor != "": + item_editor = "Title: " + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001129981.py b/v1/editorsFunctions/0001129981.py index 447a490..e116f53 100644 --- a/v1/editorsFunctions/0001129981.py +++ b/v1/editorsFunctions/0001129981.py @@ -1,43 +1,26 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001129981" - -path = './factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - 
unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Total Due","Check").replace("\n"," ") +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') +# Article +item_editor = get_caracs_between(full_text,"Total Due","Check").replace("\n"," ") + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001157725.py b/v1/editorsFunctions/0001157725.py index 33d3418..0571b1b 100644 --- a/v1/editorsFunctions/0001157725.py +++ b/v1/editorsFunctions/0001157725.py @@ -1,44 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001157725" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res 
= [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Journal Name","DOI").replace("\n"," ") - if item_editor != "": - item_editor = "Description: " + item_editor +# Article +item_editor = get_caracs_between(full_text,"Journal Name","DOI").replace("\n"," ") +if item_editor != "": + item_editor = "Description: " + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001167626.py b/v1/editorsFunctions/0001167626.py index d85bbb0..0fd8ace 100644 --- a/v1/editorsFunctions/0001167626.py +++ b/v1/editorsFunctions/0001167626.py @@ -1,44 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import 
pandas as pd - -num_editor = "0001167626" - -path = './factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Article","TOTAL VALUE") - if item_editor != "": - item_editor = "Article" + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(full_text,"Article","TOTAL VALUE") +if item_editor != "": + item_editor = "Article" + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001189058.py b/v1/editorsFunctions/0001189058.py index ffb8927..e3c65e6 100644 --- a/v1/editorsFunctions/0001189058.py +++ b/v1/editorsFunctions/0001189058.py @@ -1,45 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001189058" - -path = './factures-2019-2023-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"TITLE","REMARKS").replace( - "USD","").replace("PRICE","").replace("DISC","").replace("NET","").replace( - "VALUE","").replace("UNIT","").replace("TAX","").replace("VAT","").replace("\n","") +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"TITLE","REMARKS").replace( + "USD","").replace("PRICE","").replace("DISC","").replace("NET","").replace( + "VALUE","").replace("UNIT","").replace("TAX","").replace("VAT","").replace("\n","") - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001197861.py b/v1/editorsFunctions/0001197861.py index 821172a..835a794 100644 --- a/v1/editorsFunctions/0001197861.py +++ b/v1/editorsFunctions/0001197861.py @@ -1,44 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001197861" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() 
+full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Description","Net Value") - if item_editor != "": - item_editor = "Description: " + item_editor +# Article +item_editor = get_caracs_between(full_text,"Description","Net Value") +if item_editor != "": + item_editor = "Description: " + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001200930.py b/v1/editorsFunctions/0001200930.py index 431af63..65f9542 100644 --- a/v1/editorsFunctions/0001200930.py +++ b/v1/editorsFunctions/0001200930.py @@ -1,48 +1,29 @@ from global_functions import * -import os -import PyPDF2 
-import pandas as pd - -num_editor = "0001200930" - -# path = './test/%s' %num_editor -path = './factures-2021-2022-triees/%s' %num_editor - -# path_res='./test-results' -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) - # Article - item_editor = get_caracs_between(full_text,'"','"').replace("\n"," ") - if item_editor != "": - item_editor = "Article Title: " + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + +# Article +item_editor = get_caracs_between(full_text,'"','"').replace("\n"," ") +if item_editor != "": + item_editor = "Article Title: " + item_editor - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001217181.py b/v1/editorsFunctions/0001217181.py index 595050c..16366ec 100644 --- a/v1/editorsFunctions/0001217181.py +++ b/v1/editorsFunctions/0001217181.py @@ -1,42 +1,29 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001217181" -path = './factures-2021-2022-triees/%s' %num_editor -path_res='./results' -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" +import sys +import json - # Article - item_editor = get_caracs_between(full_text,"Title","published in").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') + +# Article +item_editor = get_caracs_between(full_text,"Title","published in").replace("\n"," ") +if item_editor != "": + item_editor = "Title: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001222959.py b/v1/editorsFunctions/0001222959.py index 4d235b4..9062198 100644 --- a/v1/editorsFunctions/0001222959.py +++ b/v1/editorsFunctions/0001222959.py @@ -1,44 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001222959" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = 
os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Manuscript ID","Net Value").replace("\n"," ") - if item_editor != "": - item_editor = "Manuscript ID: " + item_editor +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(full_text,"Manuscript ID","Net Value").replace("\n"," ") +if item_editor != "": + item_editor = "Manuscript ID: " + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001226137.py b/v1/editorsFunctions/0001226137.py index d477d57..bc90124 100644 --- a/v1/editorsFunctions/0001226137.py +++ b/v1/editorsFunctions/0001226137.py @@ -1,47 +1,31 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = 
"0001226137" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. - doi=unverified_doi - unverified_doi="" - # Article - item_editor = get_caracs_between(full_text,"Title","Author").replace("\n"," ") - if item_editor != "": - item_editor = "Article Title: " + item_editor - - is_oa = get_caracs_between(full_text,"Item Description","Total Amount").replace("\n"," ") - item_editor = item_editor + "\nItem Description :" + is_oa +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) +# Article +item_editor = get_caracs_between(full_text,"Title","Author").replace("\n"," ") +if item_editor != "": + item_editor = "Article Title: " + item_editor -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') +is_oa = get_caracs_between(full_text,"Item Description","Total Amount").replace("\n"," ") +item_editor = item_editor + "\nItem Description :" + is_oa + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001234181.py b/v1/editorsFunctions/0001234181.py index ad3dc50..42886d1 100644 --- a/v1/editorsFunctions/0001234181.py +++ b/v1/editorsFunctions/0001234181.py @@ -1,44 +1,28 @@ from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001234181" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' +import sys +import json -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead 
# v1/editorsFunctions/0001234181.py
#
# Per-invoice extractor for editor code 0001234181.
# Reads the path of one invoice PDF from stdin, gets its text through
# ocr_pdf(), looks for a DOI and an article title, and writes a single JSON
# object to stdout with the keys:
#   nom_complet, doi, unverified_doi, item_editor, full_text
#
# ocr_pdf, find_doi, verify_doi, get_caracs_between and `re` are expected to
# come from the star import below (they are not defined in this script).
from global_functions import *
import json
import os
import sys

# The caller passes the invoice path on stdin (surrounding whitespace removed).
filename = sys.stdin.read().strip()
full_text = ocr_pdf(filename)

## Extract infos
# DOI
unverified_doi = find_doi(full_text)
doi = ""
# Only when a DOI candidate was found: once verify_doi() accepts it, report it
# under "doi" and clear "unverified_doi" so it appears in exactly one field.
if unverified_doi and verify_doi(unverified_doi):
    doi = unverified_doi
    unverified_doi = ""

# Article
# NOTE(review): both delimiters are empty strings -- this looks like an
# unfilled template for this editor; confirm the intended start/end markers.
item_editor = get_caracs_between(full_text, "", "").replace("\n", " ")
if item_editor != "":
    item_editor = "Title: " + item_editor

# Whitespace normalisation of the raw text. The inner pattern is a raw string
# so that `\s` is a regex class and not an invalid string escape.
# NOTE(review): the inner pass already turns every run of 2+ whitespace
# characters (newlines included) into one space, so the outer newline pass
# cannot match anything afterwards; kept to leave the output unchanged.
clean_text = re.sub("\n\n+", "\n", re.sub(r"\s\s+", " ", full_text))

new_row = {
    # basename() yields the file name whatever the depth of the path; the
    # previous `filename.split("/")[1]` returned the second path component
    # and raised IndexError for a path without "/".
    'nom_complet': os.path.basename(filename),
    'doi': doi,
    'unverified_doi': unverified_doi,
    "item_editor": item_editor,
    'full_text': clean_text,
}
# Emitted without a trailing newline, exactly one JSON document per run.
sys.stdout.write(json.dumps(new_row))
# v1/editorsFunctions/0001247066.py
#
# Per-invoice extractor for editor code 0001247066.
# Reads the path of one invoice PDF from stdin, gets its text through
# ocr_pdf(), looks for a DOI and the invoice description, and writes a single
# JSON object to stdout with the keys:
#   nom_complet, doi, unverified_doi, item_editor, full_text
#
# ocr_pdf, find_doi, verify_doi, get_caracs_between and `re` are expected to
# come from the star import below (they are not defined in this script).
from global_functions import *
import json
import os
import sys

# The caller passes the invoice path on stdin (surrounding whitespace removed).
filename = sys.stdin.read().strip()
full_text = ocr_pdf(filename)

## Extract infos
# DOI
unverified_doi = find_doi(full_text)
doi = ""
# Only when a DOI candidate was found: once verify_doi() accepts it, report it
# under "doi" and clear "unverified_doi" so it appears in exactly one field.
if unverified_doi and verify_doi(unverified_doi):
    doi = unverified_doi
    unverified_doi = ""

# Article
# Text taken between the "INVOICE" and "Unit" markers of the document.
item_editor = get_caracs_between(full_text, "INVOICE", "Unit")
if item_editor != "":
    # Drop the literal "tinyLine\n" fragments before labelling the value.
    item_editor = "Description: " + item_editor.replace("tinyLine\n", "")

# Whitespace normalisation of the raw text. The inner pattern is a raw string
# so that `\s` is a regex class and not an invalid string escape.
# NOTE(review): the inner pass already turns every run of 2+ whitespace
# characters (newlines included) into one space, so the outer newline pass
# cannot match anything afterwards; kept to leave the output unchanged.
clean_text = re.sub("\n\n+", "\n", re.sub(r"\s\s+", " ", full_text))

new_row = {
    # basename() yields the file name whatever the depth of the path; the
    # previous `filename.split("/")[1]` returned the second path component
    # and raised IndexError for a path without "/".
    'nom_complet': os.path.basename(filename),
    'doi': doi,
    'unverified_doi': unverified_doi,
    "item_editor": item_editor,
    'full_text': clean_text,
}
# Emitted without a trailing newline, exactly one JSON document per run.
sys.stdout.write(json.dumps(new_row))
# v1/editorsFunctions/0001271047.py
#
# Per-invoice extractor for editor code 0001271047.
# Reads the path of one invoice PDF from stdin, gets its text through
# ocr_pdf(), looks for a DOI and an article title, and writes a single JSON
# object to stdout with the keys:
#   nom_complet, doi, unverified_doi, item_editor, full_text
#
# ocr_pdf, find_doi, verify_doi, get_caracs_between and `re` are expected to
# come from the star import below (they are not defined in this script).
from global_functions import *
import json
import os
import sys

# The caller passes the invoice path on stdin (surrounding whitespace removed).
filename = sys.stdin.read().strip()
full_text = ocr_pdf(filename)

## Extract infos
# DOI
unverified_doi = find_doi(full_text)
doi = ""
# Only when a DOI candidate was found: once verify_doi() accepts it, report it
# under "doi" and clear "unverified_doi" so it appears in exactly one field.
if unverified_doi and verify_doi(unverified_doi):
    doi = unverified_doi
    unverified_doi = ""

# Article
# Text taken between the "Title" and "Author" markers of the document.
item_editor = get_caracs_between(full_text, "Title", "Author")
if item_editor != "":
    item_editor = "Title: " + item_editor

# Whitespace normalisation of the raw text. The inner pattern is a raw string
# so that `\s` is a regex class and not an invalid string escape.
# NOTE(review): the inner pass already turns every run of 2+ whitespace
# characters (newlines included) into one space, so the outer newline pass
# cannot match anything afterwards; kept to leave the output unchanged.
clean_text = re.sub("\n\n+", "\n", re.sub(r"\s\s+", " ", full_text))

new_row = {
    # basename() yields the file name whatever the depth of the path; the
    # previous `filename.split("/")[1]` returned the second path component
    # and raised IndexError for a path without "/".
    'nom_complet': os.path.basename(filename),
    'doi': doi,
    'unverified_doi': unverified_doi,
    "item_editor": item_editor,
    'full_text': clean_text,
}
# Emitted without a trailing newline, exactly one JSON document per run.
sys.stdout.write(json.dumps(new_row))
# v1/editorsFunctions/autres.py
#
# Fallback per-invoice extractor ("autres" = invoices with no dedicated
# editor script). Reads the path of one invoice PDF from stdin, gets its text
# through ocr_pdf(), looks for a DOI, and writes a single JSON object to
# stdout with the keys:
#   nom_complet, doi, unverified_doi, item_editor, full_text
#
# ocr_pdf, find_doi, verify_doi and `re` are expected to come from the star
# import below (they are not defined in this script).
from global_functions import *
import json
import os
import sys

# The caller passes the invoice path on stdin (surrounding whitespace removed).
filename = sys.stdin.read().strip()
full_text = ocr_pdf(filename)

# Extract infos
unverified_doi = find_doi(full_text)
doi = ""
# Only when a DOI candidate was found: once verify_doi() accepts it, report it
# under "doi" and clear "unverified_doi" so it appears in exactly one field.
if unverified_doi and verify_doi(unverified_doi):
    doi = unverified_doi
    unverified_doi = ""

# This script extracts no article/description text. The row still carries an
# "item_editor" key, so it must be defined here: the previous version used the
# name without ever assigning it and failed with NameError on every run.
# The empty string matches what the editor-specific scripts emit when nothing
# is found.
item_editor = ""

# Whitespace normalisation of the raw text. The inner pattern is a raw string
# so that `\s` is a regex class and not an invalid string escape.
# NOTE(review): the inner pass already turns every run of 2+ whitespace
# characters (newlines included) into one space, so the outer newline pass
# cannot match anything afterwards; kept to leave the output unchanged.
clean_text = re.sub("\n\n+", "\n", re.sub(r"\s\s+", " ", full_text))

new_row = {
    # basename() yields the file name whatever the depth of the path; the
    # previous `filename.split("/")[1]` returned the second path component
    # and raised IndexError for a path without "/".
    'nom_complet': os.path.basename(filename),
    'doi': doi,
    'unverified_doi': unverified_doi,
    "item_editor": item_editor,
    'full_text': clean_text,
}
# Emitted without a trailing newline, exactly one JSON document per run.
sys.stdout.write(json.dumps(new_row))