diff --git a/README.md b/README.md index b484275..e84d2fc 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,7 @@ ## Utilisation Toutes les factures sont initialement mélangées dans le même dossier. -0. (facultatif) metric-data permet d'afficher la répartition des différents codes éditeurs pour cibler les plus importants à traiter. -1. Lancer le code tri factures en modifiant les paramètres `path_bills`, qui correspond au nom du dossier d'entrée et éventuellement `path_bills_output`, correspondant au nom du dossier de sortie. -2. Renseigner son adresse mail comme chaîne de caractère en modifiant le paramètre `mail_adress` dans le fichier __editors-processing/global_function.py__ -2. Lancer un par un les programmes dans editors processing : ils traitent les éditeurs un par un en utilisant de la recherhce de chaîne de caractère. A l'exception de `global_functions.py`qui contient des fonctions utilisées dans chacun des programmes. -3. Les fichiers sont à présents traités dans un dossier __results__. Dans chaque programme, les paramètres `path` et `path_res` doivent être modifiés en conséquent (_path_ : dossier contenant les factures triées + num éditeur) (_path_res_ : results par défaut) -4. On aura en résultat autant de csv que d'éditeurs traités. Les non traités seront dans le fichier "autres". +1. Renseigner son adresse mail comme chaîne de caractères en modifiant le paramètre `mail_adress` dans le fichier __v1/editorsFunctions/global_functions.py__ +2. Mettre toutes les factures dans un dossier. Renseigner ce nom de dossier à la place de `path_bills` dans les programmes __v1/apc.py__ et __metric-input.py__ (programme facultatif permettant d'afficher la répartition des différents codes éditeurs pour cibler les plus importants à traiter). +3. Lancer le script __v1/apc.py__. Les résultats sont dans le csv res_facture.csv (échappement `\\` (ou juste `\` ?), séparateur `,`) +4. 
On peut lancer __metric-output.py__ pour obtenir les ratios de factures correctement traitées par éditeur (permet de repérer si un éditeur change d'année en année) diff --git a/dictionaries/code_fournisseur2fournisseur.json b/dictionaries/code_fournisseur2fournisseur.json deleted file mode 100644 index 4275f25..0000000 --- a/dictionaries/code_fournisseur2fournisseur.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "0001021938":1, - "0001197861":2, - "0001200930":3, - "0001079074":4, - "0001021664":5, - "0001024808":6, - "0001097617":7, - "0001091455":8, - "0001217181":9, - "0001090050":10, - "0001157725":11, - "0001062310":12, - "0001021888":13, - "0001222959":14, - "0001234181":15, - "0001226137":16 -} diff --git a/editors-processing/0001021664.py b/editors-processing/0001021664.py deleted file mode 100644 index bc3f41c..0000000 --- a/editors-processing/0001021664.py +++ /dev/null @@ -1,50 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021664" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ").replace("Total Due","") - if item_editor != "": - item_editor = "Product ID + Description: " + item_editor - - else: - item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ").replace("Total Due","") - if item_editor != "": - item_editor = "Product ID + Description: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001021888.py b/editors-processing/0001021888.py deleted file mode 100644 index 6f0f056..0000000 --- a/editors-processing/0001021888.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021888" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Description","Payment terms").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001021938.py b/editors-processing/0001021938.py deleted file mode 100644 index 76e7a1e..0000000 --- a/editors-processing/0001021938.py +++ /dev/null @@ -1,52 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021938" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - if "Author/Article:" in full_text: - item_editor = get_caracs_between(full_text,"Author/Article:","Manuscript").replace("\n"," ") - - if item_editor != "": - item_editor = "Author/Article: " + item_editor - - else: - item_editor = get_caracs_between(full_text,"Author","Manuscript ID:").replace("\n"," ") - - if item_editor != "": - item_editor = "Author" + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001024808.py b/editors-processing/0001024808.py deleted file mode 100644 index b18290e..0000000 --- a/editors-processing/0001024808.py +++ /dev/null @@ -1,50 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001024808" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ") - if item_editor != "": - item_editor = "Description: " + item_editor - - else: - item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ") - if item_editor != "": - item_editor = "Description: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001062310.py b/editors-processing/0001062310.py deleted file mode 100644 index bb39759..0000000 --- a/editors-processing/0001062310.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001062310" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Amount","Processing Charge").replace("\n"," ") - if item_editor != "": - item_editor = "Article Number + Title of Manuscript: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001079074.py b/editors-processing/0001079074.py deleted file mode 100644 index 7470402..0000000 --- a/editors-processing/0001079074.py +++ /dev/null @@ -1,42 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001079074" -path = './factures-2021-2022-triees/%s' %num_editor -path_res='./results' -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - #Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - - # propre à l'éditeur - item_editor = get_caracs_between(full_text,"Article:","Tax").replace("\n"," ") - if item_editor != "": - item_editor = "Article: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,'item_editor':item_editor,'full_text': full_text} #HERE ADD - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001090050.py b/editors-processing/0001090050.py deleted file mode 100644 index 4c122d7..0000000 --- a/editors-processing/0001090050.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001090050" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Manuscript","\n").replace("\n"," ") - if item_editor != "": - item_editor = "Manuscript: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001091455.py b/editors-processing/0001091455.py deleted file mode 100644 index f5c834b..0000000 --- a/editors-processing/0001091455.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001091455" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") - if item_editor != "": - item_editor = "Item: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001097617.py b/editors-processing/0001097617.py deleted file mode 100644 index 6500aeb..0000000 --- a/editors-processing/0001097617.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001097617" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"","").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001157725.py b/editors-processing/0001157725.py deleted file mode 100644 index 7e6c8f7..0000000 --- a/editors-processing/0001157725.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001157725" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Journal Name","DOI").replace("\n"," ") - if item_editor != "": - item_editor = "Description: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001197861.py b/editors-processing/0001197861.py deleted file mode 100644 index 93bd46d..0000000 --- a/editors-processing/0001197861.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001197861" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Description","Net Value") - if item_editor != "": - item_editor = "Description: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001200930.py b/editors-processing/0001200930.py deleted file mode 100644 index 1d06086..0000000 --- a/editors-processing/0001200930.py +++ /dev/null @@ -1,48 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001200930" - -# path = './test/%s' %num_editor -path = './factures-2021-2022-triees/%s' %num_editor - -# path_res='./test-results' -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - - # Article - item_editor = get_caracs_between(full_text,'"','"').replace("\n"," ") - if item_editor != "": - item_editor = "Article Title: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001217181.py b/editors-processing/0001217181.py deleted file mode 100644 index ab91666..0000000 --- a/editors-processing/0001217181.py +++ /dev/null @@ -1,42 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001217181" -path = './factures-2021-2022-triees/%s' %num_editor -path_res='./results' -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - - # Article - item_editor = get_caracs_between(full_text,"Title","published in").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001222959.py b/editors-processing/0001222959.py deleted file mode 100644 index a51d561..0000000 --- a/editors-processing/0001222959.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001222959" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Manuscript ID","Net Value").replace("\n"," ") - if item_editor != "": - item_editor = "Manuscript ID: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001226137.py b/editors-processing/0001226137.py deleted file mode 100644 index 75c2cdc..0000000 --- a/editors-processing/0001226137.py +++ /dev/null @@ -1,47 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001226137" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Title","Author").replace("\n"," ") - if item_editor != "": - item_editor = "Article Title: " + item_editor - - is_oa = get_caracs_between(full_text,"Item Description","Total Amount").replace("\n"," ") - item_editor = item_editor + "\nItem Description :" + is_oa - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001234181.py b/editors-processing/0001234181.py deleted file mode 100644 index 7a0943a..0000000 --- a/editors-processing/0001234181.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001234181" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"","").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/autres.py b/editors-processing/autres.py deleted file mode 100644 index 8dbd11c..0000000 --- a/editors-processing/autres.py +++ /dev/null @@ -1,34 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -path = './factures-2021-2022-triees/autres' -path_res='./results' -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - #Extract infos - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/autres-sorted.csv"%path_res,index=False,header=True,encoding='utf-8',escapechar='\\') \ No newline at end of file diff --git a/editors-processing/global_functions.py b/editors-processing/global_functions.py deleted file mode 100644 index de6cb82..0000000 --- a/editors-processing/global_functions.py +++ /dev/null @@ -1,90 +0,0 @@ -import re -from requests_ratelimiter import LimiterSession - -mail_adress = "[prefix_mail]@[domaine].[fr|com]" -session = LimiterSession(per_second=5) - -def find_doi(text): - """ - return the first doi found in a text (input) - """ - doiRegex = r'\b10.\d{4,}\/[^\s]+\b' - doi = re.search(doiRegex, text) - if doi == None: - return "" - try: - doiStr = doi.group() - return doiStr - except: - return "" - -def verify_doi(doi,mail=mail_adress): - """ - check with crossref api if doi is correct. 
- """ - url = f"https://api.crossref.org/works/{doi}/agency?mailto={mail}" - - # Return True if DOI exists in crossref api - code_response = session.get(url).status_code - return code_response == 200 - - -def read_all_page(reader): - """ - for a reader, return all his page separate with "\n" in str format - """ - text= "" - lenPages = len(reader.pages) - for i in range(lenPages): - #transform page into text - text+= reader.pages[i].extract_text() - text+= "\n" - return text - - -def get_caracs_between(chaine, debut, fin): - """return a string between to string "debut" "fin" - - Args: - chaine (str): search in - debut (str): beginning of the string - fin (str): ending - - Returns: - str: string - """ - # Use regex - motif = re.escape(debut) + "(.*?)" + re.escape(fin) - resultat = re.search(motif, chaine,re.DOTALL) - - if resultat: - contenu_entre_chaines = resultat.group(1) - return contenu_entre_chaines - else: - return "" - -import re - -def get_caracs_between_multiple(chaine, debuts, fins): - """return a string between uncertenly beginning or ending - - Args: - chaine (_str_): input text - debuts (_list_): list of possibles begins - fins (_list_): list of possibles endings - - Returns: - _str_: caracters between begins,ends. Can take multiple ends/begins ! 
- """ - motifs = [re.escape(debut) + "(.*?)" + re.escape(fin) for debut, fin in zip(debuts, fins)] - - global_regex = "|".join(motifs) - - resultat = re.search(global_regex, chaine,re.DOTALL) - - # Vérifier si la correspondance a été trouvée - if resultat: - contenu_entre_chaines = resultat.group(1) - return contenu_entre_chaines - else: - return "" diff --git a/metric-data.py b/metric-data.py deleted file mode 100644 index 4337425..0000000 --- a/metric-data.py +++ /dev/null @@ -1,18 +0,0 @@ -from pathlib import Path - -path_bills = "factures-2021-2022" - -#fonction qui classe les PDFs par editeur -def metrics_editors(path): - dic_stats = {} - files = Path('./%s/' %path).glob('*') - for file in files: - code_editor=str(file).split("_")[1] - if code_editor not in dic_stats: - dic_stats[code_editor]=1 - else : - dic_stats[code_editor]+=1 - return dict(sorted(dic_stats.items(), key=lambda item:item[1])) - - -print(metrics_editors(path=path_bills)) \ No newline at end of file diff --git a/metric-input.py b/metric-input.py new file mode 100644 index 0000000..4fdfa59 --- /dev/null +++ b/metric-input.py @@ -0,0 +1,18 @@ +from pathlib import Path + +path_bills = "factures-2019-2023" + +#fonction qui classe les PDFs par editeur +def metrics_editors(path): + dic_stats = {} + files = Path('./%s/' %path).glob('*') + for file in files: + code_editor=str(file).split("_")[1] + if code_editor not in dic_stats: + dic_stats[code_editor]=1 + else : + dic_stats[code_editor]+=1 + return dict(sorted(dic_stats.items(), key=lambda item:item[1])) + + +print(metrics_editors(path=path_bills)) \ No newline at end of file diff --git a/metric-output.py b/metric-output.py new file mode 100644 index 0000000..3996d58 --- /dev/null +++ b/metric-output.py @@ -0,0 +1,30 @@ +import pandas as pd +import json + +df_res = pd.read_csv('res_facture.csv') + +with open('./v1/dictionaries/code_fournisseur2fournisseur.json', 'r') as f_in: + id_editor2editor = json.load(f_in) + +# Fonction pour calculer le 
ratio des champs "item_editor" vides +def calculate_empty_item_editor_ratio(df): + total_items = len(df) + empty_item_editors = df['item_editor'].isna().sum() + if total_items > 0: + ratio = 1 - empty_item_editors / total_items + else: + ratio = 0 + return ratio + +results = {} + +for code_editor in id_editor2editor.keys(): + # Extraire le code éditeur de la colonne "nom_complet" pour filtrer les lignes + df_res['extracted_code'] = df_res['nom_complet'].apply(lambda x: x.split('_')[1] if len(x.split('_')) > 1 else None) + + # Filtrer le DataFrame pour les lignes où le code extrait est égal au code de l'éditeur + df_editor = df_res[df_res['extracted_code'] == code_editor] + ratio = calculate_empty_item_editor_ratio(df_editor) + results[code_editor] = ratio + +print(json.dumps(results)) \ No newline at end of file diff --git a/to_do.xlsx b/to_do.xlsx deleted file mode 100644 index 4016360..0000000 --- a/to_do.xlsx +++ /dev/null Binary files differ diff --git a/tri-factures.py b/tri-factures.py deleted file mode 100644 index 894ad64..0000000 --- a/tri-factures.py +++ /dev/null @@ -1,35 +0,0 @@ -import shutil -from pathlib import Path -import os -import json - -path_bills = "factures-2021-2022" -path_bills_output = "factures-2021-2022-triees" -""" -Notice d'utilisation -ATTENTION : Les dossier d'entrée et de sortie ne doivent PAS CONTENIR de "_" : c'est réservé pour les noms de fichier pdf des factures. - -Ce programme permet de trier un dossier de facture par code fournisseur -Si le code fournisseur n'est pas dans la liste id_editor2editor, ne peut être traité de manière automatique. -""" -#address path to datas (pdf only) -with open('./dictionaries/code_fournisseur2fournisseur.json','r') as f_in: - id_editor2editor = json.load(f_in) - -#fonction utilitaire pour classEditors qui copie le fichier PDF dans un repository dans son code editeur. 
-def copyItIn(file,repository,global_path='./%s' %path_bills_output): - if not os.path.exists('%s/%s' % (global_path,repository) ): - os.makedirs('%s/%s' % (global_path,repository) ) - shutil.copy(file, '%s/%s' % (global_path,repository) ) - - -# classe les PDFs par editeur - -files = Path('./%s/' %path_bills).glob('*') -for file in files: - code_editor=str(file).split("_")[1] - if code_editor in id_editor2editor.keys(): - copyItIn(file,code_editor) - else: - copyItIn(file,'autres') - diff --git a/v1/apc.py b/v1/apc.py new file mode 100644 index 0000000..ed510e5 --- /dev/null +++ b/v1/apc.py @@ -0,0 +1,43 @@ +import json +from pathlib import Path +import subprocess +import pandas as pd +from editorsFunctions.global_functions import * + + +path_bills = "factures-2019-2023" + +with open('./v1/dictionaries/code_fournisseur2fournisseur.json', 'r') as f_in: + id_editor2editor = json.load(f_in) + + +def execute_script(script_name, file_name): + result = subprocess.run(['python3', script_name], input=file_name, capture_output=True, text=True) + if result.returncode == 0: + try: + data = json.loads(result.stdout) + return data + except json.JSONDecodeError as e: + return None + else: + return None + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +files = Path('./%s/' % path_bills).glob('*') +for file in files: + code_editor = str(file).split("_")[1] + if code_editor in id_editor2editor.keys(): + script_name = f"v1/editorsFunctions/{code_editor}.py" + data = execute_script(script_name, str(file)) + if data is not None: + df = pd.DataFrame([data]) + df_res = pd.concat([df_res, df], ignore_index=True) + else: + script_name = f"v1/editorsFunctions/autres.py" + data = execute_script(script_name, str(file)) + if data is not None: + df = pd.DataFrame([data]) + df_res = pd.concat([df_res, df], ignore_index=True) + +df_res.to_csv('res_facture.csv', index=False) diff --git a/v1/dictionaries/code_fournisseur2fournisseur.json 
b/v1/dictionaries/code_fournisseur2fournisseur.json new file mode 100644 index 0000000..ad3c166 --- /dev/null +++ b/v1/dictionaries/code_fournisseur2fournisseur.json @@ -0,0 +1,28 @@ +{ + "0001021938": 1, + "0001197861": 2, + "0001200930": 3, + "0001079074": 4, + "0001021664": 5, + "0001024808": 6, + "0001097617": 7, + "0001091455": 8, + "0001217181": 9, + "0001090050": 10, + "0001157725": 11, + "0001062310": 12, + "0001021888": 13, + "0001222959": 14, + "0001226137": 16, + "0001021626": 17, + "0001247066": 18, + "0001021875": 19, + "0001167626": 20, + "0001271047": 21, + "0001024308": 22, + "0001021793": 23, + "0001189058": 24, + "0001024313": 25, + "0001024524": 26, + "0001129981": 27 +} diff --git a/v1/editorsFunctions/0001021626.py b/v1/editorsFunctions/0001021626.py new file mode 100644 index 0000000..68ceb02 --- /dev/null +++ b/v1/editorsFunctions/0001021626.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Order Reference","VAT amount") +if item_editor != "": + item_editor = "Order Reference" + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001021664.py b/v1/editorsFunctions/0001021664.py new file mode 100644 index 0000000..857ef10 --- /dev/null +++ b/v1/editorsFunctions/0001021664.py @@ -0,0 +1,34 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() + +full_text = ocr_pdf(filename) + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ").replace("Total Due","") +if item_editor != "": + item_editor = "Product ID + Description: " + item_editor + +else: + item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ").replace("Total Due","") + if item_editor != "": + item_editor = "Product ID + Description: " + item_editor + + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001021793.py b/v1/editorsFunctions/0001021793.py new file mode 100644 index 0000000..b40fde8 --- /dev/null +++ b/v1/editorsFunctions/0001021793.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") +if item_editor != "": + item_editor = "Item: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001021875.py b/v1/editorsFunctions/0001021875.py new file mode 100644 index 0000000..63ff218 --- /dev/null +++ b/v1/editorsFunctions/0001021875.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Article","VAT") +if item_editor != "": + item_editor = "Article" + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001021888.py b/v1/editorsFunctions/0001021888.py new file mode 100644 index 0000000..d77c3b3 --- /dev/null +++ b/v1/editorsFunctions/0001021888.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Description","Payment terms").replace("\n"," ") +if item_editor != "": + item_editor = "Title: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001021938.py b/v1/editorsFunctions/0001021938.py new file mode 100644 index 0000000..f3b11b1 --- /dev/null +++ b/v1/editorsFunctions/0001021938.py @@ -0,0 +1,36 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column 
instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" + +# Article +if "Author/Article:" in full_text: + item_editor = get_caracs_between(full_text,"Author/Article:","Manuscript").replace("\n"," ") + + if item_editor != "": + item_editor = "Author/Article: " + item_editor + +else: + item_editor = get_caracs_between(full_text,"Author","Manuscript ID:").replace("\n"," ") + + if item_editor != "": + item_editor = "Author" + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001024308.py b/v1/editorsFunctions/0001024308.py new file mode 100644 index 0000000..3486e6b --- /dev/null +++ b/v1/editorsFunctions/0001024308.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"PUBLICATION CHARGES","PAGE CHARGES").replace("\n","") +if item_editor != "": + item_editor = "PUBLICATION CHARGES" + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001024313.py b/v1/editorsFunctions/0001024313.py new file mode 100644 index 0000000..902dc65 --- /dev/null +++ b/v1/editorsFunctions/0001024313.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text)),"Title:","Journal:") +if item_editor != "": + item_editor = "Title: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001024524.py b/v1/editorsFunctions/0001024524.py new file mode 100644 index 0000000..fbeea7a --- /dev/null +++ b/v1/editorsFunctions/0001024524.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Article:","Corresponding Author:") +if item_editor != "": + item_editor = "Title: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001024808.py b/v1/editorsFunctions/0001024808.py new file mode 100644 index 0000000..e524593 --- /dev/null +++ b/v1/editorsFunctions/0001024808.py @@ -0,0 +1,33 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ") +if item_editor != "": + item_editor = "Description: " + item_editor + +else: + item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ") + if item_editor != "": + item_editor = "Description: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001062310.py b/v1/editorsFunctions/0001062310.py new file mode 100644 index 0000000..e17c34e --- /dev/null +++ b/v1/editorsFunctions/0001062310.py @@ -0,0 +1,26 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Total","USD").replace("\n"," ").replace("Article Processing", "|| Article Processing") + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001079074.py b/v1/editorsFunctions/0001079074.py new file mode 100644 index 0000000..685edbc --- /dev/null +++ b/v1/editorsFunctions/0001079074.py @@ -0,0 +1,29 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +#Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + +# propre à l'éditeur +item_editor = get_caracs_between(full_text,"Article:","Tax").replace("\n"," ") +if item_editor != "": + item_editor = "Article: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001090050.py b/v1/editorsFunctions/0001090050.py new file mode 100644 index 0000000..88edaf8 --- /dev/null +++ b/v1/editorsFunctions/0001090050.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Manuscript","\n").replace("\n"," ") +if item_editor != "": + item_editor = "Manuscript: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001091455.py b/v1/editorsFunctions/0001091455.py new file mode 100644 index 0000000..b2a4a61 --- /dev/null +++ b/v1/editorsFunctions/0001091455.py @@ -0,0 +1,27 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") +if item_editor != "": + item_editor = "Item: " + item_editor + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001097617.py b/v1/editorsFunctions/0001097617.py new file mode 100644 index 0000000..786d0d7 --- /dev/null +++ b/v1/editorsFunctions/0001097617.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Publication","Title:").replace("\n"," ") +if item_editor != "": + item_editor = "Publication + Title: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001129981.py b/v1/editorsFunctions/0001129981.py new file mode 100644 index 0000000..e116f53 --- /dev/null +++ b/v1/editorsFunctions/0001129981.py @@ -0,0 +1,26 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. + doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Total Due","Check").replace("\n"," ") + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001157725.py b/v1/editorsFunctions/0001157725.py new file mode 100644 index 0000000..0571b1b --- /dev/null +++ b/v1/editorsFunctions/0001157725.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Journal Name","DOI").replace("\n"," ") +if item_editor != "": + item_editor = "Description: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001167626.py b/v1/editorsFunctions/0001167626.py new file mode 100644 index 0000000..0fd8ace --- /dev/null +++ b/v1/editorsFunctions/0001167626.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Article","TOTAL VALUE") +if item_editor != "": + item_editor = "Article" + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001189058.py b/v1/editorsFunctions/0001189058.py new file mode 100644 index 0000000..e3c65e6 --- /dev/null +++ b/v1/editorsFunctions/0001189058.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"TITLE","REMARKS").replace( + "USD","").replace("PRICE","").replace("DISC","").replace("NET","").replace( + "VALUE","").replace("UNIT","").replace("TAX","").replace("VAT","").replace("\n","") + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001197861.py b/v1/editorsFunctions/0001197861.py new file mode 100644 index 0000000..835a794 --- /dev/null +++ b/v1/editorsFunctions/0001197861.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Description","Net Value") +if item_editor != "": + item_editor = "Description: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001200930.py b/v1/editorsFunctions/0001200930.py new file mode 100644 index 0000000..65f9542 --- /dev/null +++ b/v1/editorsFunctions/0001200930.py @@ -0,0 +1,29 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + +# Article +item_editor = get_caracs_between(full_text,'"','"').replace("\n"," ") +if item_editor != "": + item_editor = "Article Title: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001217181.py b/v1/editorsFunctions/0001217181.py new file mode 100644 index 0000000..16366ec --- /dev/null +++ b/v1/editorsFunctions/0001217181.py @@ -0,0 +1,29 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + +# Article +item_editor = get_caracs_between(full_text,"Title","published in").replace("\n"," ") +if item_editor != "": + item_editor = "Title: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001222959.py b/v1/editorsFunctions/0001222959.py new file mode 100644 index 0000000..9062198 --- /dev/null +++ b/v1/editorsFunctions/0001222959.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Manuscript ID","Net Value").replace("\n"," ") +if item_editor != "": + item_editor = "Manuscript ID: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001226137.py b/v1/editorsFunctions/0001226137.py new file mode 100644 index 0000000..bc90124 --- /dev/null +++ b/v1/editorsFunctions/0001226137.py @@ -0,0 +1,31 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Title","Author").replace("\n"," ") +if item_editor != "": + item_editor = "Article Title: " + item_editor + +is_oa = get_caracs_between(full_text,"Item Description","Total Amount").replace("\n"," ") +item_editor = item_editor + "\nItem Description :" + is_oa + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001247066.py b/v1/editorsFunctions/0001247066.py new file mode 100644 index 0000000..c601228 --- /dev/null +++ b/v1/editorsFunctions/0001247066.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"INVOICE","Unit") +if item_editor != "": + item_editor = "Description: " + item_editor.replace("tinyLine\n","") + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/0001271047.py b/v1/editorsFunctions/0001271047.py new file mode 100644 index 0000000..75b716a --- /dev/null +++ b/v1/editorsFunctions/0001271047.py @@ -0,0 +1,28 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +## Extract infos +# DOI +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + +# Article +item_editor = get_caracs_between(full_text,"Title","Author") +if item_editor != "": + item_editor = "Title: " + item_editor + + +new_row = {'nom_complet': filename.split("/")[1], 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} +sys.stdout.write((json.dumps(new_row))) \ No newline at end of file diff --git a/v1/editorsFunctions/__init__.py b/v1/editorsFunctions/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/v1/editorsFunctions/__init__.py diff --git a/v1/editorsFunctions/autres.py b/v1/editorsFunctions/autres.py new file mode 100644 index 0000000..fe45bf4 --- /dev/null +++ b/v1/editorsFunctions/autres.py @@ -0,0 +1,22 @@ +from global_functions import * +import sys +import json + + +filename = sys.stdin.read().strip() +full_text = ocr_pdf(filename) + + +#Extract infos +unverified_doi = find_doi(full_text) +doi = "" +#only if a doi is found +if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
"""Helpers shared by the per-editor invoice scripts.

Covers DOI detection and verification, PDF text extraction and capture of
the text found between two marker strings.
"""
import os
import re
from urllib.parse import quote as _quote

# The PDF and HTTP packages are imported defensively so the pure-text helpers
# stay importable without them; ocr_pdf and verify_doi raise ImportError
# explicitly when the package they need is missing.
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None
try:
    from requests_ratelimiter import LimiterSession
except ImportError:
    LimiterSession = None

# Placeholder contact address, sent as the "mailto" query parameter by
# verify_doi; replace it with a real address before running.
mail_adress = "[prefix_mail]@[domaine].[fr|com]"
# Shared HTTP session limited to 5 requests per second.
session = LimiterSession(per_second=5) if LimiterSession is not None else None

# "10." + at least four digits + "/" + non-space suffix ending on a word
# character. The dot is escaped so that e.g. "10x1234/abc" is not accepted.
_DOI_PATTERN = re.compile(r'\b10\.\d{4,}/\S+\b')
# Seconds to wait for the HTTP request before giving up instead of hanging.
_REQUEST_TIMEOUT = 10


def find_doi(text):
    """Return the first DOI found in ``text``.

    Args:
        text (str): text to search; empty or None yields "".

    Returns:
        str: the matched DOI, or "" when none is found.
    """
    if not text:
        return ""
    match = _DOI_PATTERN.search(text)
    return match.group() if match else ""


def verify_doi(doi, mail=mail_adress):
    """Check a DOI against the Crossref ``works/{doi}/agency`` endpoint.

    Args:
        doi (str): DOI to check.
        mail (str): address passed as the ``mailto`` query parameter.

    Returns:
        bool: True when the request answers with HTTP status 200.

    Raises:
        ImportError: if requests_ratelimiter is not installed.
    """
    if session is None:
        raise ImportError("requests_ratelimiter is required to verify a DOI")
    # Percent-encode the DOI: characters such as "#", "?" or "<" are legal in
    # a DOI but would otherwise cut or corrupt the request path.
    url = f"https://api.crossref.org/works/{_quote(doi, safe='/')}/agency?mailto={mail}"
    code_response = session.get(url, timeout=_REQUEST_TIMEOUT).status_code
    return code_response == 200


def read_all_page(reader):
    """Return the text of every page of ``reader``, each followed by "\\n".

    Args:
        reader: object exposing ``pages``, an iterable of objects with an
            ``extract_text()`` method.

    Returns:
        str: concatenated page texts; a page yielding no text contributes
        only its trailing newline.
    """
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)


def ocr_pdf(filename):
    """Return the text PyPDF2 extracts from the PDF ``filename``.

    Args:
        filename (str): path of the PDF file.

    Returns:
        str: the extracted text. When reading fails, the exception message
        is returned in place of the text, so the caller always gets a string.

    Raises:
        ImportError: if PyPDF2 is not installed.
    """
    if PyPDF2 is None:
        raise ImportError("PyPDF2 is required to read PDF files")
    try:
        reader = PyPDF2.PdfReader(filename, strict=False)
        full_text = read_all_page(reader)
    except Exception as e:
        # Best effort kept from the original: unreadable files do not abort
        # the run, the error text stands in for the content.
        full_text = f"{e}"

    return full_text


def get_caracs_between(chaine, debut, fin):
    """Return the text located between the markers ``debut`` and ``fin``.

    Args:
        chaine (str): text to search in.
        debut (str): literal start marker.
        fin (str): literal end marker.

    Returns:
        str: shortest text between the first ``debut`` and the next ``fin``
        (newlines included), or "" when the pair is not found.
    """
    motif = re.escape(debut) + "(.*?)" + re.escape(fin)
    resultat = re.search(motif, chaine, re.DOTALL)
    return resultat.group(1) if resultat else ""


def get_caracs_between_multiple(chaine, debuts, fins):
    """Return the text between the first matching (start, end) marker pair.

    Markers are paired by position: ``debuts[i]`` goes with ``fins[i]``.

    Args:
        chaine (str): input text.
        debuts (list): candidate start markers.
        fins (list): candidate end markers, same order as ``debuts``.

    Returns:
        str: text captured by whichever pair matches earliest in ``chaine``,
        or "" when no pair matches or no pair is given.
    """
    motifs = [re.escape(debut) + "(.*?)" + re.escape(fin) for debut, fin in zip(debuts, fins)]
    if not motifs:
        # An empty alternation would match the empty string without any group.
        return ""

    resultat = re.search("|".join(motifs), chaine, re.DOTALL)
    if resultat is None:
        return ""
    # Each alternative owns one group; only the one that matched is not None,
    # so group(1) alone is wrong for every pair but the first.
    return next((groupe for groupe in resultat.groups() if groupe is not None), "")