diff --git a/README.md b/README.md index 94e4df9..b484275 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,18 @@ APC-bills =============== -Automatic processing of APC bills (from department "Négociations et Acquisitions") \ No newline at end of file +Automatic processing of APC bills (from department "Négociations et Acquisitions") + +## Description + +Ce repo contient l'essentiel du code pour extraire les informations des factures. Les informations à récupérer dépendent du fournisseur. Elles sont détaillées pour ce code dans le fichier __to_do.xlsx__. + +## Utilisation +Toutes les factures sont initialement mélangées dans le même dossier. + +0. (facultatif) `metric-data.py` permet d'afficher la répartition des différents codes éditeurs pour cibler les plus importants à traiter. +1. Lancer le script `tri-factures.py` en modifiant les paramètres `path_bills`, qui correspond au nom du dossier d'entrée, et éventuellement `path_bills_output`, qui correspond au nom du dossier de sortie. +2. Renseigner son adresse mail comme chaîne de caractères en modifiant le paramètre `mail_adress` dans le fichier __editors-processing/global_functions.py__. +3. Lancer un par un les programmes du dossier `editors-processing` : ils traitent les éditeurs un par un en utilisant de la recherche de chaînes de caractères, à l'exception de `global_functions.py`, qui contient des fonctions utilisées dans chacun des programmes. +4. Les fichiers sont à présent traités dans un dossier __results__. Dans chaque programme, les paramètres `path` et `path_res` doivent être modifiés en conséquence (_path_ : dossier contenant les factures triées + num éditeur) (_path_res_ : results par défaut). +5. On aura en résultat autant de csv que d'éditeurs traités. Les non traités seront dans le fichier "autres-sorted.csv" (produit par `autres.py`).
diff --git a/dictionaries/code_fournisseur2fournisseur.json b/dictionaries/code_fournisseur2fournisseur.json new file mode 100644 index 0000000..4275f25 --- /dev/null +++ b/dictionaries/code_fournisseur2fournisseur.json @@ -0,0 +1,18 @@ +{ + "0001021938":1, + "0001197861":2, + "0001200930":3, + "0001079074":4, + "0001021664":5, + "0001024808":6, + "0001097617":7, + "0001091455":8, + "0001217181":9, + "0001090050":10, + "0001157725":11, + "0001062310":12, + "0001021888":13, + "0001222959":14, + "0001234181":15, + "0001226137":16 +} diff --git a/editors-processing/0001021664.py b/editors-processing/0001021664.py new file mode 100644 index 0000000..bc3f41c --- /dev/null +++ b/editors-processing/0001021664.py @@ -0,0 +1,50 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001021664" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ").replace("Total Due","") + if item_editor != "": + item_editor = "Product ID + Description: " + item_editor + + else: + item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ").replace("Total Due","") + if item_editor != "": + item_editor = "Product ID + Description: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001021888.py b/editors-processing/0001021888.py new file mode 100644 index 0000000..6f0f056 --- /dev/null +++ b/editors-processing/0001021888.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001021888" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Description","Payment terms").replace("\n"," ") + if item_editor != "": + item_editor = "Title: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001021938.py b/editors-processing/0001021938.py new file mode 100644 index 0000000..76e7a1e --- /dev/null +++ b/editors-processing/0001021938.py @@ -0,0 +1,52 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001021938" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + if "Author/Article:" in full_text: + item_editor = get_caracs_between(full_text,"Author/Article:","Manuscript").replace("\n"," ") + + if item_editor != "": + item_editor = "Author/Article: " + item_editor + + else: + item_editor = get_caracs_between(full_text,"Author","Manuscript ID:").replace("\n"," ") + + if item_editor != "": + item_editor = "Author" + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001024808.py b/editors-processing/0001024808.py new file mode 100644 index 0000000..b18290e --- /dev/null +++ b/editors-processing/0001024808.py @@ -0,0 +1,50 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001024808" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ") + if item_editor != "": + item_editor = "Description: " + item_editor + + else: + item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ") + if item_editor != "": + item_editor = "Description: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001062310.py b/editors-processing/0001062310.py new file mode 100644 index 0000000..bb39759 --- /dev/null +++ b/editors-processing/0001062310.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001062310" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Amount","Processing Charge").replace("\n"," ") + if item_editor != "": + item_editor = "Article Number + Title of Manuscript: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001079074.py b/editors-processing/0001079074.py new file mode 100644 index 0000000..7470402 --- /dev/null +++ b/editors-processing/0001079074.py @@ -0,0 +1,42 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001079074" +path = './factures-2021-2022-triees/%s' %num_editor +path_res='./results' +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + #Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + + # propre à l'éditeur + item_editor = get_caracs_between(full_text,"Article:","Tax").replace("\n"," ") + if item_editor != "": + item_editor = "Article: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,'item_editor':item_editor,'full_text': full_text} #HERE ADD + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001090050.py b/editors-processing/0001090050.py new file mode 100644 index 0000000..4c122d7 --- /dev/null +++ b/editors-processing/0001090050.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001090050" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Manuscript","\n").replace("\n"," ") + if item_editor != "": + item_editor = "Manuscript: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001091455.py b/editors-processing/0001091455.py new file mode 100644 index 0000000..f5c834b --- /dev/null +++ b/editors-processing/0001091455.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001091455" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") + if item_editor != "": + item_editor = "Item: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001097617.py b/editors-processing/0001097617.py new file mode 100644 index 0000000..6500aeb --- /dev/null +++ b/editors-processing/0001097617.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001097617" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"","").replace("\n"," ") + if item_editor != "": + item_editor = "Title: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001157725.py b/editors-processing/0001157725.py new file mode 100644 index 0000000..7e6c8f7 --- /dev/null +++ b/editors-processing/0001157725.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001157725" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Journal Name","DOI").replace("\n"," ") + if item_editor != "": + item_editor = "Description: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001197861.py b/editors-processing/0001197861.py new file mode 100644 index 0000000..93bd46d --- /dev/null +++ b/editors-processing/0001197861.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001197861" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Description","Net Value") + if item_editor != "": + item_editor = "Description: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001200930.py b/editors-processing/0001200930.py new file mode 100644 index 0000000..1d06086 --- /dev/null +++ b/editors-processing/0001200930.py @@ -0,0 +1,48 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001200930" + +# path = './test/%s' %num_editor +path = './factures-2021-2022-triees/%s' %num_editor + +# path_res='./test-results' +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + + # Article + item_editor = get_caracs_between(full_text,'"','"').replace("\n"," ") + if item_editor != "": + item_editor = "Article Title: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001217181.py b/editors-processing/0001217181.py new file mode 100644 index 0000000..ab91666 --- /dev/null +++ b/editors-processing/0001217181.py @@ -0,0 +1,42 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001217181" +path = './factures-2021-2022-triees/%s' %num_editor +path_res='./results' +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + + # Article + item_editor = get_caracs_between(full_text,"Title","published in").replace("\n"," ") + if item_editor != "": + item_editor = "Title: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001222959.py b/editors-processing/0001222959.py new file mode 100644 index 0000000..a51d561 --- /dev/null +++ b/editors-processing/0001222959.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001222959" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Manuscript ID","Net Value").replace("\n"," ") + if item_editor != "": + item_editor = "Manuscript ID: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001226137.py b/editors-processing/0001226137.py new file mode 100644 index 0000000..75c2cdc --- /dev/null +++ b/editors-processing/0001226137.py @@ -0,0 +1,47 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001226137" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Title","Author").replace("\n"," ") + if item_editor != "": + item_editor = "Article Title: " + item_editor + + is_oa = get_caracs_between(full_text,"Item Description","Total Amount").replace("\n"," ") + item_editor = item_editor + "\nItem Description :" + is_oa + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001234181.py b/editors-processing/0001234181.py new file mode 100644 index 0000000..7a0943a --- /dev/null +++ b/editors-processing/0001234181.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001234181" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"","").replace("\n"," ") + if item_editor != "": + item_editor = "Title: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/autres.py b/editors-processing/autres.py new file mode 100644 index 0000000..8dbd11c --- /dev/null +++ b/editors-processing/autres.py @@ -0,0 +1,34 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +path = './factures-2021-2022-triees/autres' +path_res='./results' +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + #Extract infos + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+            doi=unverified_doi
+            unverified_doi=""
+
+
+    new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,'full_text': full_text}
+    df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True)
+
+df_res.to_csv(path_or_buf="%s/autres-sorted.csv"%path_res,index=False,header=True,encoding='utf-8',escapechar='\\')
\ No newline at end of file
diff --git a/editors-processing/global_functions.py b/editors-processing/global_functions.py
new file mode 100644
index 0000000..de6cb82
--- /dev/null
+++ b/editors-processing/global_functions.py
@@ -0,0 +1,94 @@
+import re
+from urllib.parse import quote
+
+from requests_ratelimiter import LimiterSession
+
+# Contact address sent to Crossref in the `mailto` query parameter; replace the
+# placeholder with a real address before running the editor scripts.
+mail_adress = "[prefix_mail]@[domaine].[fr|com]"
+# HTTP session rate-limited through LimiterSession(per_second=5).
+session = LimiterSession(per_second=5)
+
+# "10." + at least 4 digits + "/" + suffix; the dot is escaped so that only a
+# literal "." is accepted after "10".
+_DOI_REGEX = re.compile(r'\b10\.\d{4,}/[^\s]+\b')
+
+
+def find_doi(text):
+    """Return the first DOI found in ``text``, or "" when there is none."""
+    match = _DOI_REGEX.search(text)
+    return match.group() if match else ""
+
+
+def verify_doi(doi, mail=mail_adress):
+    """Check with the Crossref API whether ``doi`` exists.
+
+    Args:
+        doi (str): DOI to look up.
+        mail (str): address passed in the ``mailto`` query parameter.
+
+    Returns:
+        bool: True when the request is answered with HTTP status 200.
+    """
+    # Percent-encode the DOI: characters such as "#" or "?" may appear in a
+    # DOI suffix and would otherwise alter the structure of the URL.
+    url = f"https://api.crossref.org/works/{quote(doi, safe='/')}/agency?mailto={mail}"
+    return session.get(url).status_code == 200
+
+
+def read_all_page(reader):
+    """Return the text of all pages of ``reader``, each followed by a newline.
+
+    Args:
+        reader: object whose ``pages`` items provide ``extract_text()``.
+
+    Returns:
+        str: concatenated text of the pages.
+    """
+    return "".join(page.extract_text() + "\n" for page in reader.pages)
+
+
+def get_caracs_between(chaine, debut, fin):
+    """Return the text located between the markers ``debut`` and ``fin``.
+
+    Both markers are matched literally and the text in between may span
+    several lines; the capture stops at the first ``fin`` found after ``debut``.
+
+    Args:
+        chaine (str): text to search in.
+        debut (str): marker preceding the wanted text.
+        fin (str): marker following the wanted text.
+
+    Returns:
+        str: text between the markers, or "" when no match is found.
+    """
+    motif = re.escape(debut) + "(.*?)" + re.escape(fin)
+    resultat = re.search(motif, chaine, re.DOTALL)
+    return resultat.group(1) if resultat else ""
+
+
+def get_caracs_between_multiple(chaine, debuts, fins):
+    """Return the text captured by the leftmost match of any marker pair.
+
+    ``debuts[i]`` is paired with ``fins[i]``; the pairs are tried as
+    alternatives of one regular expression.
+
+    Args:
+        chaine (str): input text.
+        debuts (list): possible beginning markers.
+        fins (list): possible ending markers, in the same order as ``debuts``.
+
+    Returns:
+        str: text between the markers, or "" when nothing matches.
+    """
+    motifs = [re.escape(debut) + "(.*?)" + re.escape(fin) for debut, fin in zip(debuts, fins)]
+    if not motifs:
+        # An empty alternation would match anywhere without any capture group.
+        return ""
+
+    resultat = re.search("|".join(motifs), chaine, re.DOTALL)
+    if resultat is None:
+        return ""
+    # Each alternative owns its own capture group, so only the group of the
+    # alternative that matched is set; group(1) alone is None for the others.
+    return next((groupe for groupe in resultat.groups() if groupe is not None), "")
diff --git a/metric-data.py b/metric-data.py
new file mode 100644
index 0000000..4337425
--- /dev/null
+++ b/metric-data.py
@@ -0,0 +1,27 @@
+from collections import Counter
+from pathlib import Path
+
+path_bills = "factures-2021-2022"
+
+
+def metrics_editors(path):
+    """Count the invoice files of the folder ``path`` per editor code.
+
+    The editor code is the second "_"-separated field of the file name.
+    Only the file name is parsed, so a "_" in the folder path is harmless;
+    entries without such a field are counted under "autres".
+
+    Args:
+        path (str): folder holding the invoices, relative to the current directory.
+
+    Returns:
+        dict: editor code -> number of files, sorted by increasing count.
+    """
+    dic_stats = Counter()
+    for file in Path('./%s/' % path).glob('*'):
+        fields = file.name.split("_")
+        dic_stats[fields[1] if len(fields) > 1 else "autres"] += 1
+    return dict(sorted(dic_stats.items(), key=lambda item: item[1]))
+
+
+print(metrics_editors(path=path_bills))
\ No newline at end of file
diff --git a/to_do.xlsx b/to_do.xlsx
new file mode 100644
index 0000000..4016360
--- /dev/null
+++ b/to_do.xlsx
Binary files differ
diff --git a/tri-factures.py b/tri-factures.py
new file mode 100644
index 0000000..894ad64
--- /dev/null
+++ b/tri-factures.py
@@ -0,0 +1,44 @@
+import shutil
+from pathlib import Path
+import os
+import json
+
+path_bills = "factures-2021-2022"
+path_bills_output = "factures-2021-2022-triees"
+"""
+Notice d'utilisation
+ATTENTION : Les dossier d'entrée et de sortie ne doivent PAS CONTENIR de "_" : c'est réservé pour les noms de fichier pdf des factures.
+
+Ce programme permet de trier un dossier de facture par code fournisseur
+Si le code fournisseur n'est pas dans la liste id_editor2editor, ne peut être traité de manière automatique.
+"""
+# NOTE: the editor code is read from the file name only (not the full path), so
+# the restriction on "_" in folder names stated above is not required here.
+
+# Supplier codes handled automatically: keys of the JSON dictionary.
+with open('./dictionaries/code_fournisseur2fournisseur.json', 'r', encoding='utf-8') as f_in:
+    id_editor2editor = json.load(f_in)
+
+
+def copyItIn(file, repository, global_path='./%s' % path_bills_output):
+    """Copy ``file`` into the sub-folder ``repository`` of ``global_path``.
+
+    The destination folder is created when it does not exist yet.
+    """
+    destination = '%s/%s' % (global_path, repository)
+    os.makedirs(destination, exist_ok=True)
+    shutil.copy(file, destination)
+
+
+# Sort the invoices by editor code.
+for file in Path('./%s/' % path_bills).glob('*'):
+    if not file.is_file():
+        # shutil.copy cannot copy a directory.
+        continue
+    fields = file.name.split("_")
+    # Names without a "_"-separated code field go to "autres", like unknown codes.
+    code_editor = fields[1] if len(fields) > 1 else None
+    if code_editor in id_editor2editor:
+        copyItIn(file, code_editor)
+    else:
+        copyItIn(file, 'autres')