diff --git a/dictionaries/code_fournisseur2fournisseur.json b/dictionaries/code_fournisseur2fournisseur.json deleted file mode 100644 index 4275f25..0000000 --- a/dictionaries/code_fournisseur2fournisseur.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "0001021938":1, - "0001197861":2, - "0001200930":3, - "0001079074":4, - "0001021664":5, - "0001024808":6, - "0001097617":7, - "0001091455":8, - "0001217181":9, - "0001090050":10, - "0001157725":11, - "0001062310":12, - "0001021888":13, - "0001222959":14, - "0001234181":15, - "0001226137":16 -} diff --git a/editors-processing/0001021664.py b/editors-processing/0001021664.py deleted file mode 100644 index bc3f41c..0000000 --- a/editors-processing/0001021664.py +++ /dev/null @@ -1,50 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021664" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ").replace("Total Due","") - if item_editor != "": - item_editor = "Product ID + Description: " + item_editor - - else: - item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ").replace("Total Due","") - if item_editor != "": - item_editor = "Product ID + Description: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001021888.py b/editors-processing/0001021888.py deleted file mode 100644 index 6f0f056..0000000 --- a/editors-processing/0001021888.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021888" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Description","Payment terms").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001021938.py b/editors-processing/0001021938.py deleted file mode 100644 index 76e7a1e..0000000 --- a/editors-processing/0001021938.py +++ /dev/null @@ -1,52 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001021938" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - if "Author/Article:" in full_text: - item_editor = get_caracs_between(full_text,"Author/Article:","Manuscript").replace("\n"," ") - - if item_editor != "": - item_editor = "Author/Article: " + item_editor - - else: - item_editor = get_caracs_between(full_text,"Author","Manuscript ID:").replace("\n"," ") - - if item_editor != "": - item_editor = "Author" + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001024808.py b/editors-processing/0001024808.py deleted file mode 100644 index b18290e..0000000 --- a/editors-processing/0001024808.py +++ /dev/null @@ -1,50 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001024808" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ") - if item_editor != "": - item_editor = "Description: " + item_editor - - else: - item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ") - if item_editor != "": - item_editor = "Description: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001062310.py b/editors-processing/0001062310.py deleted file mode 100644 index bb39759..0000000 --- a/editors-processing/0001062310.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001062310" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Amount","Processing Charge").replace("\n"," ") - if item_editor != "": - item_editor = "Article Number + Title of Manuscript: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001079074.py b/editors-processing/0001079074.py deleted file mode 100644 index 7470402..0000000 --- a/editors-processing/0001079074.py +++ /dev/null @@ -1,42 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001079074" -path = './factures-2021-2022-triees/%s' %num_editor -path_res='./results' -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - #Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - - # propre à l'éditeur - item_editor = get_caracs_between(full_text,"Article:","Tax").replace("\n"," ") - if item_editor != "": - item_editor = "Article: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,'item_editor':item_editor,'full_text': full_text} #HERE ADD - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001090050.py b/editors-processing/0001090050.py deleted file mode 100644 index 4c122d7..0000000 --- a/editors-processing/0001090050.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001090050" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Manuscript","\n").replace("\n"," ") - if item_editor != "": - item_editor = "Manuscript: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001091455.py b/editors-processing/0001091455.py deleted file mode 100644 index f5c834b..0000000 --- a/editors-processing/0001091455.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001091455" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") - if item_editor != "": - item_editor = "Item: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001097617.py b/editors-processing/0001097617.py deleted file mode 100644 index 6500aeb..0000000 --- a/editors-processing/0001097617.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001097617" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"","").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001157725.py b/editors-processing/0001157725.py deleted file mode 100644 index 7e6c8f7..0000000 --- a/editors-processing/0001157725.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001157725" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Journal Name","DOI").replace("\n"," ") - if item_editor != "": - item_editor = "Description: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001197861.py b/editors-processing/0001197861.py deleted file mode 100644 index 93bd46d..0000000 --- a/editors-processing/0001197861.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001197861" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Description","Net Value") - if item_editor != "": - item_editor = "Description: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001200930.py b/editors-processing/0001200930.py deleted file mode 100644 index 1d06086..0000000 --- a/editors-processing/0001200930.py +++ /dev/null @@ -1,48 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001200930" - -# path = './test/%s' %num_editor -path = './factures-2021-2022-triees/%s' %num_editor - -# path_res='./test-results' -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - - # Article - item_editor = get_caracs_between(full_text,'"','"').replace("\n"," ") - if item_editor != "": - item_editor = "Article Title: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001217181.py b/editors-processing/0001217181.py deleted file mode 100644 index ab91666..0000000 --- a/editors-processing/0001217181.py +++ /dev/null @@ -1,42 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001217181" -path = './factures-2021-2022-triees/%s' %num_editor -path_res='./results' -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - - # Article - item_editor = get_caracs_between(full_text,"Title","published in").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001222959.py b/editors-processing/0001222959.py deleted file mode 100644 index a51d561..0000000 --- a/editors-processing/0001222959.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001222959" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Manuscript ID","Net Value").replace("\n"," ") - if item_editor != "": - item_editor = "Manuscript ID: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001226137.py b/editors-processing/0001226137.py deleted file mode 100644 index 75c2cdc..0000000 --- a/editors-processing/0001226137.py +++ /dev/null @@ -1,47 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001226137" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"Title","Author").replace("\n"," ") - if item_editor != "": - item_editor = "Article Title: " + item_editor - - is_oa = get_caracs_between(full_text,"Item Description","Total Amount").replace("\n"," ") - item_editor = item_editor + "\nItem Description :" + is_oa - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/0001234181.py b/editors-processing/0001234181.py deleted file mode 100644 index 7a0943a..0000000 --- a/editors-processing/0001234181.py +++ /dev/null @@ -1,44 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -num_editor = "0001234181" - -path = './factures-2021-2022-triees/%s' %num_editor - -path_res='./results' - - -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - ## Extract infos - # DOI - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - # Article - item_editor = get_caracs_between(full_text,"","").replace("\n"," ") - if item_editor != "": - item_editor = "Title: " + item_editor - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/editors-processing/autres.py b/editors-processing/autres.py deleted file mode 100644 index 8dbd11c..0000000 --- a/editors-processing/autres.py +++ /dev/null @@ -1,34 +0,0 @@ -from global_functions import * -import os -import PyPDF2 -import pandas as pd - -path = './factures-2021-2022-triees/autres' -path_res='./results' -df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","full_text"]) - -res = [] -for filename in os.listdir(path): - file = os.path.join(path, filename) - # Read PDF as text - try: - reader = PyPDF2.PdfReader(file, strict=False) - full_text = read_all_page(reader) - except: - full_text= "" - - #Extract infos - unverified_doi = find_doi(full_text) - doi = "" - #only if a doi is found - if unverified_doi: - if verify_doi(unverified_doi): - #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
- doi=unverified_doi - unverified_doi="" - - - new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,'full_text': full_text} - df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) - -df_res.to_csv(path_or_buf="%s/autres-sorted.csv"%path_res,index=False,header=True,encoding='utf-8',escapechar='\\') \ No newline at end of file diff --git a/editors-processing/global_functions.py b/editors-processing/global_functions.py deleted file mode 100644 index de6cb82..0000000 --- a/editors-processing/global_functions.py +++ /dev/null @@ -1,90 +0,0 @@ -import re -from requests_ratelimiter import LimiterSession - -mail_adress = "[prefix_mail]@[domaine].[fr|com]" -session = LimiterSession(per_second=5) - -def find_doi(text): - """ - return the first doi found in a text (input) - """ - doiRegex = r'\b10.\d{4,}\/[^\s]+\b' - doi = re.search(doiRegex, text) - if doi == None: - return "" - try: - doiStr = doi.group() - return doiStr - except: - return "" - -def verify_doi(doi,mail=mail_adress): - """ - check with crossref api if doi is correct. 
- """ - url = f"https://api.crossref.org/works/{doi}/agency?mailto={mail}" - - # Return True if DOI exists in crossref api - code_response = session.get(url).status_code - return code_response == 200 - - -def read_all_page(reader): - """ - for a reader, return all his page separate with "\n" in str format - """ - text= "" - lenPages = len(reader.pages) - for i in range(lenPages): - #transform page into text - text+= reader.pages[i].extract_text() - text+= "\n" - return text - - -def get_caracs_between(chaine, debut, fin): - """return a string between to string "debut" "fin" - - Args: - chaine (str): search in - debut (str): beginning of the string - fin (str): ending - - Returns: - str: string - """ - # Use regex - motif = re.escape(debut) + "(.*?)" + re.escape(fin) - resultat = re.search(motif, chaine,re.DOTALL) - - if resultat: - contenu_entre_chaines = resultat.group(1) - return contenu_entre_chaines - else: - return "" - -import re - -def get_caracs_between_multiple(chaine, debuts, fins): - """return a string between uncertenly beginning or ending - - Args: - chaine (_str_): input text - debuts (_list_): list of possibles begins - fins (_list_): list of possibles endings - - Returns: - _str_: caracters between begins,ends. Can take multiple ends/begins ! 
- """ - motifs = [re.escape(debut) + "(.*?)" + re.escape(fin) for debut, fin in zip(debuts, fins)] - - global_regex = "|".join(motifs) - - resultat = re.search(global_regex, chaine,re.DOTALL) - - # Vérifier si la correspondance a été trouvée - if resultat: - contenu_entre_chaines = resultat.group(1) - return contenu_entre_chaines - else: - return "" diff --git a/to_do.xlsx b/to_do.xlsx deleted file mode 100644 index 4016360..0000000 --- a/to_do.xlsx +++ /dev/null Binary files differ diff --git a/v1/editorsFunctions/0001021626.py b/v1/editorsFunctions/0001021626.py new file mode 100644 index 0000000..20470e8 --- /dev/null +++ b/v1/editorsFunctions/0001021626.py @@ -0,0 +1,45 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001021626" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Order Reference","VAT amount") + if item_editor != "": + item_editor = "Order Reference" + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') diff --git a/v1/editorsFunctions/0001021664.py b/v1/editorsFunctions/0001021664.py new file mode 100644 index 0000000..2ed621b --- /dev/null +++ b/v1/editorsFunctions/0001021664.py @@ -0,0 +1,50 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001021664" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ").replace("Total Due","") + if item_editor != "": + item_editor = "Product ID + Description: " + item_editor + + else: + item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ").replace("Total Due","") + if item_editor != "": + item_editor = "Product ID + Description: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001021793.py b/v1/editorsFunctions/0001021793.py new file mode 100644 index 0000000..fc697e6 --- /dev/null +++ b/v1/editorsFunctions/0001021793.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001021793" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") + if item_editor != "": + item_editor = "Item: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001021875.py b/v1/editorsFunctions/0001021875.py new file mode 100644 index 0000000..17c5cf1 --- /dev/null +++ b/v1/editorsFunctions/0001021875.py @@ -0,0 +1,45 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001021875" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Article","VAT") + if item_editor != "": + item_editor = "Article" + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') diff --git a/v1/editorsFunctions/0001021888.py b/v1/editorsFunctions/0001021888.py new file mode 100644 index 0000000..5552818 --- /dev/null +++ b/v1/editorsFunctions/0001021888.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001021888" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Description","Payment terms").replace("\n"," ") + if item_editor != "": + item_editor = "Title: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001021938.py b/v1/editorsFunctions/0001021938.py new file mode 100644 index 0000000..581d269 --- /dev/null +++ b/v1/editorsFunctions/0001021938.py @@ -0,0 +1,52 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001021938" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + if "Author/Article:" in full_text: + item_editor = get_caracs_between(full_text,"Author/Article:","Manuscript").replace("\n"," ") + + if item_editor != "": + item_editor = "Author/Article: " + item_editor + + else: + item_editor = get_caracs_between(full_text,"Author","Manuscript ID:").replace("\n"," ") + + if item_editor != "": + item_editor = "Author" + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001024308.py b/v1/editorsFunctions/0001024308.py new file mode 100644 index 0000000..204126b --- /dev/null +++ b/v1/editorsFunctions/0001024308.py @@ -0,0 +1,45 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001024308" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"PUBLICATION CHARGES","PAGE CHARGES").replace("\n","") + if item_editor != "": + item_editor = "PUBLICATION CHARGES" + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') diff --git a/v1/editorsFunctions/0001024313.py b/v1/editorsFunctions/0001024313.py new file mode 100644 index 0000000..bad9076 --- /dev/null +++ b/v1/editorsFunctions/0001024313.py @@ -0,0 +1,45 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001024313" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text)),"Title:","Journal:") + if item_editor != "": + item_editor = "Title: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') diff --git a/v1/editorsFunctions/0001024524.py b/v1/editorsFunctions/0001024524.py new file mode 100644 index 0000000..68d6caa --- /dev/null +++ b/v1/editorsFunctions/0001024524.py @@ -0,0 +1,45 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001024524" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Article:","Corresponding Author:") + if item_editor != "": + item_editor = "Title: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') diff --git a/v1/editorsFunctions/0001024808.py b/v1/editorsFunctions/0001024808.py new file mode 100644 index 0000000..190abea --- /dev/null +++ b/v1/editorsFunctions/0001024808.py @@ -0,0 +1,50 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001024808" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Description","Total Units").replace("\n"," ") + if item_editor != "": + item_editor = "Description: " + item_editor + + else: + item_editor = get_caracs_between(full_text,"Description","Product ").replace("\n"," ") + if item_editor != "": + item_editor = "Description: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001062310.py b/v1/editorsFunctions/0001062310.py new file mode 100644 index 0000000..97e7c53 --- /dev/null +++ b/v1/editorsFunctions/0001062310.py @@ -0,0 +1,43 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001062310" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Total","USD").replace("\n"," ").replace("Article Processing", "|| Article Processing") + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') diff --git a/v1/editorsFunctions/0001079074.py b/v1/editorsFunctions/0001079074.py new file mode 100644 index 0000000..dcfd9a5 --- /dev/null +++ b/v1/editorsFunctions/0001079074.py @@ -0,0 +1,42 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001079074" +path = './factures-2021-2022-triees/%s' %num_editor +path_res='./results' +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + #Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + + # propre à l'éditeur + item_editor = get_caracs_between(full_text,"Article:","Tax").replace("\n"," ") + if item_editor != "": + item_editor = "Article: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,'item_editor':item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} #HERE ADD + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001090050.py b/v1/editorsFunctions/0001090050.py new file mode 100644 index 0000000..37c8a2b --- /dev/null +++ b/v1/editorsFunctions/0001090050.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001090050" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Manuscript","\n").replace("\n"," ") + if item_editor != "": + item_editor = "Manuscript: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001091455.py b/v1/editorsFunctions/0001091455.py new file mode 100644 index 0000000..c4303a7 --- /dev/null +++ b/v1/editorsFunctions/0001091455.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001091455" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Item","Total").replace("\n"," ") + if item_editor != "": + item_editor = "Item: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001097617.py b/v1/editorsFunctions/0001097617.py new file mode 100644 index 0000000..adf9d34 --- /dev/null +++ b/v1/editorsFunctions/0001097617.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001097617" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"","").replace("\n"," ") + if item_editor != "": + item_editor = "Title: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001129981.py b/v1/editorsFunctions/0001129981.py new file mode 100644 index 0000000..447a490 --- /dev/null +++ b/v1/editorsFunctions/0001129981.py @@ -0,0 +1,43 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001129981" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Total Due","Check").replace("\n"," ") + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') diff --git a/v1/editorsFunctions/0001157725.py b/v1/editorsFunctions/0001157725.py new file mode 100644 index 0000000..33d3418 --- /dev/null +++ b/v1/editorsFunctions/0001157725.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001157725" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Journal Name","DOI").replace("\n"," ") + if item_editor != "": + item_editor = "Description: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001167626.py b/v1/editorsFunctions/0001167626.py new file mode 100644 index 0000000..d85bbb0 --- /dev/null +++ b/v1/editorsFunctions/0001167626.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001167626" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Article","TOTAL VALUE") + if item_editor != "": + item_editor = "Article" + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') diff --git a/v1/editorsFunctions/0001189058.py b/v1/editorsFunctions/0001189058.py new file mode 100644 index 0000000..ffb8927 --- /dev/null +++ b/v1/editorsFunctions/0001189058.py @@ -0,0 +1,45 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001189058" + +path = './factures-2019-2023-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"TITLE","REMARKS").replace( + "USD","").replace("PRICE","").replace("DISC","").replace("NET","").replace( + "VALUE","").replace("UNIT","").replace("TAX","").replace("VAT","").replace("\n","") + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar=',') diff --git a/v1/editorsFunctions/0001197861.py b/v1/editorsFunctions/0001197861.py new file mode 100644 index 0000000..821172a --- /dev/null +++ b/v1/editorsFunctions/0001197861.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001197861" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Description","Net Value") + if item_editor != "": + item_editor = "Description: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001200930.py b/v1/editorsFunctions/0001200930.py new file mode 100644 index 0000000..431af63 --- /dev/null +++ b/v1/editorsFunctions/0001200930.py @@ -0,0 +1,48 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001200930" + +# path = './test/%s' %num_editor +path = './factures-2021-2022-triees/%s' %num_editor + +# path_res='./test-results' +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + + # Article + item_editor = get_caracs_between(full_text,'"','"').replace("\n"," ") + if item_editor != "": + item_editor = "Article Title: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001217181.py b/v1/editorsFunctions/0001217181.py new file mode 100644 index 0000000..595050c --- /dev/null +++ b/v1/editorsFunctions/0001217181.py @@ -0,0 +1,42 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001217181" +path = './factures-2021-2022-triees/%s' %num_editor +path_res='./results' +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + + # Article + item_editor = get_caracs_between(full_text,"Title","published in").replace("\n"," ") + if item_editor != "": + item_editor = "Title: " + item_editor + + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001222959.py b/v1/editorsFunctions/0001222959.py new file mode 100644 index 0000000..4d235b4 --- /dev/null +++ b/v1/editorsFunctions/0001222959.py @@ -0,0 +1,44 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001222959" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
+ doi=unverified_doi + unverified_doi="" + + # Article + item_editor = get_caracs_between(full_text,"Manuscript ID","Net Value").replace("\n"," ") + if item_editor != "": + item_editor = "Manuscript ID: " + item_editor + + new_row = {'nom_complet': filename, 'doi': doi, 'unverified_doi':unverified_doi ,"item_editor":item_editor,'full_text': re.sub("\n\n+","\n",re.sub("\s\s+"," ",full_text))} + df_res = pd.concat([df_res,pd.DataFrame([new_row])],ignore_index=True) + +df_res.to_csv(path_or_buf="%s/%s.csv"%(path_res,num_editor),index=False,header=True,encoding='utf-8',escapechar='\\') diff --git a/v1/editorsFunctions/0001226137.py b/v1/editorsFunctions/0001226137.py new file mode 100644 index 0000000..d477d57 --- /dev/null +++ b/v1/editorsFunctions/0001226137.py @@ -0,0 +1,47 @@ +from global_functions import * +import os +import PyPDF2 +import pandas as pd + +num_editor = "0001226137" + +path = './factures-2021-2022-triees/%s' %num_editor + +path_res='./results' + + +df_res = pd.DataFrame(columns=["nom_complet","doi","unverified_doi","item_editor","full_text"]) + +res = [] +for filename in os.listdir(path): + file = os.path.join(path, filename) + # Read PDF as text + try: + reader = PyPDF2.PdfReader(file, strict=False) + full_text = read_all_page(reader) + except: + full_text= "" + + ## Extract infos + # DOI + unverified_doi = find_doi(full_text) + doi = "" + #only if a doi is found + if unverified_doi: + if verify_doi(unverified_doi): + #If DOI is found, write it in "doi" column instead of 'unverified_doi' column. 
"""Extract the DOI and the item title from the PDF invoices of editor 0001234181.

Every file found in the editor's invoice folder produces one row in
``./results/0001234181.csv``.
"""
import os
import re

import pandas as pd
import PyPDF2

# Explicit names instead of ``import *``: the script previously relied on the
# star import to bring ``re`` into scope.
from global_functions import find_doi, get_caracs_between, read_all_page, verify_doi

num_editor = "0001234181"

path = './factures-2021-2022-triees/%s' % num_editor

path_res = './results'

columns = ["nom_complet", "doi", "unverified_doi", "item_editor", "full_text"]

# Rows are collected in a list and turned into a DataFrame once, instead of
# re-copying the whole frame with pd.concat on every iteration.
rows = []
for filename in os.listdir(path):
    pdf_path = os.path.join(path, filename)

    # Read the PDF as text. An unreadable file still gets a row, with empty text.
    try:
        reader = PyPDF2.PdfReader(pdf_path, strict=False)
        full_text = read_all_page(reader)
    except Exception:  # was a bare ``except:``, which also swallowed Ctrl-C
        full_text = ""

    # DOI: a candidate confirmed by verify_doi moves to the "doi" column,
    # otherwise it stays in "unverified_doi".
    unverified_doi = find_doi(full_text)
    doi = ""
    if unverified_doi and verify_doi(unverified_doi):
        doi = unverified_doi
        unverified_doi = ""

    # Article.
    # TODO(review): both delimiters are empty, so the lazy match is always the
    # empty string and "item_editor" stays blank for every invoice of this
    # editor. Fill in the real start/end markers once they are known.
    item_editor = get_caracs_between(full_text, "", "").replace("\n", " ")
    if item_editor != "":
        item_editor = "Title: " + item_editor

    # NOTE: the inner substitution already turns every run of 2+ whitespace
    # characters (newlines included) into one space, so the outer one never
    # matches. Order kept as-is to leave the CSV output unchanged.
    cleaned_text = re.sub(r"\n\n+", "\n", re.sub(r"\s\s+", " ", full_text))

    rows.append({
        'nom_complet': filename,
        'doi': doi,
        'unverified_doi': unverified_doi,
        "item_editor": item_editor,
        'full_text': cleaned_text,
    })

df_res = pd.DataFrame(rows, columns=columns)

df_res.to_csv(path_or_buf="%s/%s.csv" % (path_res, num_editor), index=False, header=True, encoding='utf-8', escapechar='\\')
"""Extract the DOI and the article title from the PDF invoices of editor 0001271047.

Every file found in the editor's invoice folder produces one row in
``./results/0001271047.csv``.
"""
import os
import re

import pandas as pd
import PyPDF2

# Explicit names instead of ``import *``: the script previously relied on the
# star import to bring ``re`` into scope.
from global_functions import find_doi, get_caracs_between, read_all_page, verify_doi

num_editor = "0001271047"

path = './factures-2019-2023-triees/%s' % num_editor

path_res = './results'

columns = ["nom_complet", "doi", "unverified_doi", "item_editor", "full_text"]

# Rows are collected in a list and turned into a DataFrame once, instead of
# re-copying the whole frame with pd.concat on every iteration.
rows = []
for filename in os.listdir(path):
    pdf_path = os.path.join(path, filename)

    # Read the PDF as text. An unreadable file still gets a row, with empty text.
    try:
        reader = PyPDF2.PdfReader(pdf_path, strict=False)
        full_text = read_all_page(reader)
    except Exception:  # was a bare ``except:``, which also swallowed Ctrl-C
        full_text = ""

    # DOI: a candidate confirmed by verify_doi moves to the "doi" column,
    # otherwise it stays in "unverified_doi".
    unverified_doi = find_doi(full_text)
    doi = ""
    if unverified_doi and verify_doi(unverified_doi):
        doi = unverified_doi
        unverified_doi = ""

    # Article: text located between "Title" and "Author" (newlines are kept).
    item_editor = get_caracs_between(full_text, "Title", "Author")
    if item_editor != "":
        item_editor = "Title: " + item_editor

    # NOTE: the inner substitution already turns every run of 2+ whitespace
    # characters (newlines included) into one space, so the outer one never
    # matches. Order kept as-is to leave the CSV output unchanged.
    cleaned_text = re.sub(r"\n\n+", "\n", re.sub(r"\s\s+", " ", full_text))

    rows.append({
        'nom_complet': filename,
        'doi': doi,
        'unverified_doi': unverified_doi,
        "item_editor": item_editor,
        'full_text': cleaned_text,
    })

df_res = pd.DataFrame(rows, columns=columns)

# The escape character was ',' -- the same character as the field delimiter,
# which makes the escaping ambiguous for any reader. Use the backslash, like
# the other editor scripts do.
df_res.to_csv(path_or_buf="%s/%s.csv" % (path_res, num_editor), index=False, header=True, encoding='utf-8', escapechar='\\')
mail_adress = "[prefix_mail]@[domaine].[fr|com]"


def find_doi(text):
    """Return the first DOI-looking token found in a text.

    Args:
        text (str): text to scan.

    Returns:
        str: the first match, or "" when ``text`` is empty or contains no DOI.
    """
    if not text:
        return ""
    # The dot after the "10" prefix is escaped: left unescaped it matched any
    # character, so strings such as "1051234/abc" were reported as DOIs.
    doi = re.search(r'\b10\.\d{4,}/[^\s]+\b', text)
    return doi.group() if doi else ""


def verify_doi(doi, mail=mail_adress):
    """Check with the Crossref API whether a DOI exists.

    The request goes through the module-level rate-limited ``session``.

    Args:
        doi (str): DOI to check.
        mail (str): address sent as the ``mailto`` query parameter.

    Returns:
        bool: True when the API answers with HTTP status 200.
    """
    from urllib.parse import quote

    # Percent-encode the DOI so that characters such as "#" or "?" stay part
    # of the path instead of starting a fragment or a query string.
    encoded_doi = quote(doi, safe="/")
    url = f"https://api.crossref.org/works/{encoded_doi}/agency?mailto={mail}"

    # Return True if DOI exists in crossref api
    return session.get(url).status_code == 200


def read_all_page(reader):
    """Return the text of every page of a reader, each page followed by "\\n".

    Args:
        reader: object exposing ``pages``, each page having ``extract_text()``.

    Returns:
        str: concatenated page texts ("" when the reader has no page).
    """
    # ``or ""`` keeps the concatenation valid if a page yields None.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)


def ocr_pdf(filename):
    """Read a PDF file and return its text.

    Args:
        filename: path (or file object) handed to ``PyPDF2.PdfReader``.

    Returns:
        str: the extracted text. When reading fails, the exception message is
        returned instead of the text (existing behaviour, kept for callers).
    """
    # Read PDF as text
    try:
        reader = PyPDF2.PdfReader(filename, strict=False)
        full_text = read_all_page(reader)
    except Exception as e:
        full_text = f"{e}"

    return full_text


def get_caracs_between(chaine, debut, fin):
    """Return the text located between two markers.

    Args:
        chaine (str): text to search in.
        debut (str): marker that precedes the wanted text.
        fin (str): marker that follows the wanted text.

    Returns:
        str: shortest text between the first ``debut`` and the next ``fin``
        (newlines included), or "" when the markers are not found.
    """
    # Markers are escaped so they are matched literally; DOTALL lets the
    # captured text span several lines.
    motif = re.escape(debut) + "(.*?)" + re.escape(fin)
    resultat = re.search(motif, chaine, re.DOTALL)
    return resultat.group(1) if resultat else ""


# Already imported at the top of the module; harmless duplicate.
import re


def get_caracs_between_multiple(chaine, debuts, fins):
    """Return the text between the first matching (begin, end) marker pair.

    Markers are paired by position: ``debuts[i]`` goes with ``fins[i]``;
    extra items of the longer list are ignored.

    Args:
        chaine (str): input text.
        debuts (list): possible beginning markers.
        fins (list): possible ending markers, aligned with ``debuts``.

    Returns:
        str: text captured by the pair that matches first in ``chaine``, or ""
        when nothing matches or when no pair is given.
    """
    motifs = [re.escape(debut) + "(.*?)" + re.escape(fin) for debut, fin in zip(debuts, fins)]
    if not motifs:
        # An empty alternation would match anything and has no capture group.
        return ""

    resultat = re.search("|".join(motifs), chaine, re.DOTALL)
    if resultat is None:
        return ""

    # Each alternative owns its own capture group, so only the group of the
    # alternative that matched is set; group(1) alone was None whenever a
    # pair other than the first one matched.
    return next((groupe for groupe in resultat.groups() if groupe is not None), "")