diff --git a/hal-classifier/requirements.txt b/hal-classifier/requirements.txt deleted file mode 100644 index 5a40abb..0000000 --- a/hal-classifier/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -more-itertools -numpy==1.19.1 -scipy==1.5.4 -fasttext -faiss-cpu==1.7.2 - diff --git a/hal-classifier/v1/en/HALen_matrixKDT.npy b/hal-classifier/v1/en/HALen_matrixKDT.npy deleted file mode 100644 index b1759e2..0000000 --- a/hal-classifier/v1/en/HALen_matrixKDT.npy +++ /dev/null Binary files differ diff --git a/hal-classifier/v1/en/classhalen.ini b/hal-classifier/v1/en/classhalen.ini deleted file mode 100644 index 9f111b7..0000000 --- a/hal-classifier/v1/en/classhalen.ini +++ /dev/null @@ -1,54 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -mimeType = application/json -post.operationId = post-v1-en-classhalen -post.description = Calcule la classe de premier niveau de la base HAL pour des documents en anglais et renvoie la verbalisation de la classe -post.responses.default.description = Renvoie un Json composé de `id`, `value` avec `value` la verbalisation en français de la classe -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Classification HAL -post.tags.0 = Classification supervisée -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = path -post.parameters.0.schema.type = string -post.parameters.0.description = The path in each object to enrich with an Python script -post.parameters.1.in = query -post.parameters.1.name = indent -post.parameters.1.schema.type = boolean -post.parameters.1.description = Indent or not the JSON Result - -# Examples -post.requestBody.content.application/json.example.0.id = 1 -post.requestBody.content.application/json.example.0.value = In the southern French Massif Central, the Montagne Noire axial zone is a NE-SW elongated granite-migmatite dome emplaced within Visean south-verging recumbent folds and intruded by syn- to late-migmatization granitoids. The tectonic setting of this dome is still disputed, thus several models have been proposed. In order to better understand the emplacement mechanism of this dome, petrofabric and Anisotropy of Magnetic Susceptibility (AMS) studies have been carried out. In the granites and migmatites that form the dome core, magmatic texture and to a lesser extent weak solid-state texture are dominant. As a paramagnetic mineral, biotite is the main carrier of the magnetic susceptibility. On the basis of 135 AMS sites, the magnetic fabrics appear as independent of the lithology but related to the dome architecture. Coupling our results with previous structural and geochronological studies, allows us to propose a new emplacement model. Between 340-325 Ma, the Palaeozoic series underwent a compressional deformation represented by nappes and recumbent folds involving the thermal event leading to partial melting. Until ~325-310 Ma, the dome emplacement was assisted by diapiric processes. An extensional event took place at 300 Ma, after the emplacement of the late to post-migmatitic granitic plutons. In the northeast side of the dome, a brittle normal-dextral faulting controlled the opening of the Graissessac coal-basin. -post.requestBody.content.application/json.example.1.id = 2 -post.requestBody.content.application/json.example.1.value = The COVID-19 pandemic, also known as the coronavirus pandemic, is an ongoing global pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus2 (SARS-CoV-2). It was first identified in December 2019 in Wuhan, China. The World Health Organization declared the outbreak a Public Health Emergency of International Concern on 20 January 2020, and later a pandemic on 11 March 2020. As of 2 April 2021, more than 129 million cases have been confirmed, with more than 2.82 million deaths attributed to COVID-19, making it one of the deadliest pandemics in history. -post.responses.default.content.application/json.example.0.id = 1 -post.responses.default.content.application/json.example.0.value.code = sdu -post.responses.default.content.application/json.example.0.value.labelFr = Planète et Univers [physics] -post.responses.default.content.application/json.example.0.value.labelEn = Sciences of the Universe [physics] -post.responses.default.content.application/json.example.1.id = 2 -post.responses.default.content.application/json.example.1.value.code = sdv -post.responses.default.content.application/json.example.1.value.labelFr = Sciences du Vivant [q-bio] -post.responses.default.content.application/json.example.1.value.labelEn = Life Sciences [q-bio] - -[use] -plugin = @ezs/spawn -plugin = @ezs/basics -plugin = @ezs/storage -plugin = @ezs/analytics - -[JSONParse] -separator = * - -[expand] -path = env('path', 'value') -size = 100 -# in production mode, uncomment the following line -# cache = boost - -[expand/exec] -# command should be executable ! -command = ./v1/en/dist_kn3_HAL_en_v1.py - -[dump] -indent = env('indent', false) diff --git a/hal-classifier/v1/en/dico_nd.pkl b/hal-classifier/v1/en/dico_nd.pkl deleted file mode 100644 index a4c187a..0000000 --- a/hal-classifier/v1/en/dico_nd.pkl +++ /dev/null Binary files differ diff --git a/hal-classifier/v1/en/dist_kn3_HAL_en_v1.py b/hal-classifier/v1/en/dist_kn3_HAL_en_v1.py deleted file mode 100755 index e4fbe3d..0000000 --- a/hal-classifier/v1/en/dist_kn3_HAL_en_v1.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/opt/bitnami/python/bin/python3.7 -# -*- coding: utf-8 -*- -""" -Created on Thu Aug 12 10:19:31 2021 - -@author: cuxac -""" - - - - -#from scipy.spatial import cKDTree as KDTree -import faiss -from scipy.special import softmax -import numpy as np -import sys -import json -from collections import defaultdict#,Counter -import operator -from more_itertools import locate -import statistics -import pickle -import fasttext - - -model=fasttext.load_model("./v1/en/modelhal0EN2.bin") -D=np.load("./v1/en/HALen_matrixKDT.npy") - -dico_nd_pkl=open("./v1/en/dico_nd.pkl","rb") -dico_nd=pickle.load(dico_nd_pkl) - -d=200 -quantizer = faiss.IndexFlatL2(d) -quantizer.add(D) - -#verb_class={"spi":"Sciences de l'ingénieur [physics]","shs":"Sciences de l'Homme et Société","sdv":"Sciences du Vivant [q-bio]","sdu":"Planète et Univers [physics]","sde":"Sciences de l'environnement","scco":"Sciences cognitives","phys":"Physique [physics]","nlin":"Science non linéaire [physics]","math":"Mathématiques [math]","info":"Informatique [cs]","chim":"Chimie","stat":"Statistiques","qfin":"Économie et finance quantitative [q-fin]"} - -verb_class={"chim":{"code":"chim","labelFr":"Chimie","labelEn":"Chemical Sciences"}, - "info":{"code":"info","labelFr":"Informatique [cs]","labelEn":"Computer Science [cs]"}, - "math":{"code":"math","labelFr":"Mathématiques [math]","labelEn":"Mathematics [math]"}, - "nlin":{"code":"nlin","labelFr":"Science non linéaire [physics]","labelEn":"Nonlinear Sciences [physics]"}, - "phys":{"code":"phys","labelFr":"Physique [physics]","labelEn":"Physics [physics]"}, - "qfin":{"code":"qfin","labelFr":"Économie et finance quantitative [q-fin]","labelEn":"Quantitative Finance [q-fin]"}, - "scco":{"code":"scco","labelFr":"Sciences cognitives","labelEn":"Cognitive science"}, - "sde":{"code":"sde","labelFr":"Sciences de l'environnement","labelEn":"Environmental Sciences"}, - "sdu":{"code":"sdu","labelFr":"Planète et Univers [physics]","labelEn":"Sciences of the Universe [physics]"}, - "sdv":{"code":"sdv","labelFr":"Sciences du Vivant [q-bio]","labelEn":"Life Sciences [q-bio]"}, - "shs":{"code":"shs","labelFr":"Sciences de l'Homme et Société","labelEn":"Humanities and Social Sciences"}, - "spi":{"code":"spi","labelFr":"Sciences de l'ingénieur [physics]","labelEn":"Engineering Sciences [physics]"}, - "stat":{"code":"stat","labelFr":"Statistiques [stat]","labelEn":"Statistics [stat]"}} - -#kdtree=KDTree(D) - - -distlist=[] -list_defis=[] - -n=0 -K=50 - -for line in sys.stdin: - data = json.loads(line) - text=data['value'] - - mv=model.get_sentence_vector(text.strip()) - t=np.asmatrix(mv) - - Dis, Ind = quantizer.search(t,K) - ppv=Dis[0],Ind[0] #ppv=Dis et Ind - - #ppv=kdtree.query(mv,k=K,p=2)#,distance_upper_bound=0.05) - #dmax_k=kdtree.query(mv,k=[K],p=2)[0][0] - - dmax_k=Dis[0][K-1] - dN_ppv=ppv[0]/dmax_k - distlist.append(ppv[0][0]/dmax_k) - - list_defis=list(ppv[1]) - list_defis_label=list(dico_nd[i] for i in list_defis) - - r=zip(list_defis_label,list(ppv[0])) - dis=defaultdict(list) - for i in set(r): - dis[i[0]].append(1/i[1]) - for k in dis.keys(): - dis[k]=sum(dis[k]) - ddd=np.array(list(dis.values())) - sm=softmax(ddd) - lab=list(dis.keys()) - res=zip(lab,sm) - d4=dict(res) - classmax=max(d4.items(),key=operator.itemgetter(1))[0] - cmax=classmax.split('_')[1].split('.')[0] - - - if ppv[0][0]<100: - - mm=max(list_defis_label,key=list_defis_label.count) - - ind=list(locate(list_defis_label, lambda a: a ==mm)) - indd=list(dN_ppv) - dmax=[indd[i] for i in ind] - dist_mean=statistics.mean(dmax) - prob0=round((len(ind))/(K+len(set(list_defis_label))),3) - prob=d4[classmax] - if len(ind)<25: - dist_mean=str(round(dist_mean,3))+' / '+str(len(ind)) - - data['value']=verb_class[cmax]#,prob - sys.stdout.write(json.dumps(data)) - sys.stdout.write('\n') - - n+=1 - - - - - - diff --git a/hal-classifier/v1/en/modelhal0EN2.bin b/hal-classifier/v1/en/modelhal0EN2.bin deleted file mode 100644 index 1a27209..0000000 --- a/hal-classifier/v1/en/modelhal0EN2.bin +++ /dev/null Binary files differ diff --git a/hal-classifier/v1/fr/HALfr_matrixKDT_Model_Class0HalFr_FT_d60_w5_e10_b40m.npy b/hal-classifier/v1/fr/HALfr_matrixKDT_Model_Class0HalFr_FT_d60_w5_e10_b40m.npy deleted file mode 100644 index f110298..0000000 --- a/hal-classifier/v1/fr/HALfr_matrixKDT_Model_Class0HalFr_FT_d60_w5_e10_b40m.npy +++ /dev/null Binary files differ diff --git a/hal-classifier/v1/fr/Model_Class0HalFr_FT_d60_w5_e10_b40m.bin b/hal-classifier/v1/fr/Model_Class0HalFr_FT_d60_w5_e10_b40m.bin deleted file mode 100644 index 4b96ef9..0000000 --- a/hal-classifier/v1/fr/Model_Class0HalFr_FT_d60_w5_e10_b40m.bin +++ /dev/null Binary files differ diff --git a/hal-classifier/v1/fr/classhalfr.ini b/hal-classifier/v1/fr/classhalfr.ini deleted file mode 100644 index a1c33b1..0000000 --- a/hal-classifier/v1/fr/classhalfr.ini +++ /dev/null @@ -1,40 +0,0 @@ -# OpenAPI Documentation - JSON format (dot notation) -mimeType = application/json -post.operationId = post-v1-fr-classhalfr -post.description = Calcule la classe de premier niveau de la base HAL pour des documents en français et renvoie la verbalisation de la classe -post.responses.default.description = Renvoie un Json composé de `id`, `value` avec `value` la verbalisation de la classe -post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.summary = Classification Hal pour le français -post.tags.0 = Classification supervisée -post.requestBody.required = true -post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream -post.parameters.0.in = query -post.parameters.0.name = path -post.parameters.0.schema.type = string -post.parameters.0.description = The path in each object to enrich with an Python script -post.parameters.1.in = query -post.parameters.1.name = indent -post.parameters.1.schema.type = boolean -post.parameters.1.description = Indent or not the JSON Result - -[use] -plugin = @ezs/spawn -plugin = @ezs/basics -plugin = @ezs/storage -plugin = @ezs/analytics - -[JSONParse] -separator = * - -[expand] -path = env('path', 'value') -size = 100 -# in production mode, uncomment the following line -# cache = boost - -[expand/exec] -# command should be executable ! -command = ./v1/fr/dist_FAISSkn3_HAL_fr_v1.py - -[dump] -indent = env('indent', false) diff --git a/hal-classifier/v1/fr/dicoFR_nd.pkl b/hal-classifier/v1/fr/dicoFR_nd.pkl deleted file mode 100644 index 988c32d..0000000 --- a/hal-classifier/v1/fr/dicoFR_nd.pkl +++ /dev/null Binary files differ diff --git a/hal-classifier/v1/fr/dist_FAISSkn3_HAL_fr_v1.py b/hal-classifier/v1/fr/dist_FAISSkn3_HAL_fr_v1.py deleted file mode 100644 index 0e3c3c2..0000000 --- a/hal-classifier/v1/fr/dist_FAISSkn3_HAL_fr_v1.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/opt/bitnami/python/bin/python3.7 -# -*- coding: utf-8 -*- -""" -Created on Thu Aug 12 10:19:31 2021 - -@author: cuxac -""" - - - - -#from scipy.spatial import cKDTree as KDTree -import faiss -from scipy.special import softmax -import numpy as np -import sys -import json -from collections import defaultdict#,Counter -import operator -from more_itertools import locate -import statistics -import pickle -import fasttext - - -model=fasttext.load_model("./v1/fr/Model_Class0HalFr_FT_d60_w5_e10_b40m.bin") -D=np.load("./v1/fr/HALfr_matrixKDT_Model_Class0HalFr_FT_d60_w5_e10_b40m.npy") - -dico_nd_pkl=open("./v1/fr/dicoFR_nd.pkl","rb") -dico_nd=pickle.load(dico_nd_pkl) - -d=60 -quantizer = faiss.IndexFlatL2(d) -quantizer.add(D) - -##verb_class={"spi":"Sciences de l'ingénieur [physics]","shs":"Sciences de l'Homme et Société","sdv":"Sciences du Vivant [q-bio]","sdu":"Planète et Univers [physics]","sde":"Sciences de l'environnement","scco":"Sciences cognitives","phys":"Physique [physics]","nlin":"Science non linéaire [physics]","math":"Mathématiques [math]","info":"Informatique [cs]","chim":"Chimie","stat":"Statistiques","qfin":"Économie et finance quantitative [q-fin]"} - -verb_class={"chim":{"code":"chim","labelFr":"Chimie","labelEn":"Chemical Sciences"}, - "info":{"code":"info","labelFr":"Informatique [cs]","labelEn":"Computer Science [cs]"}, - "math":{"code":"math","labelFr":"Mathématiques [math]","labelEn":"Mathematics [math]"}, - "nlin":{"code":"nlin","labelFr":"Science non linéaire [physics]","labelEn":"Nonlinear Sciences [physics]"}, - "phys":{"code":"phys","labelFr":"Physique [physics]","labelEn":"Physics [physics]"}, - "qfin":{"code":"qfin","labelFr":"Économie et finance quantitative [q-fin]","labelEn":"Quantitative Finance [q-fin]"}, - "scco":{"code":"scco","labelFr":"Sciences cognitives","labelEn":"Cognitive science"}, - "sde":{"code":"sde","labelFr":"Sciences de l'environnement","labelEn":"Environmental Sciences"}, - "sdu":{"code":"sdu","labelFr":"Planète et Univers [physics]","labelEn":"Sciences of the Universe [physics]"}, - "sdv":{"code":"sdv","labelFr":"Sciences du Vivant [q-bio]","labelEn":"Life Sciences [q-bio]"}, - "shs":{"code":"shs","labelFr":"Sciences de l'Homme et Société","labelEn":"Humanities and Social Sciences"}, - "spi":{"code":"spi","labelFr":"Sciences de l'ingénieur [physics]","labelEn":"Engineering Sciences [physics]"}, - "stat":{"code":"stat","labelFr":"Statistiques [stat]","labelEn":"Statistics [stat]"}} - -#kdtree=KDTree(D) - - -distlist=[] -list_defis=[] - -n=0 -K=50 - -for line in sys.stdin: - data = json.loads(line) - text=data['value'] - - mv=model.get_sentence_vector(text.strip()) - t=np.asmatrix(mv) - - Dis, Ind = quantizer.search(t,K) - ppv=Dis[0],Ind[0] #ppv=Dis et Ind - - #ppv=kdtree.query(mv,k=K,p=2)#,distance_upper_bound=0.05) - #dmax_k=kdtree.query(mv,k=[K],p=2)[0][0] - - dmax_k=Dis[0][K-1] - dN_ppv=ppv[0]/dmax_k - distlist.append(ppv[0][0]/dmax_k) - - list_defis=list(ppv[1]) - list_defis_label=list(dico_nd[i] for i in list_defis) - - r=zip(list_defis_label,list(ppv[0])) - dis=defaultdict(list) - for i in set(r): - dis[i[0]].append(1/i[1]) - for k in dis.keys(): - dis[k]=sum(dis[k]) - ddd=np.array(list(dis.values())) - sm=softmax(ddd) - lab=list(dis.keys()) - res=zip(lab,sm) - d4=dict(res) - classmax=max(d4.items(),key=operator.itemgetter(1))[0] - cmax=classmax.split('_')[1].split('.')[0] - - - if ppv[0][0]<100: - - mm=max(list_defis_label,key=list_defis_label.count) - - ind=list(locate(list_defis_label, lambda a: a ==mm)) - indd=list(dN_ppv) - dmax=[indd[i] for i in ind] - dist_mean=statistics.mean(dmax) - prob0=round((len(ind))/(K+len(set(list_defis_label))),3) - prob=d4[classmax] - if len(ind)<25: - dist_mean=str(round(dist_mean,3))+' / '+str(len(ind)) - - data['value']=verb_class[cmax]#,prob - sys.stdout.write(json.dumps(data)) - sys.stdout.write('\n') - - n+=1 - - - - - -