# Review notes (preamble text placed ahead of the first "diff --git" header; the patch payload below is unchanged).
#
# Purpose of the patch, as far as the visible hunks show:
#   * requirements.txt: pins numpy to 1.19.1 and scipy to 1.5.4, and adds faiss-cpu==1.7.2
#     (plus one trailing empty line).
#   * dist_kn3_HAL_en_v1.py: comments out the scipy cKDTree import, imports faiss, builds a
#     faiss.IndexFlatL2 over the matrix D at module load, and replaces the two kdtree.query()
#     calls with a single quantizer.search(t, K) call.
#
# Things to confirm before merging:
#   * NOTE(review): the removed code asked cKDTree for p=2 (Euclidean) distances. FAISS documents
#     IndexFlatL2 as reporting *squared* L2 distances -- confirm. If so, Dis[0]/dmax_k and
#     ppv[0][0]/dmax_k become ratios of squared distances and every downstream value derived from
#     them changes; applying np.sqrt to Dis would restore the previous scale.
#   * d=200 is hard-coded. Presumably it must equal both D.shape[1] and the fastText sentence-vector
#     size -- TODO confirm; reading it from D.shape[1] would remove the duplication.
#   * NOTE(review): FAISS is documented to expect C-contiguous float32 arrays. The dtype of the
#     array loaded into D is not visible here, and t is produced with np.asmatrix (an np.matrix,
#     not a plain ndarray) -- verify both are accepted as intended.
#   * The three load paths change from "./v1/en/<file>" to bare file names, so they now resolve
#     against the process working directory; confirm the service is started from v1/en/.
#   * dico_nd_pkl is opened with open(...) and no close() is visible in these hunks; a
#     `with open(...)` block would release the handle deterministically.
#   * dmax_k = Dis[0][K-1] is used as a divisor; it is zero whenever the K-th neighbour coincides
#     with the query. Behaviour when K exceeds the number of indexed vectors should also be checked.
#   * The inline comment "#ppv=Dis et Ind" is French for "ppv = Dis and Ind"; the old kdtree calls
#     are kept as commented-out lines rather than deleted.
#   * The name `quantizer` holds a flat exact-search index (faiss.IndexFlatL2), per the visible code.
#
diff --git a/hal-classifier/requirements.txt b/hal-classifier/requirements.txt index c2d846a..5a40abb 100644 --- a/hal-classifier/requirements.txt +++ b/hal-classifier/requirements.txt @@ -1,4 +1,6 @@ more-itertools -numpy -scipy +numpy==1.19.1 +scipy==1.5.4 fasttext +faiss-cpu==1.7.2 + diff --git a/hal-classifier/v1/en/dist_kn3_HAL_en_v1.py b/hal-classifier/v1/en/dist_kn3_HAL_en_v1.py index 3d859d0..bf3d4d1 100644 --- a/hal-classifier/v1/en/dist_kn3_HAL_en_v1.py +++ b/hal-classifier/v1/en/dist_kn3_HAL_en_v1.py @@ -9,7 +9,8 @@ -from scipy.spatial import cKDTree as KDTree +#from scipy.spatial import cKDTree as KDTree +import faiss from scipy.special import softmax import numpy as np import sys @@ -22,15 +23,15 @@ import fasttext +model=fasttext.load_model("modelhal0EN2.bin") +D=np.load("HALen_matrixKDT.npy") - -model=fasttext.load_model("./v1/en/modelhal0EN2.bin") -D=np.load("./v1/en/HALen_matrixKDT.npy") - -dico_nd_pkl=open("./v1/en/dico_nd.pkl","rb") +dico_nd_pkl=open("dico_nd.pkl","rb") dico_nd=pickle.load(dico_nd_pkl) - +d=200 +quantizer = faiss.IndexFlatL2(d) +quantizer.add(D) verb_class={"spi":"Sciences de l'ingénieur [physics]","shs":"Sciences de l'Homme et Société","sdv":"Sciences du Vivant [q-bio]","sdu":"Planète et Univers [physics]","sde":"Sciences de l'environnement","scco":"Sciences cognitives","phys":"Physique [physics]","nlin":"Science non linéaire [physics]","math":"Mathématiques [math]","info":"Informatique [cs]","chim":"Chimie","stat":"Statistiques","qfin":"Économie et finance quantitative [q-fin]"} @@ -48,9 +49,15 @@ text=data['value'] mv=model.get_sentence_vector(text.strip()) + t=np.asmatrix(mv) - ppv=kdtree.query(mv,k=K,p=2)#,distance_upper_bound=0.05) - dmax_k=kdtree.query(mv,k=[K],p=2)[0][0] + Dis, Ind = quantizer.search(t,K) + ppv=Dis[0],Ind[0] #ppv=Dis et Ind + + #ppv=kdtree.query(mv,k=K,p=2)#,distance_upper_bound=0.05) + #dmax_k=kdtree.query(mv,k=[K],p=2)[0][0] + + dmax_k=Dis[0][K-1] dN_ppv=ppv[0]/dmax_k distlist.append(ppv[0][0]/dmax_k)