diff --git a/hal-classifier/v1/fr/HALfr_matrixKDT_Model_Class0HalFr_FT_d60_w5_e10_b40m.npy b/hal-classifier/v1/fr/HALfr_matrixKDT_Model_Class0HalFr_FT_d60_w5_e10_b40m.npy new file mode 100644 index 0000000..f110298 --- /dev/null +++ b/hal-classifier/v1/fr/HALfr_matrixKDT_Model_Class0HalFr_FT_d60_w5_e10_b40m.npy Binary files differ diff --git a/hal-classifier/v1/fr/Model_Class0HalFr_FT_d60_w5_e10_b40m.bin b/hal-classifier/v1/fr/Model_Class0HalFr_FT_d60_w5_e10_b40m.bin new file mode 100644 index 0000000..4b96ef9 --- /dev/null +++ b/hal-classifier/v1/fr/Model_Class0HalFr_FT_d60_w5_e10_b40m.bin Binary files differ diff --git a/hal-classifier/v1/fr/classhalfr.ini b/hal-classifier/v1/fr/classhalfr.ini new file mode 100644 index 0000000..c9c9554 --- /dev/null +++ b/hal-classifier/v1/fr/classhalfr.ini @@ -0,0 +1,41 @@ +# OpenAPI Documentation - JSON format (dot notation) +mimeType = application/json +post.operationId = post-v1-fr-classhalfr +post.description = Calcule la classe de premier niveau de la base HAL pour des documents en français et renvoie la verbalisation de la classe +post.responses.default.description = Renvoie un Json composé de `id`, `value` avec `value` la verbalisation de la classe +post.responses.default.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.summary = Classification Hal pour le français +post.tags.0 = Classification supervisée +post.requestBody.required = true +post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.parameters.0.in = query +post.parameters.0.name = path +post.parameters.0.schema.type = string +post.parameters.0.description = The path in each object to enrich with a Python script +post.parameters.1.in = query +post.parameters.1.name = indent +post.parameters.1.schema.type = boolean +post.parameters.1.description = Indent or not the JSON Result + +[use] +plugin = @ezs/spawn +plugin = @ezs/basics +plugin = @ezs/storage +plugin = @ezs/analytics + +[JSONParse]
+#legacy = false
+separator = *
+
+[expand]
+path = env('path', 'value')
+size = 100
+# in production mode, uncomment the following line
+# cache = boost
+
+[expand/exec]
+# command should be executable !
+command = ./v1/fr/dist_FAISSkn3_HAL_fr_v1.py
+
+[dump]
+indent = env('indent', false)
diff --git a/hal-classifier/v1/fr/dicoFR_nd.pkl b/hal-classifier/v1/fr/dicoFR_nd.pkl
new file mode 100644
index 0000000..988c32d
--- /dev/null
+++ b/hal-classifier/v1/fr/dicoFR_nd.pkl
Binary files differ
diff --git a/hal-classifier/v1/fr/dist_FAISSkn3_HAL_fr_v1.py b/hal-classifier/v1/fr/dist_FAISSkn3_HAL_fr_v1.py
new file mode 100644
index 0000000..0e3c3c2
--- /dev/null
+++ b/hal-classifier/v1/fr/dist_FAISSkn3_HAL_fr_v1.py
@@ -0,0 +1,128 @@
+#!/opt/bitnami/python/bin/python3.7
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Aug 12 10:19:31 2021
+
+@author: cuxac
+
+Classify French documents into the first-level classes of the HAL database.
+
+Each line read from stdin is a JSON object whose ``value`` holds the text to
+classify.  The text is embedded with a fastText model, its K nearest reference
+vectors are fetched from a FAISS flat L2 index, and the class is chosen by an
+inverse-distance weighted vote.  ``value`` is then replaced by the description
+of the winning class and the object is written to stdout as one JSON line.
+"""
+
+import json
+import operator
+import pickle
+import statistics  # noqa: F401 - unused here; import kept as in the original
+import sys
+from collections import defaultdict
+
+import faiss
+import fasttext
+import numpy as np
+from more_itertools import locate  # noqa: F401 - unused here; import kept
+from scipy.special import softmax
+
+MODEL_PATH = "./v1/fr/Model_Class0HalFr_FT_d60_w5_e10_b40m.bin"
+MATRIX_PATH = "./v1/fr/HALfr_matrixKDT_Model_Class0HalFr_FT_d60_w5_e10_b40m.npy"
+LABELS_PATH = "./v1/fr/dicoFR_nd.pkl"
+
+# Number of nearest neighbours that take part in the vote.
+K = 50
+# A record is only written out when its nearest neighbour is closer than this.
+MAX_NEAREST_DISTANCE = 100
+# Floor applied to a distance before it is inverted: an exact match (distance 0)
+# then gets a large finite weight instead of inf, which softmax turns into nan.
+MIN_DISTANCE = 1e-12
+
+verb_class = {
+    "chim": {"code": "chim", "labelFr": "Chimie", "labelEn": "Chemical Sciences"},
+    "info": {"code": "info", "labelFr": "Informatique [cs]", "labelEn": "Computer Science [cs]"},
+    "math": {"code": "math", "labelFr": "Mathématiques [math]", "labelEn": "Mathematics [math]"},
+    "nlin": {"code": "nlin", "labelFr": "Science non linéaire [physics]", "labelEn": "Nonlinear Sciences [physics]"},
+    "phys": {"code": "phys", "labelFr": "Physique [physics]", "labelEn": "Physics [physics]"},
+    "qfin": {"code": "qfin", "labelFr": "Économie et finance quantitative [q-fin]", "labelEn": "Quantitative Finance [q-fin]"},
+    "scco": {"code": "scco", "labelFr": "Sciences cognitives", "labelEn": "Cognitive science"},
+    "sde": {"code": "sde", "labelFr": "Sciences de l'environnement", "labelEn": "Environmental Sciences"},
+    "sdu": {"code": "sdu", "labelFr": "Planète et Univers [physics]", "labelEn": "Sciences of the Universe [physics]"},
+    "sdv": {"code": "sdv", "labelFr": "Sciences du Vivant [q-bio]", "labelEn": "Life Sciences [q-bio]"},
+    "shs": {"code": "shs", "labelFr": "Sciences de l'Homme et Société", "labelEn": "Humanities and Social Sciences"},
+    "spi": {"code": "spi", "labelFr": "Sciences de l'ingénieur [physics]", "labelEn": "Engineering Sciences [physics]"},
+    "stat": {"code": "stat", "labelFr": "Statistiques [stat]", "labelEn": "Statistics [stat]"},
+}
+
+
+def load_resources():
+    """Load the fastText model, build the FAISS index and read the label map.
+
+    Returns:
+        A ``(model, index, labels)`` tuple; ``labels`` is indexed by the row
+        numbers returned by ``index.search``.
+    """
+    model = fasttext.load_model(MODEL_PATH)
+    vectors = np.load(MATRIX_PATH)
+    # Take the dimension from the matrix rather than hard-coding it.
+    index = faiss.IndexFlatL2(vectors.shape[1])
+    index.add(vectors)
+    # Local file shipped with the service; never unpickle untrusted data.
+    with open(LABELS_PATH, "rb") as handle:
+        labels = pickle.load(handle)
+    return model, index, labels
+
+
+def classify(text, model, index, labels):
+    """Return the class code voted for *text* and its nearest-neighbour distance.
+
+    Args:
+        text: Document text to classify.
+        model: Loaded fastText model used to embed the text.
+        index: FAISS index holding the reference vectors.
+        labels: Mapping from index row number to neighbour label.
+
+    Returns:
+        ``(code, nearest_distance)``; ``code`` is the class code extracted
+        from the winning label.
+    """
+    # fastText raises ValueError on text containing "\n", so inner line breaks
+    # (not removed by strip()) are replaced by spaces.
+    sentence = text.strip().replace("\n", " ")
+    vector = model.get_sentence_vector(sentence)
+    query = np.asarray(vector, dtype="float32").reshape(1, -1)
+
+    distances, rows = index.search(query, K)
+    distances, rows = distances[0], rows[0]
+
+    neighbour_labels = [labels[row] for row in rows]
+    weights = defaultdict(float)
+    # set() keeps one copy of identical (label, distance) pairs.
+    for label, distance in set(zip(neighbour_labels, distances)):
+        weights[label] += 1 / max(distance, MIN_DISTANCE)
+
+    scores = dict(zip(weights, softmax(np.array(list(weights.values())))))
+    best_label = max(scores.items(), key=operator.itemgetter(1))[0]
+    # The class code is the second "_"-separated field of the label, cut at
+    # its first ".".
+    code = best_label.split('_')[1].split('.')[0]
+    return code, distances[0]
+
+
+def main():
+    """Classify every JSON line of stdin and write the results to stdout."""
+    model, index, labels = load_resources()
+    for line in sys.stdin:
+        data = json.loads(line)
+        code, nearest = classify(data['value'], model, index, labels)
+        # NOTE(review): records failing this test produce no output line at
+        # all; confirm the calling pipeline tolerates dropped records.
+        if nearest < MAX_NEAREST_DISTANCE:
+            data['value'] = verb_class[code]
+            sys.stdout.write(json.dumps(data))
+            sys.stdout.write('\n')
+
+
+if __name__ == "__main__":
+    main()