diff --git a/data-computer/requirements.txt b/data-computer/requirements.txt
index 1a0fc4f..7a892f5 100755
--- a/data-computer/requirements.txt
+++ b/data-computer/requirements.txt
@@ -1,3 +1,4 @@
 gensim==4.3.2
 spacy==3.6.1
+scikit-learn==1.3.2
 en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
diff --git a/data-computer/v1/hdbscan.py b/data-computer/v1/hdbscan.py
index 71c07b0..4553f43 100755
--- a/data-computer/v1/hdbscan.py
+++ b/data-computer/v1/hdbscan.py
@@ -3,11 +3,13 @@
 import json
 from sklearn.cluster import HDBSCAN
 from sklearn.feature_extraction.text import TfidfVectorizer
+from scipy.spatial import distance
 import sys
 import unicodedata
 import re
 import spacy
+import matplotlib.pyplot as plt
 
 #normalize text
@@ -57,22 +59,33 @@
         texts.append(lemmatize(uniformize(line["value"])))
     else:
         texts.append("n/a")
-vectorizer = TfidfVectorizer(stop_words=stopwords,analyzer="word",max_df=0.8,min_df=2) #delete terms with high/low frequences
+vectorizer = TfidfVectorizer(stop_words=stopwords,analyzer="word",max_df=0.8,min_df=3) #delete terms with high/low frequences
 X = vectorizer.fit_transform(texts)
 
 # HDBSCAN
-min_cluster_size = max(2,2*int(len(texts)/100))
-
+cosine_dist_X = distance.cdist(X.toarray(), X.toarray(), metric='cosine')
 clusterer = HDBSCAN(
     algorithm='auto',
-    metric='euclidean', #metric = cosine ?
-    min_cluster_size=min_cluster_size,
+    metric='precomputed', #metric = cosine ?
+    min_cluster_size=5,
+    cluster_selection_epsilon=0.5,
+    cluster_selection_method="eom",
     n_jobs=1)
-clusterer.fit(X)
+clusterer.fit(cosine_dist_X)
+print(clusterer)
+exit()
+tests = []
+for i in range(len(all_data)):
+    tests.append({"cluster":int(clusterer.labels_[i]+1), "weight":str(clusterer.probabilities_[i])})
+sys.stdout.write(json.dumps(tests))
+plt.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', s=50)
+plt.show()
+
+exit()
 # extract infos
 res = []
 for i in range(len(all_data)):
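
For reference, a minimal self-contained sketch of the pattern the hdbscan.py hunk adopts: TF-IDF vectors, a precomputed cosine-distance matrix, and scikit-learn's HDBSCAN with metric='precomputed'. The sample texts and the parameter values here are illustrative placeholders, not values taken from this repository.

# sketch.py -- illustrative only, assumes scikit-learn>=1.3 and scipy
from sklearn.cluster import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial import distance

# toy corpus standing in for the lemmatized/uniformized texts
texts = [
    "machine learning for text",
    "deep learning for text",
    "vegetable gardening tips",
    "organic vegetable gardening",
    "clustering short text documents",
]

# TF-IDF vectors (min_df/max_df left at defaults for this tiny corpus)
X = TfidfVectorizer(analyzer="word").fit_transform(texts)

# pairwise cosine distances, fed to HDBSCAN as a precomputed matrix
cosine_dist_X = distance.cdist(X.toarray(), X.toarray(), metric="cosine")

clusterer = HDBSCAN(
    metric="precomputed",
    min_cluster_size=2,              # small value only because the corpus is tiny
    cluster_selection_method="eom",
)
clusterer.fit(cosine_dist_X)

# labels_ holds the cluster id per document (-1 = noise),
# probabilities_ the strength of each assignment
for text, label, prob in zip(texts, clusterer.labels_, clusterer.probabilities_):
    print(label, f"{prob:.2f}", text)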