diff --git a/data-computer/v1/hdbscan.py b/data-computer/v1/hdbscan.py
index 2d0b3d1..3742b8b 100755
--- a/data-computer/v1/hdbscan.py
+++ b/data-computer/v1/hdbscan.py
@@ -41,17 +41,17 @@
 # # PCA
 # from sklearn.decomposition import PCA
 # from sklearn.preprocessing import StandardScaler
-# def reduce_dim(tfidf_matrix,nb_components=100):
+# def reduce_dim(tfidf_matrix,n_components=0.9):
 # # center and reduce
 # scaler = StandardScaler()
 # scaler.fit(tfidf_matrix)
 # tfidf_matrix=scaler.transform(tfidf_matrix)
-# mypca = PCA(nb_components)
+# mypca = PCA(n_components=n_components,svd_solver="full")
 # mypca.fit(tfidf_matrix)
-# pca_mat = mypca.fit_transform(tfidf_matrix)
-# print(mypca.noise_variance_)
+# pca_mat = mypca.fit_transform(tfidf_matrix)
+# # print(mypca.explained_variance_ratio_.sum())
 # return pca_mat
@@ -63,7 +63,6 @@
 data=json.loads(line)
 all_data.append(data)
-
 # 1 - vectorize using TF IDF
 from sklearn.feature_extraction.text import TfidfVectorizer
 texts=[]
@@ -75,8 +74,9 @@
 vectorizer = TfidfVectorizer(stop_words=stopwords,analyzer="word",max_df=0.8,min_df=3) #delete terms with high/low frequences
 X = vectorizer.fit_transform(texts)
 X = X.toarray()
+print(X.shape)
 # # PCA here
-# X = reduce_dim(X.toarray())
+# X = reduce_dim(X,n_components=0.9)
 # cosine dist
 cosine_dist_X = distance.cdist(X, X, metric='cosine')
@@ -112,10 +112,10 @@
 cluster_selection_method="eom",
 n_jobs=-1)
-clusterer.fit(X)
+clusterer.fit(cosine_dist_X)
-# # To test it with a jsonl
+# # To test it with a jsonl (comment from here to exit)
 # tests = []
 # for i in range(len(all_data)):
 # tests.append({"cluster":int(clusterer.labels_[i]+1), "weight":str(clusterer.probabilities_[i])})
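
For context on the key change in the last hunk (fitting on cosine_dist_X rather than X): a square distance matrix is only a valid input to fit() when the estimator is constructed with metric="precomputed". The snippet below is a minimal, self-contained sketch of that pattern, not the script itself; the toy data and min_cluster_size value are illustrative only, and it assumes the script's clusterer is a sklearn.cluster.HDBSCAN instance (the n_jobs and cluster_selection_method keywords visible in the hunk suggest that API rather than the standalone hdbscan package).

# Minimal sketch (assumptions: sklearn.cluster.HDBSCAN, toy data, illustrative
# min_cluster_size) of fitting HDBSCAN on a precomputed cosine-distance matrix.
import numpy as np
from scipy.spatial import distance
from sklearn.cluster import HDBSCAN

rng = np.random.default_rng(0)
X = rng.random((20, 50))                               # stand-in for the dense TF-IDF matrix
cosine_dist_X = distance.cdist(X, X, metric="cosine")  # square (20, 20) distance matrix

clusterer = HDBSCAN(min_cluster_size=2,                # illustrative value only
                    metric="precomputed",              # required to pass cosine_dist_X to fit()
                    cluster_selection_method="eom",
                    n_jobs=-1)
clusterer.fit(cosine_dist_X)
print(clusterer.labels_)                               # -1 marks noise points
print(clusterer.probabilities_)                        # per-sample cluster membership strength

Passing the precomputed cosine matrix keeps the density estimates consistent with the TF-IDF/cosine geometry already computed by distance.cdist earlier in the script, instead of letting HDBSCAN default to Euclidean distances on the raw feature matrix.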