diff --git a/data-computer/v1/hdbscan.py b/data-computer/v1/hdbscan.py
index 2d0b3d1..3742b8b 100755
--- a/data-computer/v1/hdbscan.py
+++ b/data-computer/v1/hdbscan.py
@@ -41,17 +41,17 @@
 # # PCA
 # from sklearn.decomposition import PCA
 # from sklearn.preprocessing import StandardScaler
-# def reduce_dim(tfidf_matrix,nb_components=100):
+# def reduce_dim(tfidf_matrix,n_components=0.9):
 # # center and reduce
 # scaler = StandardScaler()
 # scaler.fit(tfidf_matrix)
 # tfidf_matrix=scaler.transform(tfidf_matrix)
-# mypca = PCA(nb_components)
+# mypca = PCA(n_components=n_components,svd_solver="full")
 # mypca.fit(tfidf_matrix)
-# pca_mat = mypca.fit_transform(tfidf_matrix)
-# print(mypca.noise_variance_)
+# pca_mat = mypca.fit_transform(tfidf_matrix)
+# # print(mypca.explained_variance_ratio_.sum())
 # return pca_mat
@@ -63,7 +63,6 @@
 data=json.loads(line)
 all_data.append(data)
-
 # 1 - vectorize using TF IDF
 from sklearn.feature_extraction.text import TfidfVectorizer
 texts=[]
@@ -75,8 +74,9 @@
 vectorizer = TfidfVectorizer(stop_words=stopwords,analyzer="word",max_df=0.8,min_df=3) #delete terms with high/low frequences
 X = vectorizer.fit_transform(texts)
 X = X.toarray()
+print(X.shape)
 # # PCA here
-# X = reduce_dim(X.toarray())
+# X = reduce_dim(X,n_components=0.9)
 # cosine dist
 cosine_dist_X = distance.cdist(X, X, metric='cosine')
@@ -112,10 +112,10 @@
 cluster_selection_method="eom",
 n_jobs=-1)
-clusterer.fit(X)
+clusterer.fit(cosine_dist_X)
-# # To test it with a jsonl
+# # To test it with a jsonl (comment from here to exit)
 # tests = []
 # for i in range(len(all_data)):
 # tests.append({"cluster":int(clusterer.labels_[i]+1), "weight":str(clusterer.probabilities_[i])})
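
For context on the key change in the last hunk (fitting on cosine_dist_X rather than X): a square distance matrix is only a valid input to fit() when the estimator is constructed with metric="precomputed". The snippet below is a minimal, self-contained sketch of that pattern, not the script itself; the toy data and min_cluster_size value are illustrative only, and it assumes the script's clusterer is a sklearn.cluster.HDBSCAN instance (the n_jobs and cluster_selection_method keywords visible in the hunk suggest that API rather than the standalone hdbscan package).

# Minimal sketch (assumptions: sklearn.cluster.HDBSCAN, toy data, illustrative
# min_cluster_size) of fitting HDBSCAN on a precomputed cosine-distance matrix.
import numpy as np
from scipy.spatial import distance
from sklearn.cluster import HDBSCAN

rng = np.random.default_rng(0)
X = rng.random((20, 50))                               # stand-in for the dense TF-IDF matrix
cosine_dist_X = distance.cdist(X, X, metric="cosine")  # square (20, 20) distance matrix

clusterer = HDBSCAN(min_cluster_size=2,                # illustrative value only
                    metric="precomputed",              # required to pass cosine_dist_X to fit()
                    cluster_selection_method="eom",
                    n_jobs=-1)
clusterer.fit(cosine_dist_X)
print(clusterer.labels_)                               # -1 marks noise points
print(clusterer.probabilities_)                        # per-sample cluster membership strength

Passing the precomputed cosine matrix keeps the density estimates consistent with the TF-IDF/cosine geometry already computed by distance.cdist earlier in the script, instead of letting HDBSCAN default to Euclidean distances on the raw feature matrix.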