#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Cluster the "value" field of JSONL records read from stdin.

Pipeline: normalize -> lemmatize -> TF-IDF -> cosine distance -> HDBSCAN.
Each record is written back to stdout with "value" replaced by
{"cluster": <id>, "weight": <membership probability>}.
"""
import json
import re
import sys
import unicodedata

import spacy
from scipy.spatial import distance
from sklearn.cluster import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer


# normalize text
def remove_accents(text):
    if text == "" or not isinstance(text, str):
        return ""
    normalized_text = unicodedata.normalize("NFD", text)
    text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text)
    return text_with_no_accent


def uniformize(text):
    # drop accents
    text = remove_accents(text)
    # remove punctuation except "'"
    text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text)
    return ' '.join(text.lower().split())


# lemmatize
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # load the lemmatizer only


def lemmatize(text):
    if text == "":
        return text
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])


# stopwords
with open('./v1/stopwords/en.json', 'r') as f_in:
    stopwords = json.load(f_in)

# # PCA
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# def reduce_dim(tfidf_matrix, n_components=0.9):
#     # center and scale
#     scaler = StandardScaler()
#     tfidf_matrix = scaler.fit_transform(tfidf_matrix)
#     mypca = PCA(n_components=n_components, svd_solver="full")
#     pca_mat = mypca.fit_transform(tfidf_matrix)
#     # print(mypca.explained_variance_ratio_.sum())
#     return pca_mat

## WS

# Data
all_data = []
for line in sys.stdin:
    data = json.loads(line)
    all_data.append(data)

# 1 - vectorize using TF-IDF
texts = []
for line in all_data:
    if "value" in line:
        texts.append(lemmatize(uniformize(line["value"])))
    else:
        texts.append("n/a")

# drop terms with very high / very low document frequencies
vectorizer = TfidfVectorizer(stop_words=stopwords, analyzer="word", max_df=0.8, min_df=3)
X = vectorizer.fit_transform(texts)
X = X.toarray()
print(X.shape, file=sys.stderr)  # debug info goes to stderr so stdout stays valid JSONL

# # PCA here
# X = reduce_dim(X, n_components=0.9)

# cosine distance matrix
cosine_dist_X = distance.cdist(X, X, metric='cosine')

# # 2 - vectorize using embeddings (alternative to TF-IDF)
# import numpy as np
# from gensim.models import KeyedVectors
# model_path = '../../../test-hdbscan/wiki-news-300d-1M.vec'
# model_embedding = KeyedVectors.load_word2vec_format(model_path)
# def tokenize(text):
#     return text.split()
# def document_embedding(tokens, model):
#     # sketch: average of the word vectors known to the model
#     vectors = [model[token] for token in tokens if token in model]
#     if not vectors:
#         return np.zeros(model.vector_size)
#     doc_embedding = np.mean(vectors, axis=0)
#     return doc_embedding
# X = []
# for line in all_data:
#     if "value" in line:
#         X.append(document_embedding(tokenize(lemmatize(uniformize(line["value"]))), model_embedding))
#     else:
#         X.append(np.zeros(model_embedding.vector_size))
# # cosine dist
# cosine_dist_X = distance.cdist(X, X, metric='cosine')

# HDBSCAN on the precomputed cosine distance matrix
clusterer = HDBSCAN(
    algorithm='auto',
    metric='precomputed',  # alternative: metric='cosine' directly on X?
    min_cluster_size=5,
    cluster_selection_epsilon=0,
    alpha=1,
    cluster_selection_method="eom",
    n_jobs=-1)
clusterer.fit(cosine_dist_X)

# # To test it with a jsonl (comment from here to sys.exit)
# tests = []
# for i in range(len(all_data)):
#     tests.append({"cluster": int(clusterer.labels_[i] + 1), "weight": str(clusterer.probabilities_[i])})
# sys.stdout.write(json.dumps(tests))
# sys.exit()

# attach cluster info: labels_ uses -1 for noise, so shift by 1 (noise becomes cluster 0)
for i in range(len(all_data)):
    all_data[i]["value"] = {"cluster": int(clusterer.labels_[i] + 1),
                            "weight": str(clusterer.probabilities_[i])}

# write the whole corpus at once
for line in all_data:
    sys.stdout.write(json.dumps(line))
    sys.stdout.write("\n")
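
# # Optional sanity check (a debugging sketch, standard library only; not part of the
# # pipeline above). Uncomment to print the size of each cluster to stderr while
# # keeping the JSONL stream on stdout clean.
# from collections import Counter
# sizes = Counter(int(label) + 1 for label in clusterer.labels_)
# for cluster_id, size in sorted(sizes.items()):
#     sys.stderr.write("cluster %d: %d values\n" % (cluster_id, size))

# Usage sketch (file names are assumptions): the script reads JSONL on stdin and
# writes the enriched JSONL on stdout, e.g.
#   cat corpus.jsonl | python3 cluster_values.py > clustered.jsonl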