#!/usr/bin/env python3
# -*- coding: utf-8 -*-
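#
# Pipeline: read JSONL records from stdin, normalize and lemmatize each record's
# "value" field, vectorize with TF-IDF, compute a cosine distance matrix and
# cluster it with HDBSCAN, then write the records back to stdout as JSONL with
# "value" replaced by {"cluster": ..., "weight": ...}.
#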
import json
from sklearn.cluster import HDBSCAN
from scipy.spatial import distance
import sys
import unicodedata
import re
import spacy
# normalize text
def remove_accents(text):
    if not isinstance(text, str) or text == "":
        return ""
    # NFD decomposition splits accented characters into base char + combining mark
    normalized_text = unicodedata.normalize("NFD", text)
    # strip the combining diacritical marks (U+0300..U+036F)
    return re.sub("[\u0300-\u036f]", "", normalized_text)
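# e.g. remove_accents("café") -> "cafe"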
def uniformize(text):
    # drop accents
    text = remove_accents(text)
    # replace punctuation (except apostrophes) with spaces
    text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text)
    # lowercase and collapse whitespace
    return ' '.join(text.lower().split())
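# e.g. uniformize("Héllo, World!") -> "hello world"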
# lemmatize
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # keep only tagger + lemmatizer
def lemmatize(text):
    if text == "":
        return text
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)
# stopwords
with open('./v1/stopwords/en.json', 'r') as f_in:
    stopwords = json.load(f_in)
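# en.json is assumed to contain a JSON array of stopword strings
# (the list format accepted by TfidfVectorizer's stop_words parameter below)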
# # Optional: PCA for dimensionality reduction
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# def reduce_dim(tfidf_matrix, n_components=0.9):
#     # center and scale
#     scaler = StandardScaler()
#     tfidf_matrix = scaler.fit_transform(tfidf_matrix)
#     # keep enough components to explain 90% of the variance by default
#     mypca = PCA(n_components=n_components, svd_solver="full")
#     pca_mat = mypca.fit_transform(tfidf_matrix)
#     # print(mypca.explained_variance_ratio_.sum())
#     return pca_mat
## WS
# Data: one JSON record per line on stdin
all_data = []
for line in sys.stdin:
    data = json.loads(line)
    all_data.append(data)
# 1 - vectorize using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
texts = []
for line in all_data:
    if "value" in line:
        texts.append(lemmatize(uniformize(line["value"])))
    else:
        texts.append("n/a")
# drop terms with too high (> 80% of docs) or too low (< 3 docs) document frequency
vectorizer = TfidfVectorizer(stop_words=stopwords, analyzer="word", max_df=0.8, min_df=3)
X = vectorizer.fit_transform(texts)
X = X.toarray()  # cdist below needs a dense array
print(X.shape, file=sys.stderr)  # log to stderr so stdout stays valid JSONL
# # PCA here
# X = reduce_dim(X,n_components=0.9)
# cosine dist
cosine_dist_X = distance.cdist(X, X, metric='cosine')
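# cosine_dist_X is an (n_docs, n_docs) symmetric distance matrix; it is passed to
# HDBSCAN below via metric='precomputed'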
# # 2 - alternative: vectorize using word embeddings
# import numpy as np
# from gensim.models import KeyedVectors
# model_path = '../../../test-hdbscan/wiki-news-300d-1M.vec'
# model_embedding = KeyedVectors.load_word2vec_format(model_path)
# # (the tokenize/document_embedding bodies were missing here; a minimal sketch,
# #  assuming whitespace tokens and a mean-of-word-vectors document embedding)
# def tokenize(text):
#     return text.split()
# def document_embedding(tokens, model):
#     vectors = [model[t] for t in tokens if t in model]
#     if not vectors:
#         return np.zeros(model.vector_size)
#     return np.mean(vectors, axis=0)
# X = []
# for line in all_data:
#     if "value" in line:
#         X.append(document_embedding(tokenize(lemmatize(uniformize(line["value"]))), model_embedding))
#     else:
#         X.append(np.zeros(model_embedding.vector_size))
# # cosine dist
# cosine_dist_X = distance.cdist(X, X, metric='cosine')
# HDBSCAN
clusterer = HDBSCAN(
    algorithm='auto',
    metric='precomputed',  # we pass the cosine distance matrix computed above
    min_cluster_size=5,
    cluster_selection_epsilon=0.0,
    alpha=1.0,
    cluster_selection_method="eom",
    n_jobs=-1)
clusterer.fit(cosine_dist_X)
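# clusterer.labels_ marks noise points with -1; clusterer.probabilities_ holds each
# point's cluster membership strength (0 for noise, up to 1 for the most central points)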
# # To test with a single JSON array output, uncomment from here down to exit()
# tests = []
# for i in range(len(all_data)):
#     tests.append({"cluster": int(clusterer.labels_[i] + 1), "weight": str(clusterer.probabilities_[i])})
# sys.stdout.write(json.dumps(tests))
# exit()
# attach cluster info to each record
for i in range(len(all_data)):
    # shift labels by 1 so that noise (-1) becomes cluster 0
    all_data[i]["value"] = {"cluster": int(clusterer.labels_[i] + 1), "weight": str(clusterer.probabilities_[i])}
# write the whole corpus back as JSONL
for line in all_data:
    sys.stdout.write(json.dumps(line))
    sys.stdout.write("\n")
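# Each output line keeps the record's original fields, with "value" replaced by the
# cluster assignment, e.g. (other fields hypothetical):
# {"id": 42, "value": {"cluster": 3, "weight": "0.87"}}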