#!/usr/bin/env python3
# -*- coding: utf-8 -*-
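# web-services/data-computer/v1/hdbscan.py
# Read a JSONL corpus from stdin, vectorize the "value" field of each record
# with TF-IDF, cluster the documents with HDBSCAN on cosine distances, and
# write the records back to stdout with a cluster id and membership weight.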
import json
from sklearn.cluster import HDBSCAN
from scipy.spatial import distance
import sys
import unicodedata
import re
import spacy

# normalize text
def remove_accents(text):
    if not isinstance(text, str) or text == "":
        return ""
    # NFD decomposition separates base characters from combining diacritics
    normalized_text = unicodedata.normalize("NFD", text)
    text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text)
    return text_with_no_accent

def uniformize(text):
    # drop accents
    text = remove_accents(text)

    # keep letters and apostrophes, replace everything else with spaces
    text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text)

    # lowercase and collapse whitespace
    return ' '.join(text.lower().split())
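# example: uniformize("Été, déjà-vu !") -> "ete deja vu"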

# lemmatize
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # load the spaCy model for lemmatization; parser and NER are not needed

def lemmatize(text):
    if text == "":
        return text
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# stopwords
with open('./v1/stopwords/en.json', 'r', encoding='utf-8') as f_in:
    stopwords = json.load(f_in)
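# en.json is assumed to be a JSON array of stopword strings, the list form
# accepted by TfidfVectorizer(stop_words=...) below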

# # PCA (optional dimensionality reduction, kept for reference)
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# def reduce_dim(tfidf_matrix, n_components=0.9):
#     # center and scale
#     scaler = StandardScaler()
#     tfidf_matrix = scaler.fit_transform(tfidf_matrix)

#     mypca = PCA(n_components=n_components, svd_solver="full")
#     pca_mat = mypca.fit_transform(tfidf_matrix)
#     # print(mypca.explained_variance_ratio_.sum())

#     return pca_mat


## WS
# data: read the corpus from stdin, one JSON object per line
all_data = []
for line in sys.stdin:
    data = json.loads(line)
    all_data.append(data)

# 1 - vectorize using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
texts = []
for line in all_data:
    if "value" in line:
        texts.append(lemmatize(uniformize(line["value"])))
    else:
        texts.append("n/a")
vectorizer = TfidfVectorizer(stop_words=stopwords, analyzer="word", max_df=0.8, min_df=3)  # drop terms with very high/low document frequencies
X = vectorizer.fit_transform(texts)
X = X.toarray()
print(X.shape, file=sys.stderr)  # log the matrix shape without polluting the JSONL output on stdout
# # PCA here
# X = reduce_dim(X,n_components=0.9)
# cosine distance: pairwise (n_docs x n_docs) matrix, passed to HDBSCAN as precomputed distances
cosine_dist_X = distance.cdist(X, X, metric='cosine')



# # 2 - vectorize using embedding
# import numpy as np
# from gensim.models import KeyedVectors
# model_path = '../../../test-hdbscan/wiki-news-300d-1M.vec'
# model_embedding = KeyedVectors.load_word2vec_format(model_path)

# # assumed helpers: tokenize splits the normalized text; document_embedding
# # averages the vectors of the tokens known to the embedding model
# def tokenize(text):
#     return text.split()

# def document_embedding(tokens, model):
#     vectors = [model[token] for token in tokens if token in model]
#     if not vectors:
#         return np.zeros(model.vector_size)
#     return np.mean(vectors, axis=0)

# X=[]
# for line in all_data:
#     if "value" in line:
#         X.append(document_embedding(tokenize(lemmatize(uniformize(line["value"]))),model_embedding))
#     else:
#         X.append(np.zeros(model_embedding.vector_size))
# #cosine dist
# cosine_dist_X = distance.cdist(X, X, metric='cosine')



# HDBSCAN on the precomputed cosine distance matrix
clusterer = HDBSCAN(
    algorithm='auto',
    metric='precomputed',  # cosine distances are computed above and passed in directly
    min_cluster_size=5,
    cluster_selection_epsilon=0.0,
    alpha=1.0,
    cluster_selection_method="eom",
    n_jobs=-1)

clusterer.fit(cosine_dist_X)
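# labels_ marks noise points with -1, hence the +1 shift below (noise -> cluster 0);
# probabilities_ gives the strength of each point's cluster membership.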


# # To test with a JSONL file, uncomment this block (down to exit())
# tests = []
# for i in range(len(all_data)):
#     tests.append({"cluster": int(clusterer.labels_[i] + 1), "weight": str(clusterer.probabilities_[i])})
# sys.stdout.write(json.dumps(tests))
# exit()

# attach the cluster id and membership weight to each record
for i in range(len(all_data)):
    all_data[i]["value"] = {"cluster": int(clusterer.labels_[i] + 1), "weight": str(clusterer.probabilities_[i])}


# write the whole corpus back to stdout, one JSON line per record
for line in all_data:
    sys.stdout.write(json.dumps(line))
    sys.stdout.write("\n")