# web-services/data-computer/v1/hdbscan.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
from sklearn.cluster import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import unicodedata
import re
import spacy



# Normalize text: strip diacritics via NFD decomposition.
def remove_accents(text):
    if text == "" or not isinstance(text, str):
        return ""
    normalized_text = unicodedata.normalize("NFD", text)
    text_with_no_accent = re.sub("[\u0300-\u036f]", "", normalized_text)
    return text_with_no_accent
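# Example (illustrative):
#   remove_accents("résumé")  -> "resume"
#   remove_accents(None)      -> ""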

def uniformize(text):
    # drop accents
    text = remove_accents(text)

    # replace punctuation (everything except letters and the apostrophe) with spaces
    text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text)

    # lowercase and collapse whitespace
    return ' '.join(text.lower().split())
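# Example (illustrative):
#   uniformize("Éléphants, don't RUN!")  -> "elephants don't run"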

# Lemmatizer: load the small English spaCy model; parser and NER are not needed.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize(text):
    if text == "":
        return text
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Stopword list for the vectorizer
with open('./v1/stopwords/en.json', 'r') as f_in:
    stopwords = json.load(f_in)
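# Assumed file format (not confirmed by this snippet): a plain JSON array of
# lowercase words, e.g. ["the", "a", "an", ...], which TfidfVectorizer accepts
# as its stop_words list.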



## WS
# Data: one JSON document per line on stdin.
all_data = []
for line in sys.stdin:
    data = json.loads(line)
    all_data.append(data)
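# Example input line (illustrative; only the "value" field is used for clustering,
# any other fields are passed through untouched):
#   {"id": "doc-1", "value": "Some abstract text to cluster"}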


# TF-IDF
texts = []
for line in all_data:
    if "value" in line:
        texts.append(lemmatize(uniformize(line["value"])))
    else:
        texts.append("n/a")
vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    analyzer="word",
    max_df=0.8,   # drop terms appearing in more than 80% of documents
    min_df=2)     # drop terms appearing in fewer than 2 documents
X = vectorizer.fit_transform(texts)
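# X is a sparse (n_documents x n_terms) matrix of TF-IDF weights; a quick sanity
# check (illustrative) would be:
#   print(X.shape, len(vectorizer.get_feature_names_out()), file=sys.stderr)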


# HDBSCAN
# Minimum cluster size: roughly 2% of the corpus, never below 2.
min_cluster_size = max(2, 2 * (len(texts) // 100))

clusterer = HDBSCAN(
    algorithm='auto',
    metric='euclidean',  # metric='cosine' might be worth trying for TF-IDF vectors
    min_cluster_size=min_cluster_size,
    n_jobs=1)

clusterer.fit(X)
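# After fitting:
#   clusterer.labels_        -> cluster index per document (-1 means noise)
#   clusterer.probabilities_ -> strength of each document's cluster membership (0..1)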


# Attach cluster info to each document; labels are shifted by +1 so that
# noise points (label -1) come out as cluster 0.
for i in range(len(all_data)):
    all_data[i]["hdbscan"] = {
        "cluster": int(clusterer.labels_[i] + 1),
        "weight": str(clusterer.probabilities_[i])}


# Write the whole corpus back to stdout, one JSON document per line.
for line in all_data:
    sys.stdout.write(json.dumps(line))
    sys.stdout.write("\n")