#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
import sys
import unicodedata

import spacy
from sklearn.cluster import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Text normalization ---
def remove_accents(text):
    """Strip combining diacritical marks after NFD decomposition."""
    if not isinstance(text, str) or text == "":
        return ""
    normalized_text = unicodedata.normalize("NFD", text)
    return re.sub("[\u0300-\u036f]", "", normalized_text)

def uniformize(text):
    """Lowercase, strip accents, and replace punctuation (apostrophes kept) with spaces."""
    text = remove_accents(text)
    # Keep letters and apostrophes; everything else becomes a space.
    text = ''.join(char if char.isalpha() or char == "'" else ' ' for char in text)
    # Lowercase and collapse repeated whitespace.
    return ' '.join(text.lower().split())
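
# Example (illustrative only):
#   uniformize("Café, déjà-vu!")  ->  "cafe deja vu"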

# --- Lemmatization ---
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # keep only the components needed for lemmas

def lemmatize(text):
    """Return the space-joined lemmas of the input text."""
    if text == "":
        return text
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)

# --- Stopwords ---
# The file is expected to hold a JSON array of lowercase stopwords.
with open('./v1/stopwords/en.json', 'r', encoding='utf-8') as f_in:
    stopwords = json.load(f_in)

## WS
# Read the corpus: one JSON document per line on stdin.
all_data = []
for line in sys.stdin:
    data = json.loads(line)
    all_data.append(data)
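
# Each input line is expected to look like (hypothetical example):
#   {"id": "doc-1", "value": "some free-text field to cluster"}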

# --- TF-IDF ---
# Build one normalized, lemmatized string per document; documents without a
# "value" field get the placeholder "n/a".
texts = []
for line in all_data:
    if "value" in line:
        texts.append(lemmatize(uniformize(line["value"])))
    else:
        texts.append("n/a")
vectorizer = TfidfVectorizer(stop_words=stopwords, analyzer="word", max_df=0.8, min_df=2)  # drop terms with very high or very low document frequency
X = vectorizer.fit_transform(texts)  # sparse matrix of shape (n_documents, n_terms)
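
# For debugging, the learned vocabulary can be inspected with
# vectorizer.get_feature_names_out().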

# --- HDBSCAN ---
# Minimum cluster size: roughly 2% of the corpus, but never below 2 documents.
min_cluster_size = max(2, 2 * (len(texts) // 100))
clusterer = HDBSCAN(
    algorithm='auto',
    metric='euclidean',  # cosine may suit TF-IDF vectors better; kept as euclidean here
    min_cluster_size=min_cluster_size,
    n_jobs=1)
clusterer.fit(X)
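
# After fitting, clusterer.labels_ holds one cluster id per document
# (-1 marks noise) and clusterer.probabilities_ the strength of each
# assignment in [0, 1].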

# Attach cluster assignments to each document.
for i in range(len(all_data)):
    # Labels are shifted by one so that HDBSCAN's noise label (-1) becomes cluster 0.
    all_data[i]["hdbscan"] = {"cluster": int(clusterer.labels_[i] + 1),
                              "weight": str(clusterer.probabilities_[i])}  # membership probability, serialized as a string

# Write the whole corpus back to stdout, one JSON document per line.
for line in all_data:
    sys.stdout.write(json.dumps(line))
    sys.stdout.write("\n")
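
# Typical invocation (file names are illustrative):
#   cat corpus.jsonl | python3 cluster_hdbscan.py > corpus_with_clusters.jsonl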