-
Notifications
You must be signed in to change notification settings - Fork 1
/
Clusterer.py
59 lines (51 loc) · 3.09 KB
/
Clusterer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
class Clusterer:
    """Agglomerative (hierarchical) clustering of encoded task variants.

    Wraps ``sklearn.cluster.AgglomerativeClustering`` and can optionally show
    a SciPy dendrogram so the number of clusters can be chosen interactively.

    Args:
        show_dendrogram: When true, ``cluster_task_variants_agglomerative``
            plots a dendrogram and asks for the cluster count on stdin.
        affinity: Distance metric handed to ``AgglomerativeClustering``.
        linkage: Linkage criterion handed to ``AgglomerativeClustering`` and
            used as the method for the dendrogram.
    """

    def __init__(self, show_dendrogram=False, affinity="euclidean", linkage="ward"):
        self.show_dendrogram = show_dendrogram
        self.affinity = affinity
        self.linkage = linkage

    def _fit_predict(self, df_task_variants_encoded, num_clusters):
        """Validate ``num_clusters`` and return one cluster label per row."""
        # Fail early with a readable message instead of an error raised from
        # inside scikit-learn (the public default for num_clusters is "").
        # ``__index__`` accepts Python ints as well as NumPy integer scalars.
        if not hasattr(num_clusters, "__index__") or num_clusters < 1:
            raise ValueError(
                "num_clusters must be a positive integer, got {!r}".format(num_clusters)
            )

        # scikit-learn renamed ``affinity`` to ``metric`` (deprecated in 1.2,
        # removed in 1.4). Prefer the new keyword; fall back for old releases
        # whose constructor rejects ``metric`` with a TypeError.
        try:
            model = AgglomerativeClustering(
                n_clusters=num_clusters,
                metric=self.affinity,
                linkage=self.linkage,
            )
        except TypeError:
            model = AgglomerativeClustering(
                n_clusters=num_clusters,
                affinity=self.affinity,
                linkage=self.linkage,
            )
        return model.fit_predict(df_task_variants_encoded.values)

    def cluster_task_variants_agglomerative(self, df_task_variants_encoded, num_clusters=""):
        """Assign every row of the encoded frame to a cluster.

        Args:
            df_task_variants_encoded: DataFrame whose ``.values`` are the
                feature vectors to cluster; its index is reused for the result.
            num_clusters: Number of clusters to build. Ignored when
                ``show_dendrogram`` is true, because the value is then read
                from stdin after the dendrogram window is closed.

        Returns:
            A ``(df_clusters, num_clusters)`` tuple. ``df_clusters`` has the
            same index values as the input and a single ``'cluster'`` column;
            ``num_clusters`` is the cluster count that was actually used.

        Raises:
            ValueError: If the cluster count is not a positive integer (this
                includes calling without ``num_clusters`` while
                ``show_dendrogram`` is false, and non-numeric stdin input).
        """
        if self.show_dendrogram:
            plt.figure(figsize=(25, 10))
            # Use the configured linkage so the dendrogram reflects the same
            # merge criterion as the clustering below. The distance metric is
            # left at SciPy's default (Euclidean).
            dendrogram(linkage(df_task_variants_encoded.values, self.linkage))
            plt.show()
            num_clusters = int(input("Specify number of clusters: "))

        clusters = self._fit_predict(df_task_variants_encoded, num_clusters)
        index = list(df_task_variants_encoded.index.values)
        df_clusters = pd.DataFrame(index=index, data={'cluster': clusters})
        return df_clusters, num_clusters

    def get_silhouette_score(self, df_task_variants_encoded, num_clusters):
        """Cluster the data into ``num_clusters`` groups and score the result.

        Args:
            df_task_variants_encoded: DataFrame whose ``.values`` are the
                feature vectors to cluster.
            num_clusters: Number of clusters to build.

        Returns:
            The value of ``sklearn.metrics.silhouette_score`` for the
            resulting labelling.

        Raises:
            ValueError: If ``num_clusters`` is not a positive integer.
        """
        clusters = self._fit_predict(df_task_variants_encoded, num_clusters)
        return metrics.silhouette_score(df_task_variants_encoded.values, clusters)