Add files via upload #4

Open · wants to merge 1 commit into master
33 changes: 33 additions & 0 deletions dunn_index.py
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
"""dunn_index.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/16IcXRcP3ke55vTQ52dO2Lxo41ldhYMlw
"""

from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import numpy as np

# load the Iris dataset (four feature columns)
data_file = "iris_data_012.txt"
X = np.loadtxt(data_file, delimiter=",", usecols=range(0, 4), dtype=np.float64)

# run K-Means for k = 2..49 and record the Davies-Bouldin score of each
# clustering (lower is better for this index)
z = []
for i in range(2, 50):
    kmeans = KMeans(n_clusters=i, random_state=1).fit(X)
    labels = kmeans.labels_
    z.append(davies_bouldin_score(X, labels))

# pick the first local minimum of the score; z[i] corresponds to k = i + 2
temp = 0
for i in range(1, 47):
    if z[i] < z[i - 1] and z[i] < z[i + 1]:
        temp = i + 2
        break
print(temp)
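
# Note: despite the filename, the script above scores clusterings with the
# Davies-Bouldin index, not the Dunn index. For reference, a minimal sketch
# of the Dunn index itself (higher is better); dunn_index below is
# illustrative only, is not part of the PR, and assumes scipy is available.
from scipy.spatial.distance import cdist

def dunn_index(X, labels):
    # Dunn index = (smallest distance between points in different clusters)
    #            / (largest distance between points in the same cluster)
    clusters = [X[labels == c] for c in np.unique(labels)]
    min_between = min(cdist(a, b).min()
                      for ai, a in enumerate(clusters)
                      for b in clusters[ai + 1:])
    max_within = max(cdist(c, c).max() for c in clusters)
    return min_between / max_within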
34 changes: 34 additions & 0 deletions silhouette.py
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
"""silhouette.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1gDnSfreOsusI924aP4ZaDw6_K-AImb3J
"""

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

print("\nLoading Iris data into memory \n")
data_file = "iris_data_012.txt"
data_x = np.loadtxt(data_file, delimiter=",", usecols=range(0, 4), dtype=np.float64)
data_y = np.loadtxt(data_file, delimiter=",", usecols=[4], dtype=int)

# run K-Means for k = 2..49 and record the mean silhouette score of each
# clustering
z = []
for i in range(2, 50):
    cluster = KMeans(n_clusters=i)
    cluster_labels = cluster.fit_predict(data_x)
    z.append(silhouette_score(data_x, cluster_labels))

# pick the first local minimum of the score; z[i] corresponds to k = i + 2
temp = 0
mini = 1000000
for i in range(1, 47):
    if z[i] < z[i - 1] and z[i] < z[i + 1]:
        temp = z[i]
        mini = i + 2
        break
print(temp, mini)
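
# Note: the mean silhouette score is conventionally maximized, so taking the
# first local minimum above is an unusual criterion; under the standard rule
# the choice collapses to one line (a sketch reusing the z list built above):
best_k = int(np.argmax(z)) + 2  # z[0] corresponds to k = 2
print("k with highest mean silhouette:", best_k)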

155 changes: 155 additions & 0 deletions sompy_.py
@@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-
"""sompy_.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1k7dWaAq6FcNWEFJQmB6ShmPu2dYCHzQt
"""

# som_iris.py
# SOM for Iris dataset
# Anaconda3 5.2.0 (Python 3.6.5)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ==================================================================

import numpy as np
import matplotlib.pyplot as plt
# note: if the matplotlib import fails, try >pip uninstall matplotlib
# and then >pip install matplotlib

def closest_node(data, t, map, m_rows, m_cols):
    # (row, col) of the map node closest to data[t]
    result = (0, 0)
    small_dist = 1.0e20
    for i in range(m_rows):
        for j in range(m_cols):
            ed = euc_dist(map[i][j], data[t])
            if ed < small_dist:
                small_dist = ed
                result = (i, j)
    return result

def euc_dist(v1, v2):
    return np.linalg.norm(v1 - v2)

def manhattan_dist(r1, c1, r2, c2):
    return np.abs(r1 - r2) + np.abs(c1 - c2)

def most_common(lst, n):
    # lst is a list of values 0 .. n-1; returns the most frequent value
    if len(lst) == 0: return -1
    counts = np.zeros(shape=n, dtype=np.int64)
    for i in range(len(lst)):
        counts[lst[i]] += 1
    return np.argmax(counts)
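
# A vectorized equivalent of closest_node (a sketch, not part of the PR):
# one NumPy call replaces the Python-level double loop, which matters
# because the BMU search runs once per training step. The m_rows/m_cols
# parameters are kept only for drop-in signature compatibility.
def closest_node_fast(data, t, map, m_rows, m_cols):
    # distance from data[t] to every node; map has shape (m_rows, m_cols, dim)
    dists = np.linalg.norm(map - data[t], axis=2)
    return np.unravel_index(np.argmin(dists), dists.shape)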

# ==================================================================

def main():
    # 0. get started
    np.random.seed(1)
    Dim = 4  # features per data item; must match the 4 Iris columns
             # (the uploaded file had Dim = 2, which crashes euc_dist)
    Rows = 30; Cols = 30
    RangeMax = Rows + Cols
    LearnMax = 0.5
    StepsMax = 2000

    # 1. load data
    print("\nLoading Iris data into memory \n")
    data_file = "iris_data_012.txt"
    data_x = np.loadtxt(data_file, delimiter=",", usecols=range(0, 4), dtype=np.float64)
    data_y = np.loadtxt(data_file, delimiter=",", usecols=[4], dtype=int)

    # option: normalize data

    # find the number of clusters as the first local minimum of the mean
    # silhouette score over k = 2..49 (same approach as silhouette.py)
    mini = 1000000
    temp = 0
    z = []
    for i in range(2, 50):
        cluster = KMeans(n_clusters=i)
        cluster_labels = cluster.fit_predict(data_x)
        z.append(silhouette_score(data_x, cluster_labels))
    for i in range(1, 47):
        if z[i] < z[i - 1] and z[i] < z[i + 1]:
            temp = z[i]
            mini = i + 2
            break
    print(temp, mini)

    # 2. construct the SOM
    print("Constructing a 30x30 SOM from the iris data")
    map = np.random.random_sample(size=(Rows, Cols, Dim))
    for s in range(StepsMax):
        if s % (StepsMax // 10) == 0: print("step = ", str(s))
        # neighborhood radius and learning rate both decay linearly from
        # RangeMax and LearnMax toward 0 as s approaches StepsMax
        pct_left = 1.0 - ((s * 1.0) / StepsMax)
        curr_range = int(pct_left * RangeMax)
        curr_rate = pct_left * LearnMax

        # pick a random data item, find its best-matching unit (BMU), and
        # pull every node within curr_range (Manhattan distance) toward it
        t = np.random.randint(len(data_x))
        (bmu_row, bmu_col) = closest_node(data_x, t, map, Rows, Cols)
        for i in range(Rows):
            for j in range(Cols):
                if manhattan_dist(bmu_row, bmu_col, i, j) < curr_range:
                    map[i][j] = map[i][j] + curr_rate * (data_x[t] - map[i][j])
    print("SOM construction complete \n")

    # 3. construct U-Matrix
    print("Constructing U-Matrix from SOM")
    u_matrix = np.zeros(shape=(Rows, Cols), dtype=np.float64)
    for i in range(Rows):
        for j in range(Cols):
            v = map[i][j]  # weight vector of the current node
            sum_dists = 0.0; ct = 0

            if i-1 >= 0:  # above
                sum_dists += euc_dist(v, map[i-1][j]); ct += 1
            if i+1 <= Rows-1:  # below
                sum_dists += euc_dist(v, map[i+1][j]); ct += 1
            if j-1 >= 0:  # left
                sum_dists += euc_dist(v, map[i][j-1]); ct += 1
            if j+1 <= Cols-1:  # right
                sum_dists += euc_dist(v, map[i][j+1]); ct += 1

            # average distance to the neighbors that exist
            u_matrix[i][j] = sum_dists / ct
    print("U-Matrix constructed \n")

    # display U-Matrix
    plt.imshow(u_matrix, cmap='gray')  # black = close = clusters
    plt.show()

    # 4. because the data has labels, another possible visualization:
    # associate each data label with a map node
    print("Associating each data label to one map node ")
    mapping = np.empty(shape=(Rows, Cols), dtype=object)
    for i in range(Rows):
        for j in range(Cols):
            mapping[i][j] = []

    for t in range(len(data_x)):
        (m_row, m_col) = closest_node(data_x, t, map, Rows, Cols)
        mapping[m_row][m_col].append(data_y[t])

    label_map = np.zeros(shape=(Rows, Cols), dtype=np.int64)
    for i in range(Rows):
        for j in range(Cols):
            label_map[i][j] = most_common(mapping[i][j], mini)

    plt.imshow(label_map, cmap=plt.cm.get_cmap('terrain_r', mini + 1))
    plt.colorbar()
    plt.show()

# ==================================================================

if __name__ == "__main__":
    main()
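
# Hypothetical usage sketch (not part of the PR): if main() were changed to
# return map and label_map instead of only plotting them, an unseen sample
# could be classified by the label of its best-matching unit:
#
#   map, label_map = main()
#   new_x = np.array([5.1, 3.5, 1.4, 0.2])  # made-up Iris measurements
#   row, col = closest_node(np.array([new_x]), 0, map, 30, 30)
#   print("predicted label:", label_map[row][col])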

#sarthak agarwal