-
Notifications
You must be signed in to change notification settings - Fork 11
/
dimensionality_reduction.py
169 lines (138 loc) · 5.89 KB
/
dimensionality_reduction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import glob
import cv2
import numpy as np
from utils import *
def pca(img, n_components=9):
"""
Perform PCA dimensionality reduction on the input image
:param img: a numpy array of shape (n_samples, dim_images)
:param n_components: number of principal components for projection
:return: image in PCA projection, a numpy array of shape (n_samples, n_components)
"""
# Compute the covariance matrix
cov_mat = np.cov(img.T)
# Compute the eigenvectors and eigenvalues
eig_val_cov, eig_vec_cov = np.linalg.eig(cov_mat)
# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [
(np.abs(eig_val_cov[i]), eig_vec_cov[:, i]) for i in range(len(eig_val_cov))
]
# Select n_components eigenvectors with largest eigenvalues, obtain subspace transform matrix
eig_pairs.sort(key=lambda x: x[0], reverse=True)
eig_pairs = np.array(eig_pairs)
matrix_w = np.hstack(
[eig_pairs[i, 1].reshape(img.shape[1], 1) for i in range(n_components)]
)
# Return samples in new subspace
return np.dot(img, matrix_w), matrix_w
def inverse_pca(img, components):
"""
Obtain the reconstruction after PCA dimensionality reduction
:param img: Reduced image, a numpy array of shape (n_samples, n_components)
:param components: a numpy array of size (original_dimension, n_components)
:return:
"""
reconstruct = np.dot(img, components.T).astype(int)
return reconstruct.reshape(-1,28,28)
def mds(data, n_components=2):
"""
Apply multidimensional scaling (aka Principal Coordinates Analysis)
:param data: nxn square distance matrix
:param n_components: number of components for projection
:return: projected output of shape (n_components, n)
"""
# Center distance matrix
center(data)
# Make a list of (eigenvalue, eigenvector) tuples
eig_val_cov, eig_vec_cov = np.linalg.eig(data)
eig_pairs = [
(np.abs(eig_val_cov[i]), eig_vec_cov[:, i]) for i in range(len(eig_val_cov))
]
# Select n_components eigenvectors with largest eigenvalues, obtain subspace transform matrix
eig_pairs.sort(key=lambda x: x[0], reverse=True)
eig_pairs = np.array(eig_pairs)
matrix_w = np.hstack(
[eig_pairs[i, 1].reshape(data.shape[1], 1) for i in range(n_components)]
)
# Return samples in new subspace
return matrix_w
def isomap(data, n_components=2, n_neighbors=6, dist=False):
"""
Dimensionality reduction with isomap algorithm
:param data: input image matrix of shape (n,m) if dist=False, square distance matrix of size (n,n) if dist=True
:param n_components: number of components for projection
:param n_neighbors: number of neighbors for distance matrix computation
:param dist: boolean indicating the data type
:return: Projected output of shape (n_components, n)
"""
if not dist:
# Compute distance matrix
data, _ = distance_mat(data, n_neighbors)
# Compute shortest paths from distance matrix
from sklearn.utils.graph import graph_shortest_path
graph = graph_shortest_path(data, directed=False)
graph = -0.5 * (graph ** 2)
# Return the MDS projection on the shortest paths graph
return mds(graph, n_components)
def LLE(data, neighbors_idx=None, n_components=2, n_neighbors=6):
"""
Dimensionality reduction with FastLLE algorithm
:param data: input image matrix of shape (n,m)
:param n_components: number of components for projection
:param n_neighbors: number of neighbors for the weight extraction
:return: Projected output of shape (n_components, n)
"""
if neighbors_idx is None:
# Compute the nearest neighbors
_, neighbors_idx = distance_mat(data, n_neighbors)
n = data.shape[0]
w = np.zeros((n, n))
for i in range(n):
# Center the neighbors matrix
k_indexes = neighbors_idx[i, :]
neighbors = data[k_indexes, :] - data[i, :]
# Compute the corresponding gram matrix
gram_inv = np.linalg.pinv(np.dot(neighbors, neighbors.T))
# Setting the weight values according to the lagrangian
lambda_par = 2/np.sum(gram_inv)
w[i, k_indexes] = lambda_par*np.sum(gram_inv, axis=1)/2
m = np.subtract(np.eye(n), w)
values, u = np.linalg.eigh(np.dot(np.transpose(m), m))
return u[:, 1:n_components+1]
def demo():
images = []
for path in glob.glob("four_dataset/*.jpg"):
images.append(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2GRAY))
flat_images = np.array(images).reshape(982, -1)
for k in [2, 16, 64, 256]:
# Perform PCA transform
pca_images, components = pca(flat_images, k)
# Check the dimension of the reduced dataset
print(pca_images.shape)
# Reconstruct from PCA reduction
reconstruct = inverse_pca(pca_images, components)
# Compute MSE between original image and reconstructed image
mse = [mean_square_error(a,b) for a,b in zip(images, reconstruct)]
print("Dimension {}, best MSE: {}, worst MSE: {}, MSE over all samples: {}".format(k, min(mse), max(mse), sum(mse)/len(mse)))
# Save visualization of reductions on four0.jpg
cv2.imwrite("output/pca_{}.jpg".format(k), reconstruct[0])
if __name__ == "__main__":
data = np.load("digits-labels.npz")
# Select columns of five
d = data['d']
l = data['l']
five_idx = np.argwhere(l == 5)
d_fives = d[:, five_idx].squeeze().T
# Perform PCA
transform_pca, components = pca(d_fives, n_components=2)
print(transform_pca.shape)
show(transform_pca, d_fives, 'output/pca.png')
# Perform Isomap
distances, neighbors_idx = distance_mat(d_fives)
transform_isomap = isomap(distances, dist=True)
print(transform_isomap.shape)
show(transform_isomap, d_fives, 'output/isomap.png')
# Perform LLE
transform_lle = LLE(d_fives, neighbors_idx)
print(transform_lle.shape)
show(transform_lle, d_fives, 'output/lle.png')