Principal components analysis (PCA)#
Let \(\boldsymbol X \in \mathbb R^{n\times d}\) be feature matrix which consists of centered data: \(\sum\limits_{i=1}^n \boldsymbol x_i^\mathsf{T} = \boldsymbol 0\). We would like to approximate each \(\boldsymbol x_i\) by a low dimensional representation, \(\boldsymbol z_i \in \mathbb R^\ell\), \(\ell < d\). In PCA we assume that each \(\boldsymbol x_i\) can be approximately represented as a linear combination of vectors \(\boldsymbol w_1,\ldots, \boldsymbol w_\ell \in \mathbb R^d\):
Let \(\boldsymbol Z = (z_{ik})\) be a \(n\times\ell\) matrix,
In PCA we minimize the loss function
This loss function attains its minimum when
where \(\boldsymbol u_1, \ldots, \boldsymbol u_\ell\) are eigenvectors of \(\ell\) largest eigenvalues of \(\boldsymbol X^\mathsf{T}\boldsymbol X\).
One can also show based on singular value decomposition of \(\boldsymbol X\)
that \(\widehat{\boldsymbol W}\) equals to first \(\ell\) columns of \(\boldsymbol V\).
PCA on MNIST#
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
%config InlineBackend.figure_format = 'svg'
X, y = fetch_openml('mnist_784', return_X_y=True, parser='auto')
X = X.astype(float).values / 255
y = y.astype(int).values
X -= np.mean(X, axis=0, keepdims=True)
X_train, X_vis, y_train, y_vis = train_test_split(X, y, test_size=10000, stratify=y, random_state=12)
Perform PCA with 2 components:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_vis)
print("Explained variance:", pca.explained_variance_)
print("Explained variance ratio:", pca.explained_variance_ratio_.sum())
Explained variance: [5.16209074 3.77201984]
Explained variance ratio: 0.1693879257076425
Visulize the result of PCA:
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_vis, cmap='tab10',
s=15, alpha=0.8, edgecolors='k')
legend_labels = [str(i) for i in range(10)]
plt.legend(handles=scatter.legend_elements()[0], title='Digits', labels=legend_labels)
plt.title('PCA on MNIST')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2');
t-SNE on MNIST#
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X_vis)
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Cell In[4], line 4
1 from sklearn.manifold import TSNE
3 tsne = TSNE(n_components=2)
----> 4 X_tsne = tsne.fit_transform(X_vis)
File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/utils/_set_output.py:140, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
138 @wraps(f)
139 def wrapped(self, X, *args, **kwargs):
--> 140 data_to_wrap = f(self, X, *args, **kwargs)
141 if isinstance(data_to_wrap, tuple):
142 # only wrap the first output for cross decomposition
143 return_tuple = (
144 _wrap_data_with_container(method, data_to_wrap[0], X, self),
145 *data_to_wrap[1:],
146 )
File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py:1151, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1144 estimator._validate_params()
1146 with config_context(
1147 skip_parameter_validation=(
1148 prefer_skip_nested_validation or global_skip_validation
1149 )
1150 ):
-> 1151 return fit_method(estimator, *args, **kwargs)
File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/manifold/_t_sne.py:1111, in TSNE.fit_transform(self, X, y)
1090 """Fit X into an embedded space and return that transformed output.
1091
1092 Parameters
(...)
1108 Embedding of the training data in low-dimensional space.
1109 """
1110 self._check_params_vs_input(X)
-> 1111 embedding = self._fit(X)
1112 self.embedding_ = embedding
1113 return self.embedding_
File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/manifold/_t_sne.py:1001, in TSNE._fit(self, X, skip_num_points)
995 # Degrees of freedom of the Student's t-distribution. The suggestion
996 # degrees_of_freedom = n_components - 1 comes from
997 # "Learning a Parametric Embedding by Preserving Local Structure"
998 # Laurens van der Maaten, 2009.
999 degrees_of_freedom = max(self.n_components - 1, 1)
-> 1001 return self._tsne(
1002 P,
1003 degrees_of_freedom,
1004 n_samples,
1005 X_embedded=X_embedded,
1006 neighbors=neighbors_nn,
1007 skip_num_points=skip_num_points,
1008 )
File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/manifold/_t_sne.py:1069, in TSNE._tsne(self, P, degrees_of_freedom, n_samples, X_embedded, neighbors, skip_num_points)
1067 opt_args["momentum"] = 0.8
1068 opt_args["n_iter_without_progress"] = self.n_iter_without_progress
-> 1069 params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args)
1071 # Save the final number of iterations
1072 self.n_iter_ = it
File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/manifold/_t_sne.py:402, in _gradient_descent(objective, p0, it, n_iter, n_iter_check, n_iter_without_progress, momentum, learning_rate, min_gain, min_grad_norm, verbose, args, kwargs)
399 # only compute the error when needed
400 kwargs["compute_error"] = check_convergence or i == n_iter - 1
--> 402 error, grad = objective(p, *args, **kwargs)
404 inc = update * grad < 0.0
405 dec = np.invert(inc)
File /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/manifold/_t_sne.py:283, in _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components, angle, skip_num_points, verbose, compute_error, num_threads)
280 indptr = P.indptr.astype(np.int64, copy=False)
282 grad = np.zeros(X_embedded.shape, dtype=np.float32)
--> 283 error = _barnes_hut_tsne.gradient(
284 val_P,
285 X_embedded,
286 neighbors,
287 indptr,
288 grad,
289 angle,
290 n_components,
291 verbose,
292 dof=degrees_of_freedom,
293 compute_error=compute_error,
294 num_threads=num_threads,
295 )
296 c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom
297 grad = grad.ravel()
KeyboardInterrupt:
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_vis, cmap='tab10',
s=15, alpha=0.8, edgecolors='k')
legend_labels = [str(i) for i in range(10)]
plt.legend(handles=scatter.legend_elements()[0], title='Digits', labels=legend_labels)
plt.title('t-SNE on MNIST')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2');