Principal components analysis (PCA)#

Let \(\boldsymbol X \in \mathbb R^{n\times d}\) be a feature matrix consisting of centered data: \(\sum\limits_{i=1}^n \boldsymbol x_i = \boldsymbol 0\). We would like to approximate each \(\boldsymbol x_i\) by a low-dimensional representation \(\boldsymbol z_i \in \mathbb R^\ell\), \(\ell < d\). In PCA we assume that each \(\boldsymbol x_i\) can be approximately represented as a linear combination of vectors \(\boldsymbol w_1,\ldots, \boldsymbol w_\ell \in \mathbb R^d\):

\[ \boldsymbol x_i \approx \sum \limits_{k=1}^\ell z_{ik} \boldsymbol w_k. \]

Let \(\boldsymbol Z = (z_{ik})\) be an \(n\times\ell\) matrix of coefficients, and let

\[ \boldsymbol W = [\boldsymbol w_1 \ldots \boldsymbol w_\ell] \in \mathbb R^{d \times \ell}. \]

In PCA we minimize the loss function

\[ \mathcal L(\boldsymbol W) = \frac 1n\big\Vert \boldsymbol X - \boldsymbol{ZW}^\mathsf{T}\big\Vert_F^2 = \frac 1n\big\Vert \boldsymbol X^\mathsf{T} - \boldsymbol{WZ}^\mathsf{T}\big\Vert_F^2 = \frac 1n\sum\limits_{i=1}^n \big\Vert \boldsymbol x_i - \boldsymbol {Wz}_i\big\Vert_2^2. \]
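
If we additionally require the columns of \(\boldsymbol W\) to be orthonormal (\(\boldsymbol W^\mathsf{T}\boldsymbol W = \boldsymbol I_\ell\)), then for a fixed \(\boldsymbol W\) the optimal coefficients are obtained by orthogonal projection,

\[ \widehat{\boldsymbol Z} = \boldsymbol{XW}, \qquad \widehat{\boldsymbol z}_i = \boldsymbol W^\mathsf{T}\boldsymbol x_i, \]

so the loss is effectively minimized over \(\boldsymbol W\) alone.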

This loss function attains its minimum when

\[ \widehat{\boldsymbol W} = [\boldsymbol u_1 \ldots \boldsymbol u_\ell], \]

where \(\boldsymbol u_1, \ldots, \boldsymbol u_\ell\) are the eigenvectors corresponding to the \(\ell\) largest eigenvalues of \(\boldsymbol X^\mathsf{T}\boldsymbol X\).

One can also show, using the singular value decomposition of \(\boldsymbol X\),

\[ \boldsymbol X = \boldsymbol{U \Sigma V}^\mathsf{T} \]

that \(\widehat{\boldsymbol W}\) equals the first \(\ell\) columns of \(\boldsymbol V\).
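
A minimal NumPy sketch on synthetic data (not part of the MNIST pipeline below) checking this equivalence: the leading eigenvectors of \(\boldsymbol X^\mathsf{T}\boldsymbol X\) coincide, up to sign, with the first columns of \(\boldsymbol V\).

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
X -= X.mean(axis=0)                        # center the data

# Eigenvectors of X^T X, sorted by decreasing eigenvalue
eigvals, eigvecs = np.linalg.eigh(X.T @ X)
order = np.argsort(eigvals)[::-1]
W_eig = eigvecs[:, order[:2]]

# First two right singular vectors of X
U, S, Vt = np.linalg.svd(X, full_matrices=False)
W_svd = Vt[:2].T

# The two bases agree up to the sign of each column
print(np.allclose(np.abs(W_eig.T @ W_svd), np.eye(2), atol=1e-6))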

PCA on MNIST#

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

%config InlineBackend.figure_format = 'svg'

# Load MNIST: 70,000 grayscale 28x28 digit images flattened to 784 features
X, y = fetch_openml('mnist_784', return_X_y=True, parser='auto')
X = X.astype(float).values / 255          # scale pixel values to [0, 1]
y = y.astype(int).values
X -= np.mean(X, axis=0, keepdims=True)    # center the features, as PCA assumes

# Hold out a stratified subset of 10,000 samples for visualization
X_train, X_vis, y_train, y_vis = train_test_split(X, y, test_size=10000, stratify=y, random_state=12)

Perform PCA with 2 components:

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_vis)
print("Explained variance:", pca.explained_variance_) 
print("Explained variance ratio:", pca.explained_variance_ratio_.sum())
Explained variance: [5.16209074 3.77201984]
Explained variance ratio: 0.1693879257076425
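
As a sanity check (a sketch relying on X_vis from the cells above), the explained variance reported by PCA can be reproduced from the singular values of the re-centered visualization split, since it is computed as \(\sigma_k^2/(n-1)\):

# Singular values of the re-centered visualization split
Xc = X_vis - X_vis.mean(axis=0)
sigma = np.linalg.svd(Xc, compute_uv=False)
var = sigma**2 / (len(X_vis) - 1)
print(var[:2])                      # should match pca.explained_variance_
print(var[:2].sum() / var.sum())    # should match the ratio printed above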

Visualize the result of PCA:

scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_vis, cmap='tab10',
                      s=15, alpha=0.8, edgecolors='k')
legend_labels = [str(i) for i in range(10)]
plt.legend(handles=scatter.legend_elements()[0], title='Digits', labels=legend_labels)
plt.title('PCA on MNIST')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2');
(Figure: scatter plot of the 2-D PCA projection of the MNIST visualization split, points colored by digit.)
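
Since PCA is a linear projection, the two-dimensional codes can be mapped back to pixel space with inverse_transform. A short sketch (using pca, X_pca, X_vis and y_vis from the cells above) comparing a few originals with their rather coarse 2-component reconstructions:

# Map the 2-D codes back to 784-dimensional pixel space
X_rec = pca.inverse_transform(X_pca)

fig, axes = plt.subplots(2, 5, figsize=(8, 3.5))
for i in range(5):
    axes[0, i].imshow(X_vis[i].reshape(28, 28), cmap='gray')
    axes[0, i].set_title(f'digit {y_vis[i]}')
    axes[1, i].imshow(X_rec[i].reshape(28, 28), cmap='gray')
for ax in axes.ravel():
    ax.axis('off')
plt.tight_layout()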

t-SNE on MNIST#

from sklearn.manifold import TSNE

# Note: t-SNE on all 10,000 raw 784-dimensional points is computationally heavy
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X_vis)
(Output truncated: the cell was interrupted with a KeyboardInterrupt while TSNE.fit_transform was still running.)
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_vis, cmap='tab10',
                      s=15, alpha=0.8, edgecolors='k')
legend_labels = [str(i) for i in range(10)]
plt.legend(handles=scatter.legend_elements()[0], title='Digits', labels=legend_labels)
plt.title('t-SNE on MNIST')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2');
(Figure: scatter plot of the 2-D t-SNE embedding of the MNIST visualization split, points colored by digit.)
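
If the full fit is too slow, as in the interrupted run above, one workaround sketch (the variable names below are illustrative, not from the original notebook) is to run t-SNE on a smaller subsample after a PCA pre-reduction to 50 components:

# Hedged workaround: t-SNE on a 2,000-point subsample of X_vis,
# reduced to 50 PCA components first to speed up neighbour computations
rng = np.random.default_rng(12)
idx = rng.choice(len(X_vis), size=2000, replace=False)
X_small = PCA(n_components=50).fit_transform(X_vis[idx])
X_tsne_small = TSNE(n_components=2, init='pca', random_state=12).fit_transform(X_small)

plt.scatter(X_tsne_small[:, 0], X_tsne_small[:, 1], c=y_vis[idx], cmap='tab10', s=15, alpha=0.8)
plt.title('t-SNE on a 2,000-point subsample of MNIST');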