from sklearn.datasets import fetch_openml
mnist = fetch_openml("mnist_784", as_frame=False)
Xtrain, ytrain = mnist.data[:60000], mnist.target[:60000]
Xtest, ytest = mnist.data[60000:], mnist.target[60000:]
Testing Dimensionality Reduction Methods
First: reducing dimensionality of handwritten images
import matplotlib.pyplot as plt
def plotDigit(data):
    plt.imshow(data.reshape(28,28), cmap="binary")
    plt.axis("off")
from sklearn.ensemble import RandomForestClassifier
rfModel = RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1)
rfModel.fit(Xtrain, ytrain)
RandomForestClassifier(max_depth=20, n_estimators=200, n_jobs=-1)
from sklearn.metrics import ConfusionMatrixDisplay
preds = rfModel.predict(Xtest)
print("Score:",rfModel.score(Xtest, ytest))
ConfusionMatrixDisplay.from_predictions(ytest, preds, normalize="true", values_format=".0%")
Score: 0.9696
No PCA: the Random Forest trains in around 6-7 seconds, reaches about 97% accuracy, and its worst digit is classified correctly 95-96% of the time.
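The "worst digit" figure can be read off the diagonal of the normalized confusion matrix, or computed directly. A minimal sketch (not part of the original run) using per-class recall on the predictions above:
from sklearn.metrics import recall_score
import numpy as np
perDigit = recall_score(ytest, preds, average=None)  # one recall value per digit class, in label order 0-9
worst = np.argmin(perDigit)
print("Worst digit:", worst, "with recall", perDigit[worst])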
from sklearn.decomposition import PCA
pca = PCA(n_components=.95)
Xreduced = pca.fit_transform(Xtrain)
Xreduced
array([[ 123.93258866, -312.67426202, -24.51405176, ..., 55.01899792,
-20.08327427, 39.58995229],
[1011.71837587, -294.85703827, 596.33956104, ..., 7.24129874,
-12.45780869, -12.7432306 ],
[ -51.84960805, 392.17315286, -188.50974943, ..., -54.19582221,
48.47979747, -73.27826256],
...,
[-178.0534496 , 160.07821109, -257.61308227, ..., 55.54485537,
87.99883556, -5.78979735],
[ 130.60607208, -5.59193642, 513.85867395, ..., 23.30835402,
5.06237836, -65.26525587],
[-173.43595244, -24.71880226, 556.01889393, ..., 52.4956069 ,
12.63192292, -45.74001227]])
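As an aside (not part of the original run), pca.inverse_transform maps the reduced vectors back to 784 pixels, and the plotDigit helper defined earlier can show how much detail survives the 95%-variance compression. A minimal sketch:
Xrecovered = pca.inverse_transform(Xreduced)  # back to 784-dimensional pixel space
plotDigit(Xtrain[0])      # original digit
plt.show()
plotDigit(Xrecovered[0])  # reconstruction from the kept PCA components
plt.show()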
rfModel = RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1)
rfModel.fit(Xreduced, ytrain)
RandomForestClassifier(max_depth=20, n_estimators=200, n_jobs=-1)
test2 = pca.transform(Xtest)
preds = rfModel.predict(test2)
print("Score:",rfModel.score(test2, ytest))
ConfusionMatrixDisplay.from_predictions(ytest, preds, normalize="true", values_format=".0%")
Score: 0.9507
Using PCA here cost about 2 percentage points of accuracy, dropped our worst digit to 91%, and made training roughly 4x slower.
Why this happened:
- the transformed input no longer has the original pixel structure (the mostly-zero pixel grid becomes a dense set of rotated components)
- random forests, which split on one feature at a time, generally don't pair well with PCA-reduced data
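For context, n_components=.95 told PCA to keep just enough components to explain 95% of the variance. A quick check (a sketch, assuming the pca object fitted above) shows how many dimensions that actually left:
print("Components kept:", pca.n_components_)                       # typically around 150 for MNIST
print("Variance explained:", pca.explained_variance_ratio_.sum())  # should be just over 0.95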
# Trying w/ SGD Classifier
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(n_jobs=-1)
sgd.fit(Xtrain, ytrain)
preds = sgd.predict(Xtest)
print("Score:",sgd.score(Xtest, ytest))
ConfusionMatrixDisplay.from_predictions(ytest, preds, normalize="true", values_format=".0%")
Score: 0.8761
# Trying with SGD and PCA-reduced X
sgd = SGDClassifier(n_jobs=-1)
sgd.fit(Xreduced, ytrain)
preds = sgd.predict(pca.transform(Xtest))
print("Score:",sgd.score(pca.transform(Xtest), ytest))
ConfusionMatrixDisplay.from_predictions(ytest, preds, normalize="true", values_format=".0%")
Score: 0.8925
For SGD classification, PCA massively accelerates training and even improves accuracy slightly (0.8761 → 0.8925).
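One way to back up the speed claim is to time both fits directly. A minimal sketch, assuming the same SGDClassifier settings as above:
import time
for name, X in [("raw pixels", Xtrain), ("PCA-reduced", Xreduced)]:
    clf = SGDClassifier(n_jobs=-1)
    start = time.perf_counter()
    clf.fit(X, ytrain)
    print(f"{name}: {time.perf_counter() - start:.1f} s")
The table below summarizes which classifier families tend to benefit most from PCA.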
\[ \begin{array}{|l|l|l|} \hline \textbf{Classifier} & \textbf{Benefit Level} & \textbf{Why} \\ \hline \text{k-Nearest Neighbors} & \text{High} & \text{Curse of dimensionality; faster distance calc} \\ \hline \text{Support Vector Machines} & \text{High} & \text{Reduced training complexity, better kernels} \\ \hline \text{Logistic Regression} & \text{Medium} & \text{Faster training, fewer parameters} \\ \hline \text{Linear Discriminant Analysis} & \text{Medium} & \text{Speed improvement, cleaner separation} \\ \hline \text{Naive Bayes} & \text{Medium} & \text{PCA creates uncorrelated features} \\ \hline \text{Neural Networks} & \text{Medium} & \text{Faster training, but learns own features} \\ \hline \text{Random Forests} & \text{Low} & \text{Handles high dimensions well natively} \\ \hline \text{Decision Trees} & \text{Low} & \text{Built-in feature selection} \\ \hline \text{Gradient Boosting} & \text{Low} & \text{Robust to dimensionality} \\ \hline \end{array} \]
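As a rough illustration of the top row (a sketch, not part of the original run, using a 10,000-sample subset to keep it quick): kNN's distance computations scale with the number of dimensions, so the PCA-reduced data is much cheaper to query.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_jobs=-1)
knn.fit(Xreduced[:10000], ytrain[:10000])  # subset of the PCA-reduced training set
print("kNN on PCA-reduced data:", knn.score(pca.transform(Xtest), ytest))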
Compressing 784-dimensional image data into 2 dimensions
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, n_jobs=-1)
tsneReduced = tsne.fit_transform(Xtrain[:5000])
tsneReduced
array([[ 17.097141,   9.229305],
[ 60.055847, 24.304974],
[ 14.299513, -53.212894],
...,
[-29.963684, 47.90794 ],
[-35.99484 , 7.573737],
[-33.8817 , 44.292606]], dtype=float32)
import matplotlib.pyplot as plt
import numpy as np
plt.scatter(tsneReduced[:,0], tsneReduced[:,1], alpha=.35, c=ytrain[:5000].astype("int"), cmap="jet")
# handles = []
# for label in np.unique(ytrain[:5000].astype("int")):
# handles.append(plt.Line2D([0], [0], marker='o', color='w',
# markerfacecolor=plt.cm.tab10(label/10),
# markersize=8, label=f'{label}'))
# plt.legend(handles=handles, loc='upper right', bbox_to_anchor=(1.15, 1))
plt.colorbar()
plt.tight_layout()
plt.show()
The Y and X axes are basically meaningless to humans. Note how similar the 4's and 9's are; it makes sense that models confuse them most often. Interestingly, you can see 3's scattered all over the place, like the clump between the 1's and 9's. It's hard to imagine a human drawing a 3 that could be confused for both a 1 and a 9.
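The claim that 4's and 9's are the most confused pair can be checked against the confusion matrix numerically. A minimal sketch, assuming the PCA-trained random forest (rfModel) and the transformed test set (test2) from earlier are still in memory:
from sklearn.metrics import confusion_matrix
import numpy as np
cm = confusion_matrix(ytest, rfModel.predict(test2))  # rows = true digit, cols = predicted digit
np.fill_diagonal(cm, 0)                               # ignore correct predictions
i, j = np.unravel_index(cm.argmax(), cm.shape)
print(f"Most confused pair: true {i} predicted as {j} ({cm[i, j]} times)")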
# This code directly taken from https://colab.research.google.com/github/ageron/handson-mlp/blob/main/07_dimensionality_reduction.ipynb#scrollTo=koiUl6sFjdZv
from sklearn.preprocessing import MinMaxScaler
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
def plot_digits(X, y, min_distance=0.04, images=None, figsize=(13, 10)):
    # Let's scale the input features so that they range from 0 to 1
    X_normalized = MinMaxScaler().fit_transform(X)
    # Now we create the list of coordinates of the digits plotted so far.
    # We pretend that one is already plotted far away at the start, to
    # avoid `if` statements in the loop below
    neighbors = np.array([[10., 10.]])
    # The rest should be self-explanatory
    plt.figure(figsize=figsize)
    cmap = plt.cm.jet
    digits = np.unique(y)
    for digit in digits:
        plt.scatter(X_normalized[y == digit, 0], X_normalized[y == digit, 1],
                    c=[cmap(float(digit) / 9)], alpha=0.5)
    plt.axis("off")
    ax = plt.gca()  # get current axes
    for index, image_coord in enumerate(X_normalized):
        closest_distance = np.linalg.norm(neighbors - image_coord, axis=1).min()
        if closest_distance > min_distance:
            neighbors = np.r_[neighbors, [image_coord]]
            if images is None:
                plt.text(image_coord[0], image_coord[1], str(int(y[index])),
                         color=cmap(float(y[index]) / 9),
                         fontdict={"weight": "bold", "size": 16})
            else:
                image = images[index].reshape(28, 28)
                imagebox = AnnotationBbox(OffsetImage(image, cmap="binary"),
                                          image_coord)
                ax.add_artist(imagebox)
# Compare two digits with this cell
digit1 = 9
digit2 = 4
index = (ytrain.astype("int")[:5000] == digit1) | (ytrain.astype("int")[:5000] == digit2)
xsmall = tsneReduced[:5000][index]
ysmall = ytrain[:5000][index]
imgs = Xtrain[:5000][index]
plot_digits(xsmall, ysmall, images=imgs, figsize=(17,17))