# Daily Dose of Data Science

[Effortlessly Scale tSNE to Millions of Data Points With openTSNE](https://www.blog.dailydoseofds.com/p/effortlessly-scale-tsne-to-millions)

Author: Avi Chawla

In [None]:
!conda install --channel conda-forge opentsne

## Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_openml
from openTSNE import TSNE
from sklearn.manifold import TSNE as TSNE_SK

# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

In [None]:
# Hex colour palette; the plotting cells index it by integer class label.
colors = [
    '#fe7c73',
    '#2471A3',
    '#3498DB',
    '#27AE60',
    '#82E0AA',
    '#D35400',
    '#5D6D7E',
    '#E74C3C',
    '#21618C',
    '#B7950B',
    '#46C7C7',
    '#00B9FF',
]

## Dataset

In [None]:
# Load MNIST dataset from OpenML.
# `as_frame=True` is requested explicitly: the `.to_numpy()` calls below need
# pandas objects, and the default value of `as_frame` differs between
# scikit-learn releases (older ones return plain ndarrays).
mnist = fetch_openml('mnist_784', as_frame=True)

# Extract features and labels
X, y = mnist['data'], mnist['target']

# Convert to numpy arrays. Labels are cast to integers because the plotting
# cells use them as indices into the `colors` palette.
X = X.to_numpy().astype('float32')
y = y.to_numpy().astype('int')

# Sanity check: every sample must have exactly one label.
assert X.shape[0] == y.shape[0], "features and labels have different row counts"

# Print the shape of the arrays
print("Shape of features (X):", X.shape)
print("Shape of labels (y):", y.shape)

## openTSNE

In [None]:
# Configure the openTSNE estimator; nothing is computed until `fit` is called.
tsne = TSNE(
    # Embedding hyper-parameters
    metric="euclidean",
    perplexity=25,
    # Execution settings: parallelism, reproducibility, progress logging
    n_jobs=8,
    random_state=42,
    verbose=True,
)

In [None]:
# Run openTSNE on the full feature matrix; the result is indexed as a 2-D
# array (`embeddings[:, 0]`, `embeddings[:, 1]`) by the plotting cell.
embeddings = tsne.fit(X)

In [None]:
# Display the shape of the embedding produced by `tsne.fit`.
embeddings.shape

In [None]:
# Scatter-plot the openTSNE embedding, colouring each point by its label.
point_colors = [colors[label] for label in y]
xs, ys = embeddings[:, 0], embeddings[:, 1]

fig, ax = plt.subplots()
ax.scatter(xs, ys, c=point_colors, s=5)
ax.set_title("openTSNE Visualisation", fontsize=20, weight="bold")

plt.show()

## Sklearn

In [None]:
# Build scikit-learn's t-SNE with the same perplexity, seed and job count as
# the openTSNE run above, then embed the same feature matrix.
tsne_sklearn = TSNE_SK(
    n_components=2,
    learning_rate=20,
    perplexity=25,
    random_state=42,
    n_jobs=8,
)
embeddings_sklearn = tsne_sklearn.fit_transform(X)

In [None]:
# Scatter-plot the scikit-learn embedding, colouring each point by its label.
point_colors = [colors[label] for label in y]
xs, ys = embeddings_sklearn[:, 0], embeddings_sklearn[:, 1]

fig, ax = plt.subplots()
ax.scatter(xs, ys, c=point_colors, s=5)
ax.set_title("Sklearn tSNE", fontsize=20, weight="bold")

plt.show()