Skip to content

Commit 8c581f7

Browse files
committed
Added t-SNE with Iris dataset example
1 parent 788d95b commit 8c581f7

File tree

1 file changed

+158
-0
lines changed

1 file changed

+158
-0
lines changed
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import doctest
2+
3+
import numpy as np
4+
from numpy import ndarray
5+
from sklearn.datasets import load_iris
6+
7+
8+
def collect_dataset() -> tuple[ndarray, ndarray]:
    """
    Fetch the Iris dataset as a pair of NumPy arrays.

    Returns:
        tuple[ndarray, ndarray]: the feature matrix followed by the target labels

    Example:
    >>> x, y = collect_dataset()
    >>> x.shape
    (150, 4)
    >>> y.shape
    (150,)
    """
    iris = load_iris()
    # Convert explicitly so callers always receive plain ndarrays.
    features = np.array(iris.data)
    targets = np.array(iris.target)
    return features, targets
22+
23+
24+
def compute_pairwise_affinities(data_x: ndarray, sigma: float = 1.0) -> ndarray:
    """
    Compute high-dimensional joint affinities (P matrix) using a Gaussian kernel.

    Each row of the kernel matrix is normalized into conditional probabilities
    p(j|i). The joint matrix is then p_ij = (p(j|i) + p(i|j)) / (2 * n_samples),
    which is symmetric and sums to 1 as long as every point has a non-zero
    affinity to at least one other point.

    Args:
        data_x: Input data of shape (n_samples, n_features)
        sigma: Gaussian kernel bandwidth shared by all points (must be non-zero)

    Returns:
        ndarray: Symmetrized joint probability matrix of shape
        (n_samples, n_samples) with a zero diagonal

    Raises:
        ValueError: If sigma is zero

    Example:
    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> p = compute_pairwise_affinities(x)
    >>> float(round(p[0, 1], 3))
    0.5
    >>> float(round(p.sum(), 3))
    1.0
    """
    if sigma == 0:
        raise ValueError("sigma must be non-zero")

    n_samples = data_x.shape[0]

    # Squared Euclidean distances via ||a||^2 - 2 a.b + ||b||^2.
    sum_x = np.sum(np.square(data_x), axis=1)
    dist_sq = np.add(np.add(-2 * np.dot(data_x, data_x.T), sum_x).T, sum_x)
    # Round-off in the expansion above can produce tiny negative distances.
    dist_sq = np.maximum(dist_sq, 0.0)

    kernel = np.exp(-dist_sq / (2 * sigma**2))
    np.fill_diagonal(kernel, 0)

    # Normalize per row (conditional probabilities); dividing a globally
    # normalized matrix by 2n would leave the result summing to 1/n.
    row_sums = np.sum(kernel, axis=1, keepdims=True)
    # Guard against rows that underflowed to zero (or a single-sample input).
    conditional = kernel / np.maximum(row_sums, np.finfo(float).tiny)

    return (conditional + conditional.T) / (2 * n_samples)
45+
46+
47+
def compute_low_dim_affinities(low_dim_embedding: ndarray) -> tuple[ndarray, ndarray]:
    """
    Build the low-dimensional affinity matrix (Q) from a Student-t kernel.

    Args:
        low_dim_embedding: shape (n_samples, n_components)

    Returns:
        tuple[ndarray, ndarray]: the normalized Q matrix and the unnormalized
        kernel 1 / (1 + squared distance) with its diagonal zeroed

    Example:
    >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> q, num = compute_low_dim_affinities(y)
    >>> q.shape
    (2, 2)
    """
    sq_norms = np.sum(np.square(low_dim_embedding), axis=1)
    gram = np.dot(low_dim_embedding, low_dim_embedding.T)

    # Pairwise squared distances: ||a||^2 - 2 a.b + ||b||^2.
    sq_dist = np.add(np.add(-2 * gram, sq_norms).T, sq_norms)

    kernel = 1 / (1 + sq_dist)
    # A point has no affinity to itself.
    np.fill_diagonal(kernel, 0)

    total = np.sum(kernel)
    return kernel / total, kernel
71+
72+
73+
def apply_tsne(
    data_x: ndarray,
    n_components: int = 2,
    learning_rate: float = 200.0,
    n_iter: int = 500,
    random_state: "int | None" = None,
) -> ndarray:
    """
    Apply t-SNE for dimensionality reduction.

    The embedding is initialized with small Gaussian noise and refined by
    gradient descent with momentum on the KL divergence between the
    high-dimensional affinities P and the low-dimensional affinities Q.

    Args:
        data_x: Original dataset (features)
        n_components: Target dimension (2D or 3D)
        learning_rate: Step size for gradient descent
        n_iter: Number of iterations
        random_state: Optional seed for the initial embedding; None (the
            default) draws fresh entropy, so results differ between runs

    Returns:
        ndarray: Low-dimensional embedding of the data

    Raises:
        ValueError: If n_components or n_iter is smaller than 1

    Example:
    >>> x, _ = collect_dataset()
    >>> y_emb = apply_tsne(x, n_components=2, n_iter=50)
    >>> y_emb.shape
    (150, 2)
    """
    if n_components < 1 or n_iter < 1:
        raise ValueError("n_components and n_iter must be >= 1")

    n_samples = data_x.shape[0]
    rng = np.random.default_rng(random_state)
    y = rng.standard_normal((n_samples, n_components)) * 1e-4

    p = compute_pairwise_affinities(data_x)
    # Floor the probabilities so P - Q never involves exact zeros.
    p = np.maximum(p, 1e-12)

    y_inc = np.zeros_like(y)
    momentum = 0.5
    momentum_switch_iter = n_iter // 4

    for i in range(n_iter):
        q, num = compute_low_dim_affinities(y)
        q = np.maximum(q, 1e-12)

        # KL gradient: dC/dy_i = 4 * sum_j w_ij * (y_i - y_j), where
        # w_ij = (p_ij - q_ij) * num_ij. In matrix form this is
        # rowsum(W) * y - W @ y. The opposite order would flip the sign
        # and turn the update below into gradient ascent.
        weights = (p - q) * num
        d_y = 4 * (np.sum(weights, axis=1)[:, np.newaxis] * y - np.dot(weights, y))

        y_inc = momentum * y_inc - learning_rate * d_y
        y += y_inc

        # Use a larger momentum once the early iterations are done.
        if i == momentum_switch_iter:
            momentum = 0.8

    return y
124+
125+
126+
def main() -> None:
    """
    Run t-SNE on the Iris dataset and display the first 5 embeddings.

    After printing, a scatter plot of the embedding colored by class label is
    shown when matplotlib is installed; otherwise the plot is skipped.

    Raises:
        TypeError: If the computed embedding is not an ndarray

    Example:
    >>> main()  # doctest: +ELLIPSIS
    t-SNE embedding (first 5 points):
    [[...
    """
    data_x, labels = collect_dataset()
    y_emb = apply_tsne(data_x, n_components=2, n_iter=300)

    if not isinstance(y_emb, np.ndarray):
        raise TypeError("t-SNE embedding must be an ndarray")

    print("t-SNE embedding (first 5 points):")
    print(y_emb[:5])

    # The visualization is optional: import matplotlib lazily and skip the
    # plot instead of crashing when the package is not installed.
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib is not installed; skipping visualization.")
        return

    plt.scatter(
        y_emb[:, 0],
        y_emb[:, 1],
        c=labels,
        cmap="viridis",
    )
    plt.title("t-SNE Visualization of Iris Dataset")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.show()
155+
156+
if __name__ == "__main__":
    # Script entry point: only main() runs. The doctest run below is left
    # disabled; uncomment it to execute the docstring examples instead.
    # doctest.testmod()
    main()

0 commit comments

Comments
 (0)