Commit 94c100a

deploy: d13a272

85 files changed: +13631 −0 lines changed

.buildinfo

Lines changed: 4 additions & 0 deletions

# Sphinx build info version 1
# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 5b3d3f4f74ac37c72e560212dde728c0
tags: 645f666f9bcd5a90fca523b33c5a78b7

.nojekyll

Whitespace-only changes.

3 binary files not shown.

Lines changed: 360 additions & 0 deletions

"""
===============================
Iris Dataset Clustering Example
===============================

This example illustrates the use of the Radius clustering library on the Iris dataset.
It shows how to use the library to cluster the Iris dataset and compares the result
with the k-means clustering algorithm.

The example includes:

1. Loading the Iris dataset
2. Applying Radius clustering and k-means clustering
3. Visualizing the clustering results

This example serves as a simple introduction to using the Radius clustering library
on a well-known dataset.
"""
# Author: Haenn Quentin
# SPDX-License-Identifier: MIT


# %%
# Load the Iris dataset
# ---------------------
#
# We start by loading the Iris dataset using the `load_iris` function from `sklearn.datasets`.
# The Iris dataset is a well-known dataset that contains 150 samples of iris flowers.
# Each sample has 4 features: sepal length, sepal width, petal length, and petal width.
# The dataset is labeled with 3 classes: setosa, versicolor, and virginica.

import numpy as np
from sklearn import datasets
from radius_clustering import RadiusClustering

# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# %%
# Visualize the Iris dataset
# --------------------------
#
# We can visualize the Iris dataset by plotting it. We use PCA to reduce the dimensionality to 3D
# and plot the dataset in a 3D scatter plot.
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import mpl_toolkits.mplot3d  # noqa: F401 (registers the "3d" projection)

# Reduce the dimensionality of the dataset to 3D using PCA
pca = PCA(n_components=3)
iris_reduced = pca.fit_transform(X)
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
ax.scatter(
    iris_reduced[:, 0],
    iris_reduced[:, 1],
    iris_reduced[:, 2],
    c=y,
    cmap="Dark2",
    s=40,
)
# Set plot labels
ax.set_title("Iris dataset in first 3 PCA components")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
ax.set_zlabel("3rd eigenvector")

# Hide tick labels
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])

plt.show()
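
# %%
# A small aside (an addition, not part of the original example): how much variance
# the three principal components retain, i.e. how faithful the 3D view is.
print("Explained variance ratio:", pca.explained_variance_ratio_)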

# %%
# Compute Clustering with Radius Clustering
# -----------------------------------------
#
# We can now apply Radius clustering to the Iris dataset.
# We create an instance of the `RadiusClustering` class and fit it to the Iris dataset.
import time

rad = RadiusClustering(manner="exact", threshold=1.43)
t0 = time.time()
rad.fit(X)
t_rad = time.time() - t0
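
# %%
# A quick sanity check (an addition, not part of the original example). It relies
# only on the ``centers_`` and ``labels_`` attributes that the rest of this script
# already uses, and reports how many clusters the exact solver selected.
print(f"Radius clustering selected {len(rad.centers_)} centers in {t_rad:.2f}s")
print("Cluster sizes:", np.bincount(np.asarray(rad.labels_, dtype=int)))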

# %%
# Compute KMeans Clustering for Comparison
# ----------------------------------------
#
# We can also apply KMeans clustering to the Iris dataset for comparison.

from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_kmeans = time.time() - t0

# %%
# Establishing parity between clusters
# ------------------------------------
#
# We want to have the same color for the same cluster in both plots.
# We can achieve this by matching the cluster labels of the Radius clustering and the KMeans clustering.
# First we define a function that retrieves the cluster centers from the Radius clustering and the
# KMeans clustering and matches them pairwise.

from sklearn.metrics.pairwise import pairwise_distances_argmin


def get_order_labels(kmeans, rad, data):
    centers1_cpy = kmeans.cluster_centers_.copy()
    centers2_cpy = data[rad.centers_].copy()
    order = []
    # For each center in the first clustering, find the closest center in the second clustering
    for center in centers1_cpy:
        match = pairwise_distances_argmin([center], centers2_cpy)
        # If there is only one center left, assign it the last cluster label not yet assigned
        if len(centers2_cpy) == 1:
            for i in range(len(centers1_cpy)):
                if i not in order:
                    order.append(i)
                    break
            break
        # Get the coordinates of the matched center in the second clustering
        coordinates = centers2_cpy[match]
        # Find the closest point in the data to that center to get its cluster label
        closest_point = pairwise_distances_argmin(coordinates, data)
        match_label = rad.labels_[closest_point]
        # Remove the matched center from the second clustering
        centers2_cpy = np.delete(centers2_cpy, match, axis=0)
        # Add the cluster label to the order
        order.append(int(match_label[0]))
    return order

rad_centers_index = np.array(rad.centers_)
order = get_order_labels(k_means, rad, X)

kmeans_centers = k_means.cluster_centers_
rad_centers = rad_centers_index[order]
rad_centers_coordinates = X[rad_centers]

# Pair the cluster labels
kmeans_labels = pairwise_distances_argmin(X, kmeans_centers)
rad_labels = pairwise_distances_argmin(X, rad_centers_coordinates)
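
# %%
# An aside, not in the original example: the greedy matching in ``get_order_labels``
# can also be done optimally with the Hungarian algorithm. A minimal sketch, assuming
# SciPy is available; it pairs each k-means center with a Radius center while
# minimizing the total pairing distance.
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist

cost = cdist(k_means.cluster_centers_, X[rad.centers_])  # center-to-center distances
row_ind, col_ind = linear_sum_assignment(cost)  # optimal one-to-one assignment
print("Optimal center pairing (kmeans -> radius):", list(zip(row_ind, col_ind)))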

# %%
# Plotting the results and the difference
# ---------------------------------------

fig = plt.figure(figsize=(12, 6))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ["#4EACC5", "#FF9C34", "#4E9A06"]

# KMeans
ax = fig.add_subplot(1, 3, 1, projection="3d", elev=48, azim=134, roll=0)

ax.scatter(
    iris_reduced[:, 0],
    iris_reduced[:, 1],
    iris_reduced[:, 2],
    c=kmeans_labels,
    cmap="Dark2",
    s=40,
)
# adapting center coordinates to the 3D plot
kmeans_centers = pca.transform(kmeans_centers)
ax.scatter(
    kmeans_centers[:, 0],
    kmeans_centers[:, 1],
    kmeans_centers[:, 2],
    c="r",
    s=200,
)
ax.set_title("KMeans")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())

ax.text3D(-3.5, 3, 1.0, "train time: %.2fs\ninertia: %f" % (t_kmeans, k_means.inertia_))

# Radius clustering ("MDS" = Minimum Dominating Set)
ax = fig.add_subplot(1, 3, 2, projection="3d", elev=48, azim=134, roll=0)
ax.scatter(
    iris_reduced[:, 0],
    iris_reduced[:, 1],
    iris_reduced[:, 2],
    c=rad_labels,
    cmap="Dark2",
    s=40,
)
# adapting center coordinates to the 3D plot
rad_centers_coordinates = pca.transform(rad_centers_coordinates)
ax.scatter(
    rad_centers_coordinates[:, 0],
    rad_centers_coordinates[:, 1],
    rad_centers_coordinates[:, 2],
    c="r",
    s=200,
)
ax.set_title("MDS Clustering")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())
ax.text3D(-3.5, 3, 0.0, "train time: %.2fs" % t_rad)

# Initialize the `different` array to all False (labels are in {0, 1, 2}, never 4)
different = rad_labels == 4
ax = fig.add_subplot(1, 3, 3, projection="3d", elev=48, azim=134, roll=0)

for k in range(3):
    different += (kmeans_labels == k) != (rad_labels == k)

identical = np.logical_not(different)
# points in this panel are drawn at z=0
ax.scatter(
    iris_reduced[identical, 0], iris_reduced[identical, 1], color="#bbbbbb", marker="."
)
ax.scatter(iris_reduced[different, 0], iris_reduced[different, 1], color="m")
ax.set_title("Difference")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())

plt.show()
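
# %%
# A quantitative companion to the "Difference" panel above (an addition, not in the
# original example): the share of points on which the two label assignments disagree.
print(f"Points labelled differently: {different.sum()} / {len(X)} "
      f"({100 * different.mean():.1f}%)")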

# %%
# Another difference plot
# -----------------------
#
# As we saw, the difference plot is not very informative on Iris.
# We'll use a different dataset, Wine, to show a more interesting difference plot.

wine = datasets.load_wine()
X = wine.data
y = wine.target
pca = PCA(n_components=3)
wine_reduced = pca.fit_transform(X)

# Compute clustering with Radius clustering (MDS)

rad = RadiusClustering(manner="exact", threshold=232.09)
t0 = time.time()
rad.fit(X)
t_rad = time.time() - t0

# Compute KMeans clustering for comparison

k_means = KMeans(n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_kmeans = time.time() - t0
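
# %%
# An aside, not in the original example: the ``threshold`` radius is data-dependent.
# The Wine features are unscaled, so pairwise distances are far larger than on Iris;
# a quick look at their range shows the scale a radius like 232.09 lives on.
from scipy.spatial.distance import pdist

dists = pdist(X)
print(f"Pairwise distances: min={dists.min():.1f}, "
      f"median={np.median(dists):.1f}, max={dists.max():.1f}")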

# %%
# Reapplying the same process as before
# -------------------------------------

rad_centers_index = np.array(rad.centers_)
order = get_order_labels(k_means, rad, X)

kmeans_centers = k_means.cluster_centers_
rad_centers = rad_centers_index[order]
rad_centers_coordinates = X[rad_centers]

# Pair the cluster labels
kmeans_labels = pairwise_distances_argmin(X, kmeans_centers)
rad_labels = pairwise_distances_argmin(X, rad_centers_coordinates)

# %%
# Plotting the results and the difference
# ---------------------------------------

fig = plt.figure(figsize=(12, 6))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ["#4EACC5", "#FF9C34", "#4E9A06"]

# KMeans
ax = fig.add_subplot(1, 3, 1, projection="3d", elev=48, azim=134, roll=0)

ax.scatter(
    wine_reduced[:, 0],
    wine_reduced[:, 1],
    wine_reduced[:, 2],
    c=kmeans_labels,
    cmap="Dark2",
    s=40,
)
# adapting center coordinates to the 3D plot
kmeans_centers = pca.transform(kmeans_centers)
ax.scatter(
    kmeans_centers[:, 0],
    kmeans_centers[:, 1],
    kmeans_centers[:, 2],
    c="r",
    s=200,
)
ax.set_title("KMeans")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())

ax.text3D(
    60.0, 80.0, 0.0, "train time: %.2fs\ninertia: %f" % (t_kmeans, k_means.inertia_)
)

# Radius clustering ("MDS" = Minimum Dominating Set)
ax = fig.add_subplot(1, 3, 2, projection="3d", elev=48, azim=134, roll=0)
ax.scatter(
    wine_reduced[:, 0],
    wine_reduced[:, 1],
    wine_reduced[:, 2],
    c=rad_labels,
    cmap="Dark2",
    s=40,
)
# adapting center coordinates to the 3D plot
rad_centers_coordinates = pca.transform(rad_centers_coordinates)
ax.scatter(
    rad_centers_coordinates[:, 0],
    rad_centers_coordinates[:, 1],
    rad_centers_coordinates[:, 2],
    c="r",
    s=200,
)
ax.set_title("MDS Clustering")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())
ax.text3D(60.0, 80.0, 0.0, "train time: %.2fs" % t_rad)

# Initialize the `different` array to all False (labels are in {0, 1, 2}, never 4)
different = rad_labels == 4
ax = fig.add_subplot(1, 3, 3, projection="3d", elev=48, azim=134, roll=0)

for k in range(3):
    different += (kmeans_labels == k) != (rad_labels == k)

identical = np.logical_not(different)
# points in this panel are drawn at z=0
ax.scatter(
    wine_reduced[identical, 0], wine_reduced[identical, 1], color="#bbbbbb", marker="."
)
ax.scatter(wine_reduced[different, 0], wine_reduced[different, 1], color="m")
ax.set_title("Difference")
ax.set_xticks(())
ax.set_yticks(())
ax.set_zticks(())

plt.show()
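
# %%
# To put numbers behind the conclusion below (an addition, not in the original
# example), compare the cluster sizes each algorithm produces on Wine.
print("k-means cluster sizes:", np.bincount(kmeans_labels))
print("Radius clustering cluster sizes:", np.bincount(rad_labels))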

# %%
# Conclusion
# ----------
#
# In this example, we applied Radius clustering to the Iris and Wine datasets and compared it with KMeans clustering.
# We visualized the clustering results and the difference between the two clustering algorithms.
# We saw that Radius clustering can lead to smaller clusters than k-means, which produces much more balanced clusters.
# The difference plot can be very useful for seeing where the two clustering algorithms differ.
