"""
=====================================================================================
Benchmark of Radius Clustering using multiple datasets and comparison with custom MDS
=====================================================================================

This example demonstrates how to implement a custom solver for the MDS problem
and use it within the Radius Clustering framework.
It also compares a naive implementation based on the `NetworkX` library
with the built-in Radius Clustering solvers.

The example includes:
    1. Defining the custom MDS solver.
    2. Defining datasets to test the clustering.
    3. Applying Radius clustering on the datasets using the custom MDS solver.
    4. Ensuring this solution works.
    5. Establishing a benchmark procedure to compare the Radius clustering with a naive implementation using `NetworkX`.
    6. Comparing the results in terms of:
        - Execution time
        - Number of clusters found
    7. Visualizing the benchmark results.

This example is useful for understanding how to implement a custom MDS solver
and how to make advanced use of the package.
"""
# Author: Haenn Quentin
# SPDX-License-Identifier: MIT

# %%
# Import necessary libraries
# --------------------------
#
# Since this example is a benchmark, we need to import the libraries required
# to run it: `NetworkX` for the naive implementation, `matplotlib` for
# visualization, and `sklearn` to fetch the datasets.

import time
import warnings

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from sklearn.datasets import fetch_openml

from radius_clustering import RadiusClustering

warnings.filterwarnings("ignore", category=RuntimeWarning, module="sklearn")
# %%
# Define a custom MDS solver
# --------------------------
#
# We define a custom MDS solver that uses the `NetworkX` library to compute the MDS.
# Note that the function's signature is identical to the one expected by the
# `RadiusClustering` class.


def custom_solver(n: int, edges: np.ndarray, nb_edges: int, random_state=None):
    """
    Custom MDS solver using NetworkX to solve the MDS problem.

    Parameters
    ----------
    n : int
        The number of points in the dataset.
    edges : np.ndarray
        The edges of the graph, given as an array of (u, v) index pairs.
    nb_edges : int
        The number of edges in the graph.
    random_state : int | None
        The random state to use for reproducibility.
        (Unused in this implementation, but part of the expected signature.)

    Returns
    -------
    centers : list
        A sorted list of the centers of the clusters.
    mds_exec_time : float
        The execution time of the MDS algorithm in seconds.
    """
    G = nx.Graph()
    # Add all n nodes explicitly so isolated points (which must become their
    # own cluster centers) are not lost when building the graph from edges.
    G.add_nodes_from(range(n))
    G.add_edges_from(edges)

    start_time = time.time()
    centers = list(nx.algorithms.dominating.dominating_set(G))
    mds_exec_time = time.time() - start_time

    centers = sorted(centers)

    return centers, mds_exec_time


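# %%
# Quick sanity check of the custom solver
# ---------------------------------------
#
# Before plugging the solver into the benchmark, we can try it on a tiny
# hand-built graph. This is only a sketch: `NetworkX`'s greedy
# `dominating_set` guarantees a valid dominating set, not a minimum one,
# so the number of centers returned may exceed the optimum.

toy_edges = np.array([[0, 1], [1, 2], [2, 3]])  # a path graph on 4 nodes
toy_centers, toy_time = custom_solver(n=4, edges=toy_edges, nb_edges=3)
print(f"Toy graph centers: {toy_centers} (computed in {toy_time:.6f} s)")
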
# %%
# Define datasets to test the clustering
# --------------------------------------
#
# We will use 8 datasets to test the clustering:
# 1. Iris dataset
# 2. Wine dataset
# 3. Glass dataset
# 4. Ionosphere dataset
# 5. Breast Cancer dataset (WDBC)
# 6. Synthetic Control dataset
# 7. Vehicle dataset
# 8. Yeast dataset
# These are common machine learning datasets, small enough for the benchmark to run quickly.
# Structure of the variable `DATASETS`:
# - The key is the name of the dataset.
# - The value is a tuple containing:
#     - The dataset fetched from OpenML.
#     - The radius to use for the Radius clustering (determined in the literature, see references on the home page).
#


DATASETS = {
    "iris": (fetch_openml(name="iris", version=1, as_frame=False), 1.43),
    "wine": (fetch_openml(name="wine", version=1, as_frame=False), 232.09),
    "glass": (fetch_openml(name="glass", version=1, as_frame=False), 3.94),
    "ionosphere": (fetch_openml(name="ionosphere", version=1, as_frame=False), 5.46),
    "breast_cancer": (fetch_openml(name="wdbc", version=1, as_frame=False), 1197.42),
    "synthetic": (fetch_openml(name="synthetic_control", version=1, as_frame=False), 70.12),
    "vehicle": (fetch_openml(name="vehicle", version=1, as_frame=False), 155.05),
    "yeast": (fetch_openml(name="yeast", version=1, as_frame=False), 0.4235),
}

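# %%
# As a quick check that every dataset was fetched correctly, we can print its
# shape together with the radius we will use. This is an optional sketch and
# assumes the OpenML downloads above succeeded.

for name, (dataset, radius) in DATASETS.items():
    n_samples, n_features = dataset.data.shape
    print(f"{name:>13}: {n_samples:4d} samples, {n_features:3d} features, radius={radius}")
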
# %%
# Define the benchmark procedure
# ------------------------------
#
# We define a function to perform the benchmark on the datasets.
# The procedure is as follows:
# 1. Create an instance of `RadiusClustering` for each solver.
# 2. Fit each instance on each dataset.
# 3. Store the execution time and the number of clusters found for each dataset.
# 4. Return the results as a dictionary.


def benchmark_radius_clustering():
    results = {}
    # The radius passed here is a placeholder; it is overwritten per dataset
    # inside the loop below.
    exact = RadiusClustering(manner="exact", radius=1.43)
    approx = RadiusClustering(manner="approx", radius=1.43)
    custom = RadiusClustering(manner="custom", radius=1.43)
    custom.set_solver(custom_solver)  # Set the custom solver
    algorithms = [exact, approx, custom]
    # Loop through each algorithm and dataset
    for algo in algorithms:
        algo_results = {}
        time_algo = []
        clusters_algo = []
        # Loop through each dataset
        for name, (dataset, radius) in DATASETS.items():
            X = dataset.data
            # Set the radius for the dataset considered
            algo.radius = radius
            # Fit the algorithm and time the whole fit
            t0 = time.time()
            algo.fit(X)
            t_algo = time.time() - t0

            # Store the results
            time_algo.append(t_algo)
            clusters_algo.append(len(algo.centers_))
        algo_results["time"] = time_algo
        algo_results["clusters"] = clusters_algo
        results[algo.manner] = algo_results

    return results


# %%
# Run the benchmark and plot the results
# --------------------------------------
# We run the benchmark and plot the results for each dataset.


results = benchmark_radius_clustering()

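# Optional: print a short textual summary before plotting. This sketch relies
# on the dict layout produced by `benchmark_radius_clustering` above.
for solver, res in results.items():
    for name, t, k in zip(DATASETS, res["time"], res["clusters"]):
        print(f"{solver:>7} | {name:>13} | {t:8.3f} s | {k:3d} clusters")
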
# Plot the results
fig, axs = plt.subplot_mosaic(
    [
        ["time", "time", "time", "time"],
        ["iris", "wine", "breast_cancer", "vehicle"],
        ["glass", "ionosphere", "synthetic", "yeast"],
    ],
    layout="constrained",
    figsize=(12, 8),
)
fig.suptitle("Benchmark of Radius Clustering Solvers", fontsize=16)

# Plot execution times on a logarithmic scale for better visibility
axs["time"].set_yscale("log")
for algo, algo_results in results.items():
    axs["time"].plot(
        DATASETS.keys(),
        algo_results["time"],
        marker="o",
        label=algo,
    )

# Plot the number of clusters found, one bar chart per dataset
for i, (name, (dataset, _)) in enumerate(DATASETS.items()):
    axs[name].bar(
        results.keys(),
        [results[algo]["clusters"][i] for algo in results.keys()],
        label=name,
    )
    axs[name].axhline(
        y=len(set(dataset.target)),  # Number of unique classes in the dataset
        label="True number of clusters",
        color="r",
        linestyle="--",
    )
    axs[name].set_title(name)
    axs[name].set_xlabel("Algorithms")

axs["iris"].set_ylabel("Number of clusters")
axs["glass"].set_ylabel("Number of clusters")

axs["time"].set_title("Execution Time (log scale)")
axs["time"].set_xlabel("Datasets")
axs["time"].set_ylabel("Time (seconds)")
axs["time"].legend(title="Algorithms")
# Note: constrained layout already handles spacing, so tight_layout is not needed.
plt.show()


# %%
# Conclusion
# ----------
#
# In this example, we implemented a custom MDS solver based on `NetworkX`'s
# greedy dominating-set heuristic and plugged it into the Radius Clustering
# framework with `set_solver`. We then benchmarked it against the built-in
# exact and approximate solvers on eight datasets, comparing execution time
# and the number of clusters found, and visualized both alongside the true
# number of classes of each dataset. This workflow shows how to implement a
# custom solver and evaluate it against the ones shipped with the package.