12
12
import numpy as np
13
13
from sklearn .metrics import pairwise_distances
14
14
from sklearn .base import BaseEstimator , ClusterMixin
15
- from sklearn .utils .validation import check_array
15
+ from sklearn .utils .validation import check_array , validate_data , check_random_state
16
16
17
17
from radius_clustering .utils ._emos import py_emos_main
18
18
from radius_clustering .utils ._mds_approx import solve_mds
19
19
20
20
DIR_PATH = os .path .dirname (os .path .realpath (__file__ ))
21
21
22
22
23
- class RadiusClustering (BaseEstimator , ClusterMixin ):
23
+ class RadiusClustering (ClusterMixin , BaseEstimator ):
24
24
"""
25
25
Radius Clustering algorithm.
26
26
@@ -42,29 +42,56 @@ class RadiusClustering(BaseEstimator, ClusterMixin):
42
42
The indices of the cluster centers.
43
43
labels\_ : array-like, shape (n_samples,)
44
44
The cluster labels for each point in the input data.
45
- effective_radius : float
45
+ effective_radius\_ : float
46
46
The maximum distance between any point and its assigned cluster center.
47
+ random_state\_ : int | None
48
+ The random state used for reproducibility. If None, no random state is set.
49
+
50
+ .. note::
51
+ The `random_state_` attribute is not used when the `manner` is set to "exact".
52
+
53
+ .. versionadded:: 1.3.0
54
+ The *random_state* parameter was added to allow reproducibility in the approximate method.
55
+
56
+ .. versionchanged:: 1.3.0
57
+ All publicly accessible attributes are now suffixed with an underscore (e.g., `centers_`, `labels_`).
58
+ This is particularly useful for compatibility with scikit-learn's API.
47
59
"""
48
60
49
    # Tag consumed by scikit-learn tooling to identify this estimator as a clusterer.
    _estimator_type = "clusterer"

    def __init__(self, manner: str = "approx", threshold: float = 0.5, random_state: int | None = None) -> None:
        """Initialize the Radius Clustering estimator.

        Parameters
        ----------
        manner : str, default="approx"
            Resolution method for the MDS problem: ``"exact"`` uses the EMOS
            solver, anything else falls through to the approximate solver
            (see ``_clustering``).
        threshold : float, default=0.5
            Radius threshold: two points are considered connected when their
            pairwise distance is less than or equal to this value.
        random_state : int or None, default=None
            Seed used by the approximate solver for reproducibility.
            Ignored when ``manner == "exact"``.

        Notes
        -----
        Per scikit-learn convention, ``__init__`` only stores the parameters
        verbatim; validation and computation happen in :meth:`fit`.
        """
        self.manner = manner
        self.threshold = threshold
        self.random_state = random_state
- def _check_symmetric (self , a , tol = 1e-8 ):
68
+ def _check_symmetric (self , a : np . ndarray , tol : float = 1e-8 ) -> bool :
54
69
if a .ndim != 2 :
55
70
raise ValueError ("Input must be a 2D array." )
56
71
if a .shape [0 ] != a .shape [1 ]:
57
72
return False
58
73
return np .allclose (a , a .T , atol = tol )
59
74
60
- def fit (self , X , y = None ):
75
+ def fit (self , X : np . ndarray , y : None = None ) -> "RadiusClustering" :
61
76
"""
62
77
Fit the MDS clustering model to the input data.
63
78
79
+ This method computes the distance matrix if the input is a feature matrix,
80
+ or uses the provided distance matrix directly if the input is already a distance matrix.
81
+
82
+ .. note::
83
+ If the input is a distance matrix, it should be symmetric and square.
84
+ If the input is a feature matrix, the distance matrix will be computed using Euclidean distance.
85
+
86
+ .. tip::
87
+ Next version will support providing different metrics or even custom callables to compute the distance matrix.
88
+
64
89
Parameters:
65
90
-----------
66
91
X : array-like, shape (n_samples, n_features)
67
- The input data to cluster.
92
+ The input data to cluster. X should be a 2D array-like structure. It can either be :
93
+ - A distance matrix (symmetric, square) with shape (n_samples, n_samples).
94
+ - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed.
68
95
y : Ignored
69
96
Not used, present here for API consistency by convention.
70
97
@@ -91,38 +118,43 @@ def fit(self, X, y=None):
91
118
For examples on common datasets and differences with kmeans,
92
119
see :ref:`sphx_glr_auto_examples_plot_iris_example.py`
93
120
"""
94
- self .X = check_array ( X )
121
+ self .X_checked_ = validate_data ( self , X )
95
122
96
123
# Create dist and adj matrices
97
- if not self ._check_symmetric (self .X ):
98
- dist_mat = pairwise_distances (self .X , metric = "euclidean" )
124
+ if not self ._check_symmetric (self .X_checked_ ):
125
+ dist_mat = pairwise_distances (self .X_checked_ , metric = "euclidean" )
99
126
else :
100
- dist_mat = self .X
127
+ dist_mat = self .X_checked_
101
128
adj_mask = np .triu ((dist_mat <= self .threshold ), k = 1 )
102
- self .nb_edges = np .sum (adj_mask )
103
- if self .nb_edges == 0 :
104
- self .centers_ = list (range (self .X .shape [0 ]))
105
- self .labels_ = self .centers_
106
- self .effective_radius = 0
107
- self ._mds_exec_time = 0
129
+ self .nb_edges_ = np .sum (adj_mask )
130
+ if self .nb_edges_ == 0 :
131
+ self .centers_ = list (range (self .X_checked_ .shape [0 ]))
132
+ self .labels_ = np . array ( self .centers_ )
133
+ self .effective_radius_ = 0
134
+ self .mds_exec_time_ = 0
108
135
return self
109
- self .edges = np .argwhere (adj_mask ).astype (np .uint32 ) #TODO: changer en uint32
110
- self .dist_mat = dist_mat
136
+ self .edges_ = np .argwhere (adj_mask ).astype (np .uint32 ) # Edges in the adjacency matrix
137
+ # uint32 is used to use less memory. Max number of features is 2^32-1
138
+ self .dist_mat_ = dist_mat
111
139
112
140
self ._clustering ()
113
141
self ._compute_effective_radius ()
114
142
self ._compute_labels ()
115
143
116
144
return self
117
145
118
- def fit_predict (self , X , y = None ):
146
+ def fit_predict (self , X : np . ndarray , y : None = None ) -> np . ndarray :
119
147
"""
120
148
Fit the model and return the cluster labels.
121
149
150
+ This method is a convenience function that combines `fit` and `predict`.
151
+
122
152
Parameters:
123
153
-----------
124
154
X : array-like, shape (n_samples, n_features)
125
- The input data to cluster.
155
+ The input data to cluster. X should be a 2D array-like structure. It can either be :
156
+ - A distance matrix (symmetric, square) with shape (n_samples, n_samples).
157
+ - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed.
126
158
y : Ignored
127
159
Not used, present here for API consistency by convention.
128
160
@@ -138,13 +170,13 @@ def _clustering(self):
138
170
"""
139
171
Perform the clustering using either the exact or approximate MDS method.
140
172
"""
141
- n = self .X .shape [0 ]
173
+ n = self .X_checked_ .shape [0 ]
142
174
if self .manner == "exact" :
143
175
self ._clustering_exact (n )
144
176
else :
145
177
self ._clustering_approx (n )
146
178
147
- def _clustering_exact (self , n ) :
179
+ def _clustering_exact (self , n : int ) -> None :
148
180
"""
149
181
Perform exact MDS clustering.
150
182
@@ -158,13 +190,26 @@ def _clustering_exact(self, n):
158
190
This function uses the EMOS algorithm to solve the MDS problem.
159
191
See: [jiang]_ for more details.
160
192
"""
161
- self .centers_ , self ._mds_exec_time = py_emos_main (
162
- self .edges .flatten (), n , self .nb_edges
193
+ self .centers_ , self .mds_exec_time_ = py_emos_main (
194
+ self .edges_ .flatten (), n , self .nb_edges_
163
195
)
164
196
165
- def _clustering_approx (self , n ) :
197
+ def _clustering_approx (self , n : int ) -> None :
166
198
"""
167
- Perform approximate MDS clustering.
199
+ Perform approximate MDS clustering. This method uses a pretty trick to set the seed for the random state of the C++ code of the MDS solver.
200
+
201
+ .. tip::
202
+ The random state is used to ensure reproducibility of the results when using the approximate method.
203
+ If `random_state` is None, a default value of 42 is used.
204
+
205
+ .. important::
206
+ :collapsible: closed
207
+ The trick to set the random state is :
208
+ 1. Use the `check_random_state` function to get a `RandomState`singleton instance, set up with the provided `random_state`.
209
+ 2. Use the `randint` method of the `RandomState` instance to generate a random integer.
210
+ 3. Use this random integer as the seed for the C++ code of the MDS solver.
211
+
212
+ This ensures that the seed passed to the C++ code is always an integer, which is required by the MDS solver, and allows for reproducibility of the results.
168
213
169
214
Parameters:
170
215
-----------
@@ -176,9 +221,13 @@ def _clustering_approx(self, n):
176
221
This function uses the approximation method to solve the MDS problem.
177
222
See [casado]_ for more details.
178
223
"""
179
- result = solve_mds (n , self .edges .flatten ().astype (np .int32 ), self .nb_edges , "test" )
224
+ if self .random_state is None :
225
+ self .random_state = 42
226
+ self .random_state_ = check_random_state (self .random_state )
227
+ seed = self .random_state_ .randint (np .iinfo (np .int32 ).max )
228
+ result = solve_mds (n , self .edges_ .flatten ().astype (np .int32 ), self .nb_edges_ , seed )
180
229
self .centers_ = [x for x in result ["solution_set" ]]
181
- self ._mds_exec_time = result ["Time" ]
230
+ self .mds_exec_time_ = result ["Time" ]
182
231
183
232
def _compute_effective_radius (self ):
184
233
"""
@@ -187,13 +236,13 @@ def _compute_effective_radius(self):
187
236
The effective radius is the maximum radius among all clusters.
188
237
That means EffRad = max(R(C_i)) for all i.
189
238
"""
190
- self .effective_radius = np .min (self .dist_mat [:, self .centers_ ], axis = 1 ).max ()
239
+ self .effective_radius_ = np .min (self .dist_mat_ [:, self .centers_ ], axis = 1 ).max ()
191
240
192
241
def _compute_labels (self ):
193
242
"""
194
243
Compute the cluster labels for each point in the dataset.
195
244
"""
196
- distances = self .dist_mat [:, self .centers_ ]
245
+ distances = self .dist_mat_ [:, self .centers_ ]
197
246
self .labels_ = np .argmin (distances , axis = 1 )
198
247
199
248
min_dist = np .min (distances , axis = 1 )
0 commit comments