9
9
"""
10
10
11
11
import os
12
+ import warnings
13
+
12
14
import numpy as np
13
- from sklearn .metrics import pairwise_distances
14
15
from sklearn .base import BaseEstimator , ClusterMixin
15
- from sklearn .utils .validation import check_array , validate_data , check_random_state
16
+ from sklearn .metrics import pairwise_distances
17
+ from sklearn .utils .validation import check_random_state , validate_data
16
18
17
19
from radius_clustering .utils ._emos import py_emos_main
18
20
from radius_clustering .utils ._mds_approx import solve_mds
21
23
22
24
23
25
class RadiusClustering (ClusterMixin , BaseEstimator ):
24
- """
26
+ r """
25
27
Radius Clustering algorithm.
26
28
27
29
This class implements clustering based on the Minimum Dominating Set (MDS) problem.
@@ -46,29 +48,52 @@ class RadiusClustering(ClusterMixin, BaseEstimator):
46
48
The maximum distance between any point and its assigned cluster center.
47
49
random_state\_ : int | None
48
50
The random state used for reproducibility. If None, no random state is set.
49
-
51
+
50
52
.. note::
51
53
The `random_state_` attribute is not used when the `manner` is set to "exact".
52
-
54
+
53
55
.. versionadded:: 1.3.0
54
- The *random_state* parameter was added to allow reproducibility in the approximate method.
56
+ The *random_state* parameter was added to allow reproducibility in
57
+ the approximate method.
55
58
56
59
.. versionchanged:: 1.3.0
57
- All publicly accessible attributes are now suffixed with an underscore (e.g., `centers_`, `labels_`).
60
+ All publicly accessible attributes are now suffixed with an underscore
61
+ (e.g., `centers_`, `labels_`).
58
62
This is particularly useful for compatibility with scikit-learn's API.
59
-
60
- .. versionchanged:: 1.3.0
61
- The `threshold` parameter was renamed to `radius` to better reflect its purpose.
63
+
64
+ .. versionadded:: 1.3.0
65
+ The `radius` parameter replaces the `threshold` parameter for setting
66
+ the dissimilarity threshold for better clarity and consistency.
67
+
68
+ .. deprecated:: 1.3.0
69
+ The `threshold` parameter is deprecated. Use `radius` instead.
70
+ Will be removed in a future version.
62
71
"""
63
72
64
73
_estimator_type = "clusterer"
65
74
66
- def __init__ (self , manner : str = "approx" , radius : float = 0.5 , random_state : int | None = None ) -> None :
75
+ def __init__ (
76
+ self ,
77
+ manner : str = "approx" ,
78
+ radius : float = 0.5 ,
79
+ threshold = None ,
80
+ random_state : int | None = None ,
81
+ ) -> None :
82
+ if threshold is not None :
83
+ warnings .warn (
84
+ "The 'threshold' parameter is deprecated and"
85
+ " will be removed in a future version."
86
+ "Please use 'radius' instead." ,
87
+ DeprecationWarning ,
88
+ stacklevel = 2 ,
89
+ )
90
+ radius = threshold
91
+ self .threshold = threshold # For backward compatibility
67
92
self .manner = manner
68
93
self .radius = radius
69
94
self .random_state = random_state
70
95
71
- def _check_symmetric (self , a : np .ndarray , tol : float = 1e-8 ) -> bool :
96
+ def _check_symmetric (self , a : np .ndarray , tol : float = 1e-8 ) -> bool :
72
97
if a .ndim != 2 :
73
98
raise ValueError ("Input must be a 2D array." )
74
99
if a .shape [0 ] != a .shape [1 ]:
@@ -80,21 +105,26 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering":
80
105
Fit the MDS clustering model to the input data.
81
106
82
107
This method computes the distance matrix if the input is a feature matrix,
83
- or uses the provided distance matrix directly if the input is already a distance matrix.
108
+ or uses the provided distance matrix directly if the input is already
109
+ a distance matrix.
84
110
85
111
.. note::
86
112
If the input is a distance matrix, it should be symmetric and square.
87
- If the input is a feature matrix, the distance matrix will be computed using Euclidean distance.
88
-
113
+ If the input is a feature matrix, the distance matrix
114
+ will be computed using Euclidean distance.
115
+
89
116
.. tip::
90
- Next version will support providing different metrics or even custom callables to compute the distance matrix.
117
+ Next version will support providing different metrics or
118
+ even custom callables to compute the distance matrix.
91
119
92
120
Parameters:
93
121
-----------
94
122
X : array-like, shape (n_samples, n_features)
95
- The input data to cluster. X should be a 2D array-like structure. It can either be :
123
+ The input data to cluster. X should be a 2D array-like structure.
124
+ It can be either:
96
125
- A distance matrix (symmetric, square) with shape (n_samples, n_samples).
97
- - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed.
126
+ - A feature matrix with shape (n_samples, n_features)
127
+ where the distance matrix will be computed.
98
128
y : Ignored
99
129
Not used, present here for API consistency by convention.
100
130
@@ -128,7 +158,7 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering":
128
158
dist_mat = pairwise_distances (self .X_checked_ , metric = "euclidean" )
129
159
else :
130
160
dist_mat = self .X_checked_
131
-
161
+
132
162
if not isinstance (self .radius , (float , int )):
133
163
raise ValueError ("Radius must be a positive float." )
134
164
if self .radius <= 0 :
@@ -141,7 +171,9 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering":
141
171
self .effective_radius_ = 0
142
172
self .mds_exec_time_ = 0
143
173
return self
144
- self .edges_ = np .argwhere (adj_mask ).astype (np .uint32 ) # Edges in the adjacency matrix
174
+ self .edges_ = np .argwhere (adj_mask ).astype (
175
+ np .uint32
176
+ ) # Edges in the adjacency matrix
145
177
# uint32 is used to save memory. Edge entries are indices into the adjacency matrix, so the max number of samples is 2^32-1
146
178
self .dist_mat_ = dist_mat
147
179
@@ -160,9 +192,11 @@ def fit_predict(self, X: np.ndarray, y: None = None) -> np.ndarray:
160
192
Parameters:
161
193
-----------
162
194
X : array-like, shape (n_samples, n_features)
163
- The input data to cluster. X should be a 2D array-like structure. It can either be :
195
+ The input data to cluster. X should be a 2D array-like structure.
196
+ It can be either:
164
197
- A distance matrix (symmetric, square) with shape (n_samples, n_samples).
165
- - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed.
198
+ - A feature matrix with shape (n_samples, n_features) where
199
+ the distance matrix will be computed.
166
200
y : Ignored
167
201
Not used, present here for API consistency by convention.
168
202
@@ -181,9 +215,7 @@ def _clustering(self):
181
215
n = self .X_checked_ .shape [0 ]
182
216
if self .manner != "exact" and self .manner != "approx" :
183
217
print (f"Invalid manner: { self .manner } . Defaulting to 'approx'." )
184
- raise ValueError (
185
- "Invalid manner. Choose either 'exact' or 'approx'."
186
- )
218
+ raise ValueError ("Invalid manner. Choose either 'exact' or 'approx'." )
187
219
if self .manner == "exact" :
188
220
self ._clustering_exact (n )
189
221
else :
@@ -210,20 +242,27 @@ def _clustering_exact(self, n: int) -> None:
210
242
211
243
def _clustering_approx (self , n : int ) -> None :
212
244
"""
213
- Perform approximate MDS clustering. This method uses a pretty trick to set the seed for the random state of the C++ code of the MDS solver.
245
+ Perform approximate MDS clustering.
246
+ This method uses a pretty trick to set the seed for
247
+ the random state of the C++ code of the MDS solver.
214
248
215
249
.. tip::
216
- The random state is used to ensure reproducibility of the results when using the approximate method.
250
+ The random state is used to ensure reproducibility of the results
251
+ when using the approximate method.
217
252
If `random_state` is None, a default value of 42 is used.
218
-
253
+
219
254
.. important::
220
255
:collapsible: closed
221
256
The trick to set the random state is:
222
- 1. Use the `check_random_state` function to get a `RandomState`singleton instance, set up with the provided `random_state`.
223
- 2. Use the `randint` method of the `RandomState` instance to generate a random integer.
257
+ 1. Use the `check_random_state` function to get a `RandomState` singleton
258
+ instance, set up with the provided `random_state`.
259
+ 2. Use the `randint` method of the `RandomState` instance to generate a
260
+ random integer.
224
261
3. Use this random integer as the seed for the C++ code of the MDS solver.
225
262
226
- This ensures that the seed passed to the C++ code is always an integer, which is required by the MDS solver, and allows for reproducibility of the results.
263
+ This ensures that the seed passed to the C++ code is always an integer,
264
+ which is required by the MDS solver, and allows for
265
+ reproducibility of the results.
227
266
228
267
Parameters:
229
268
-----------
@@ -239,7 +278,9 @@ def _clustering_approx(self, n: int) -> None:
239
278
self .random_state = 42
240
279
self .random_state_ = check_random_state (self .random_state )
241
280
seed = self .random_state_ .randint (np .iinfo (np .int32 ).max )
242
- result = solve_mds (n , self .edges_ .flatten ().astype (np .int32 ), self .nb_edges_ , seed )
281
+ result = solve_mds (
282
+ n , self .edges_ .flatten ().astype (np .int32 ), self .nb_edges_ , seed
283
+ )
243
284
self .centers_ = sorted ([x for x in result ["solution_set" ]])
244
285
self .mds_exec_time_ = result ["Time" ]
245
286
0 commit comments