20
20
from sklearn .utils .validation import check_is_fitted
21
21
from sklearn .exceptions import ConvergenceWarning
22
22
23
+ # cython implementation of swap step in PAM algorithm.
24
+ from ._k_medoids_helper import _compute_optimal_swap , _build
25
+
23
26
24
27
class KMedoids (BaseEstimator , ClusterMixin , TransformerMixin ):
25
28
"""k-medoids clustering.
@@ -35,17 +38,27 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
35
38
metric : string, or callable, optional, default: 'euclidean'
36
39
What distance metric to use. See :func:metrics.pairwise_distances
37
40
38
- init : {'random', 'heuristic', 'k-medoids++'}, optional, default: 'heuristic'
41
+ method : {'alternate', 'pam'}, default: 'alternate'
42
+ Which algorithm to use.
43
+
44
+ init : {'random', 'heuristic', 'k-medoids++', 'build'}, optional, default: 'build'
39
45
Specify medoid initialization method. 'random' selects n_clusters
40
46
elements from the dataset. 'heuristic' picks the n_clusters points
41
47
with the smallest sum distance to every other point. 'k-medoids++'
42
48
follows an approach based on k-means++_, and in general, gives initial
43
49
medoids which are more separated than those generated by the other methods.
50
+ 'build' is a greedy initialization of the medoids used in the original PAM
51
+ algorithm. Often 'build' is more efficient but slower than other
52
+ initializations on big datasets and it is also very non-robust,
53
+ if there are outliers in the dataset, use another initialization.
44
54
45
55
.. _k-means++: https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf
46
56
47
57
max_iter : int, optional, default : 300
48
- Specify the maximum number of iterations when fitting.
58
+ Specify the maximum number of iterations when fitting. It can be zero in
59
+ which case only the initialization is computed which may be suitable for
60
+ large datasets when the initialization is sufficiently efficient
61
+ (i.e. for 'build' init).
49
62
50
63
random_state : int, RandomState instance or None, optional
51
64
Specify random state for the random number generator. Used to
@@ -112,24 +125,25 @@ def __init__(
112
125
self ,
113
126
n_clusters = 8 ,
114
127
metric = "euclidean" ,
128
+ method = "alternate" ,
115
129
init = "heuristic" ,
116
130
max_iter = 300 ,
117
131
random_state = None ,
118
132
):
119
133
self .n_clusters = n_clusters
120
134
self .metric = metric
135
+ self .method = method
121
136
self .init = init
122
137
self .max_iter = max_iter
123
138
self .random_state = random_state
124
139
125
- def _check_nonnegative_int (self , value , desc ):
140
+ def _check_nonnegative_int (self , value , desc , strict = True ):
126
141
"""Validates if value is a valid integer > 0"""
127
-
128
- if (
129
- value is None
130
- or value <= 0
131
- or not isinstance (value , (int , np .integer ))
132
- ):
142
+ if strict :
143
+ negative = (value is None ) or (value <= 0 )
144
+ else :
145
+ negative = (value is None ) or (value < 0 )
146
+ if negative or not isinstance (value , (int , np .integer )):
133
147
raise ValueError (
134
148
"%s should be a nonnegative integer. "
135
149
"%s was given" % (desc , value )
@@ -140,10 +154,10 @@ def _check_init_args(self):
140
154
141
155
# Check n_clusters and max_iter
142
156
self ._check_nonnegative_int (self .n_clusters , "n_clusters" )
143
- self ._check_nonnegative_int (self .max_iter , "max_iter" )
157
+ self ._check_nonnegative_int (self .max_iter , "max_iter" , False )
144
158
145
159
# Check init
146
- init_methods = ["random" , "heuristic" , "k-medoids++" ]
160
+ init_methods = ["random" , "heuristic" , "k-medoids++" , "build" ]
147
161
if self .init not in init_methods :
148
162
raise ValueError (
149
163
"init needs to be one of "
@@ -183,15 +197,44 @@ def fit(self, X, y=None):
183
197
)
184
198
labels = None
185
199
200
+ if self .method == "pam" :
201
+ # Compute the distance to the first and second closest points
202
+ # among medoids.
203
+ Djs , Ejs = np .sort (D [medoid_idxs ], axis = 0 )[[0 , 1 ]]
204
+
186
205
# Continue the algorithm as long as
187
206
# the medoids keep changing and the maximum number
188
207
# of iterations is not exceeded
208
+
189
209
for self .n_iter_ in range (0 , self .max_iter ):
190
210
old_medoid_idxs = np .copy (medoid_idxs )
191
211
labels = np .argmin (D [medoid_idxs , :], axis = 0 )
192
212
193
- # Update medoids with the new cluster indices
194
- self ._update_medoid_idxs_in_place (D , labels , medoid_idxs )
213
+ if self .method == "alternate" :
214
+ # Update medoids with the new cluster indices
215
+ self ._update_medoid_idxs_in_place (D , labels , medoid_idxs )
216
+ elif self .method == "pam" :
217
+ not_medoid_idxs = np .delete (np .arange (len (D )), medoid_idxs )
218
+ optimal_swap = _compute_optimal_swap (
219
+ D ,
220
+ medoid_idxs .astype (np .intc ),
221
+ not_medoid_idxs .astype (np .intc ),
222
+ Djs ,
223
+ Ejs ,
224
+ self .n_clusters ,
225
+ )
226
+ if optimal_swap is not None :
227
+ i , j , _ = optimal_swap
228
+ medoid_idxs [medoid_idxs == i ] = j
229
+
230
+ # update Djs and Ejs with new medoids
231
+ Djs , Ejs = np .sort (D [medoid_idxs ], axis = 0 )[[0 , 1 ]]
232
+ else :
233
+ raise ValueError (
234
+ f"method={ self .method } is not supported. Supported methods "
235
+ f"are 'pam' and 'alternate'."
236
+ )
237
+
195
238
if np .all (old_medoid_idxs == medoid_idxs ):
196
239
break
197
240
elif self .n_iter_ == self .max_iter - 1 :
@@ -210,7 +253,7 @@ def fit(self, X, y=None):
210
253
211
254
# Expose labels_ which are the assignments of
212
255
# the training data to clusters
213
- self .labels_ = labels
256
+ self .labels_ = np . argmin ( D [ medoid_idxs , :], axis = 0 )
214
257
self .medoid_indices_ = medoid_idxs
215
258
self .inertia_ = self ._compute_inertia (self .transform (X ))
216
259
@@ -252,6 +295,10 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs):
252
295
if min_cost < curr_cost :
253
296
medoid_idxs [k ] = cluster_k_idxs [min_cost_idx ]
254
297
298
+ def _compute_cost (self , D , medoid_idxs ):
299
+ """ Compute the cose for a given configuration of the medoids"""
300
+ return self ._compute_inertia (D [:, medoid_idxs ])
301
+
255
302
def transform (self , X ):
256
303
"""Transforms X to cluster-distance space.
257
304
@@ -339,6 +386,8 @@ def _initialize_medoids(self, D, n_clusters, random_state_):
339
386
medoids = np .argpartition (np .sum (D , axis = 1 ), n_clusters - 1 )[
340
387
:n_clusters
341
388
]
389
+ elif self .init == "build" : # Build initialization
390
+ medoids = _build (D , n_clusters ).astype (np .int64 )
342
391
else :
343
392
raise ValueError (f"init value '{ self .init } ' not recognized" )
344
393
0 commit comments