Commit d099f66

Add PAM algorithm to K-Medoids (#73)
1 parent 0a2615c commit d099f66

4 files changed (+216 / -16 lines)

setup.py

Lines changed: 5 additions & 0 deletions
@@ -66,6 +66,11 @@
             ["sklearn_extra/utils/_cyfht.pyx"],
             include_dirs=[np.get_include()],
         ),
+        Extension(
+            "sklearn_extra.cluster._k_medoids_helper",
+            ["sklearn_extra/cluster/_k_medoids_helper.pyx"],
+            include_dirs=[np.get_include()],
+        ),
         Extension(
             "sklearn_extra.robust._robust_weighted_estimator_helper",
             ["sklearn_extra/robust/_robust_weighted_estimator_helper.pyx"],

sklearn_extra/cluster/_k_medoids.py

Lines changed: 63 additions & 14 deletions
@@ -20,6 +20,9 @@
 from sklearn.utils.validation import check_is_fitted
 from sklearn.exceptions import ConvergenceWarning
 
+# cython implementation of swap step in PAM algorithm.
+from ._k_medoids_helper import _compute_optimal_swap, _build
+
 
 class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
     """k-medoids clustering.
@@ -35,17 +38,27 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
     metric : string, or callable, optional, default: 'euclidean'
         What distance metric to use. See :func:metrics.pairwise_distances
 
-    init : {'random', 'heuristic', 'k-medoids++'}, optional, default: 'heuristic'
+    method : {'alternate', 'pam'}, default: 'alternate'
+        Which algorithm to use.
+
+    init : {'random', 'heuristic', 'k-medoids++', 'build'}, optional, default: 'build'
         Specify medoid initialization method. 'random' selects n_clusters
         elements from the dataset. 'heuristic' picks the n_clusters points
         with the smallest sum distance to every other point. 'k-medoids++'
         follows an approach based on k-means++_, and in general, gives initial
         medoids which are more separated than those generated by the other methods.
+        'build' is a greedy initialization of the medoids used in the original PAM
+        algorithm. 'build' often gives a better initialization but is slower than
+        the other methods on big datasets; it is also not robust to outliers, so
+        if there are outliers in the dataset, use another initialization.
 
     .. _k-means++: https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf
 
     max_iter : int, optional, default : 300
-        Specify the maximum number of iterations when fitting.
+        Specify the maximum number of iterations when fitting. It can be zero, in
+        which case only the initialization is computed, which may be suitable for
+        large datasets when the initialization is sufficiently efficient
+        (i.e. for the 'build' init).
 
     random_state : int, RandomState instance or None, optional
         Specify random state for the random number generator. Used to
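As a quick illustration of the two options documented above, a minimal usage sketch (toy data and parameter values are illustrative, not part of the commit):

import numpy as np
from sklearn_extra.cluster import KMedoids

# Illustrative toy data; any (n_samples, n_features) array works.
X = np.random.RandomState(0).rand(30, 2)

# PAM swap step combined with the greedy BUILD initialization.
km = KMedoids(n_clusters=3, method="pam", init="build", random_state=0).fit(X)
print(km.medoid_indices_, km.inertia_)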
@@ -112,24 +125,25 @@ def __init__(
         self,
         n_clusters=8,
         metric="euclidean",
+        method="alternate",
         init="heuristic",
         max_iter=300,
         random_state=None,
     ):
         self.n_clusters = n_clusters
         self.metric = metric
+        self.method = method
         self.init = init
         self.max_iter = max_iter
         self.random_state = random_state
 
-    def _check_nonnegative_int(self, value, desc):
+    def _check_nonnegative_int(self, value, desc, strict=True):
         """Validates if value is a valid integer > 0"""
-
-        if (
-            value is None
-            or value <= 0
-            or not isinstance(value, (int, np.integer))
-        ):
+        if strict:
+            negative = (value is None) or (value <= 0)
+        else:
+            negative = (value is None) or (value < 0)
+        if negative or not isinstance(value, (int, np.integer)):
             raise ValueError(
                 "%s should be a nonnegative integer. "
                 "%s was given" % (desc, value)
@@ -140,10 +154,10 @@ def _check_init_args(self):
 
         # Check n_clusters and max_iter
         self._check_nonnegative_int(self.n_clusters, "n_clusters")
-        self._check_nonnegative_int(self.max_iter, "max_iter")
+        self._check_nonnegative_int(self.max_iter, "max_iter", False)
 
         # Check init
-        init_methods = ["random", "heuristic", "k-medoids++"]
+        init_methods = ["random", "heuristic", "k-medoids++", "build"]
         if self.init not in init_methods:
             raise ValueError(
                 "init needs to be one of "
@@ -183,15 +197,44 @@ def fit(self, X, y=None):
             )
         labels = None
 
+        if self.method == "pam":
+            # Compute the distance to the first and second closest points
+            # among medoids.
+            Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]]
+
         # Continue the algorithm as long as
         # the medoids keep changing and the maximum number
         # of iterations is not exceeded
+
         for self.n_iter_ in range(0, self.max_iter):
             old_medoid_idxs = np.copy(medoid_idxs)
             labels = np.argmin(D[medoid_idxs, :], axis=0)
 
-            # Update medoids with the new cluster indices
-            self._update_medoid_idxs_in_place(D, labels, medoid_idxs)
+            if self.method == "alternate":
+                # Update medoids with the new cluster indices
+                self._update_medoid_idxs_in_place(D, labels, medoid_idxs)
+            elif self.method == "pam":
+                not_medoid_idxs = np.delete(np.arange(len(D)), medoid_idxs)
+                optimal_swap = _compute_optimal_swap(
+                    D,
+                    medoid_idxs.astype(np.intc),
+                    not_medoid_idxs.astype(np.intc),
+                    Djs,
+                    Ejs,
+                    self.n_clusters,
+                )
+                if optimal_swap is not None:
+                    i, j, _ = optimal_swap
+                    medoid_idxs[medoid_idxs == i] = j
+
+                    # update Djs and Ejs with new medoids
+                    Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]]
+            else:
+                raise ValueError(
+                    f"method={self.method} is not supported. Supported methods "
+                    f"are 'pam' and 'alternate'."
+                )
+
             if np.all(old_medoid_idxs == medoid_idxs):
                 break
             elif self.n_iter_ == self.max_iter - 1:
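For reference, the swap selection performed by _compute_optimal_swap can be written as a brute-force NumPy sketch; it returns the same kind of (old medoid, new medoid, cost change) tuple, up to tie-breaking, but recomputes the full cost instead of updating it incrementally (function name and code are illustrative, not part of the commit):

import numpy as np

def best_swap_bruteforce(D, medoid_idxs):
    # Try every (medoid, non-medoid) exchange and return the one that lowers
    # the total cost the most, or None if no exchange improves it.
    current_cost = D[medoid_idxs].min(axis=0).sum()
    not_medoids = np.setdiff1d(np.arange(len(D)), medoid_idxs)
    best = None  # (medoid to remove, point to promote, cost change)
    for i in medoid_idxs:
        for h in not_medoids:
            candidate = np.where(medoid_idxs == i, h, medoid_idxs)
            change = D[candidate].min(axis=0).sum() - current_cost
            if best is None or change < best[2]:
                best = (i, h, change)
    return best if best is not None and best[2] < 0 else None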
@@ -210,7 +253,7 @@ def fit(self, X, y=None):
 
         # Expose labels_ which are the assignments of
         # the training data to clusters
-        self.labels_ = labels
+        self.labels_ = np.argmin(D[medoid_idxs, :], axis=0)
         self.medoid_indices_ = medoid_idxs
         self.inertia_ = self._compute_inertia(self.transform(X))
 
@@ -252,6 +295,10 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs):
             if min_cost < curr_cost:
                 medoid_idxs[k] = cluster_k_idxs[min_cost_idx]
 
+    def _compute_cost(self, D, medoid_idxs):
+        """Compute the cost for a given configuration of the medoids"""
+        return self._compute_inertia(D[:, medoid_idxs])
+
     def transform(self, X):
         """Transforms X to cluster-distance space.
 
@@ -339,6 +386,8 @@ def _initialize_medoids(self, D, n_clusters, random_state_):
             medoids = np.argpartition(np.sum(D, axis=1), n_clusters - 1)[
                 :n_clusters
             ]
+        elif self.init == "build":  # Build initialization
+            medoids = _build(D, n_clusters).astype(np.int64)
         else:
             raise ValueError(f"init value '{self.init}' not recognized")
 
sklearn_extra/cluster/_k_medoids_helper.pyx

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+# cython: infer_types=True
+# Fast swap step and build step in PAM algorithm for k_medoid.
+# Author: Timothée Mathieu
+# License: 3-clause BSD
+
+cimport cython
+
+import numpy as np
+cimport numpy as np
+from cython cimport floating, integral
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+def _compute_optimal_swap(floating[:, :] D,
+                          int[:] medoid_idxs,
+                          int[:] not_medoid_idxs,
+                          floating[:] Djs,
+                          floating[:] Ejs,
+                          int n_clusters):
+    """Compute best cost change for all the possible swaps."""
+
+    # Initialize best cost change and the associated swap couple.
+    cdef (int, int, floating) best_cost_change = (1, 1, 0.0)
+    cdef int sample_size = len(D)
+    cdef int i, j, h, id_i, id_h, id_j
+    cdef floating cost_change
+    cdef int not_medoid_shape = sample_size - n_clusters
+    cdef bint cluster_i_bool, not_cluster_i_bool, second_best_medoid
+    cdef bint not_second_best_medoid
+
+    # Compute the change in cost for each swap.
+    for h in range(not_medoid_shape):
+        # id of the potential new medoid.
+        id_h = not_medoid_idxs[h]
+        for i in range(n_clusters):
+            # id of the medoid we want to replace.
+            id_i = medoid_idxs[i]
+            cost_change = 0.0
+            # compute for all not-selected points the change in cost
+            for j in range(not_medoid_shape):
+                id_j = not_medoid_idxs[j]
+                cluster_i_bool = D[id_i, id_j] == Djs[id_j]
+                not_cluster_i_bool = D[id_i, id_j] != Djs[id_j]
+                second_best_medoid = D[id_h, id_j] < Ejs[id_j]
+                not_second_best_medoid = D[id_h, id_j] >= Ejs[id_j]
+
+                if cluster_i_bool & second_best_medoid:
+                    cost_change += D[id_j, id_h] - Djs[id_j]
+                elif cluster_i_bool & not_second_best_medoid:
+                    cost_change += Ejs[id_j] - Djs[id_j]
+                elif not_cluster_i_bool & (D[id_j, id_h] < Djs[id_j]):
+                    cost_change += D[id_j, id_h] - Djs[id_j]
+
+            # same for i
+            second_best_medoid = D[id_h, id_i] < Ejs[id_i]
+            if second_best_medoid:
+                cost_change += D[id_i, id_h]
+            else:
+                cost_change += Ejs[id_i]
+
+            if cost_change < best_cost_change[2]:
+                best_cost_change = (id_i, id_h, cost_change)
+
+    # If one of the swaps decreases the objective, return that swap.
+    if best_cost_change[2] < 0:
+        return best_cost_change
+    else:
+        return None
+
+
+def _build(floating[:, :] D, int n_clusters):
+    """Compute BUILD initialization, a greedy medoid initialization."""
+
+    cdef int[:] medoid_idxs = np.zeros(n_clusters, dtype=np.intc)
+    cdef int sample_size = len(D)
+    cdef int[:] not_medoid_idxs = np.arange(sample_size, dtype=np.intc)
+    cdef int i, j, id_i, id_j
+
+    medoid_idxs[0] = np.argmin(np.sum(D, axis=0))
+    not_medoid_idxs = np.delete(not_medoid_idxs, medoid_idxs[0])
+
+    cdef int n_medoids_current = 1
+
+    cdef floating[:] Dj = D[medoid_idxs[0]].copy()
+    cdef floating cost_change
+    cdef (int, int) new_medoid = (medoid_idxs[0], 0)
+    cdef floating cost_change_max
+
+    for _ in range(n_clusters - 1):
+        cost_change_max = 0
+        for i in range(sample_size - n_medoids_current):
+            id_i = not_medoid_idxs[i]
+            cost_change = 0
+            for j in range(sample_size - n_medoids_current):
+                id_j = not_medoid_idxs[j]
+                cost_change += max(0, Dj[id_j] - D[id_i, id_j])
+            if cost_change >= cost_change_max:
+                cost_change_max = cost_change
+                new_medoid = (id_i, i)
+
+        medoid_idxs[n_medoids_current] = new_medoid[0]
+        n_medoids_current += 1
+        not_medoid_idxs = np.delete(not_medoid_idxs, new_medoid[1])
+
+        for id_j in range(sample_size):
+            Dj[id_j] = min(Dj[id_j], D[id_j, new_medoid[0]])
+    return np.array(medoid_idxs)
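For readers who prefer to follow the BUILD logic without Cython, a pure-NumPy sketch of the same greedy initialization (illustrative, not part of the commit):

import numpy as np

def build_init(D, n_clusters):
    # Start from the point with the smallest total distance to all others,
    # then greedily add the point that most reduces the cost of assigning
    # every sample to its nearest already-chosen medoid.
    medoids = [int(np.argmin(D.sum(axis=0)))]
    Dj = D[medoids[0]].copy()  # distance of each sample to its closest medoid
    for _ in range(n_clusters - 1):
        candidates = np.setdiff1d(np.arange(len(D)), medoids)
        # Gain of promoting candidate i: sum_j max(0, Dj[j] - D[i, j]).
        gains = np.maximum(Dj[None, :] - D[candidates], 0).sum(axis=1)
        best = int(candidates[np.argmax(gains)])
        medoids.append(best)
        Dj = np.minimum(Dj, D[best])
    return np.array(medoids)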

sklearn_extra/cluster/tests/test_k_medoids.py

Lines changed: 38 additions & 2 deletions
@@ -12,10 +12,46 @@
 
 from sklearn_extra.cluster import KMedoids
 from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+
 
 seed = 0
 X = np.random.RandomState(seed).rand(100, 5)
 
+# test kmedoid's results
+rng = np.random.RandomState(seed)
+X_cc, y_cc = make_blobs(
+    n_samples=100,
+    centers=np.array([[-1, -1], [1, 1]]),
+    random_state=rng,
+    shuffle=False,
+)
+
+
+@pytest.mark.parametrize("method", ["alternate", "pam"])
+@pytest.mark.parametrize(
+    "init", ["random", "heuristic", "build", "k-medoids++"]
+)
+def test_kmedoid_results(method, init):
+    expected = np.hstack([np.zeros(50), np.ones(50)])
+    km = KMedoids(n_clusters=2, init=init, method=method)
+    km.fit(X_cc)
+    # This test uses data that are not perfectly separable, so the
+    # accuracy is not 1 (around 0.85).
+    assert (np.mean(km.labels_ == expected) > 0.8) or (
+        1 - np.mean(km.labels_ == expected) > 0.8
+    )
+
+
+def test_medoids_invalid_method():
+    with pytest.raises(ValueError, match="invalid is not supported"):
+        KMedoids(n_clusters=1, method="invalid").fit([[0, 1], [1, 1]])
+
+
+def test_medoids_invalid_init():
+    with pytest.raises(ValueError, match="init needs to be one of"):
+        KMedoids(n_clusters=1, init="invalid").fit([[0, 1], [1, 1]])
+
 
 def test_kmedoids_input_validation_and_fit_check():
     rng = np.random.RandomState(seed)
@@ -28,9 +64,9 @@ def test_kmedoids_input_validation_and_fit_check():
     with pytest.raises(ValueError, match=msg):
         KMedoids(n_clusters=None).fit(X)
 
-    msg = "max_iter should be a nonnegative integer. 0 was given"
+    msg = "max_iter should be a nonnegative integer. -1 was given"
     with pytest.raises(ValueError, match=msg):
-        KMedoids(n_clusters=1, max_iter=0).fit(X)
+        KMedoids(n_clusters=1, max_iter=-1).fit(X)
 
     msg = "max_iter should be a nonnegative integer. None was given"
     with pytest.raises(ValueError, match=msg):
