Skip to content

Commit 19a1d6c

Browse files
committed
#659: [Hotfix] Empty clusters and their medoids are erased for K-Medoids.
1 parent 878c199 commit 19a1d6c

File tree

13 files changed

+261
-47
lines changed

13 files changed

+261
-47
lines changed

CHANGES

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
------------------------------------------------------------------------
22

3-
CHANGE NOTES FOR 0.10.1.1 (STARTED Aug 24, 2020), (RELEASED: Nov 24, 2020)
3+
CHANGE NOTES FOR 0.10.1.2 (STARTED Nov 25, 2020), (RELEASED: Nov 25, 2020)
44

55
------------------------------------------------------------------------
66

@@ -10,6 +10,18 @@ CORRECTED MAJOR BUGS:
1010
See: https://github.com/annoviko/pyclustering/issues/659
1111

1212

13+
------------------------------------------------------------------------
14+
15+
CHANGE NOTES FOR 0.10.1.1 (STARTED Nov 24, 2020), (RELEASED: Nov 24, 2020)
16+
17+
------------------------------------------------------------------------
18+
19+
CORRECTED MAJOR BUGS:
20+
21+
- Corrected bug with incorrect cluster allocation for K-Medoids (C++ `pyclustering::clst::kmedoids`).
22+
See: https://github.com/annoviko/pyclustering/issues/659
23+
24+
1325
------------------------------------------------------------------------
1426

1527
CHANGE NOTES FOR 0.10.1 (STARTED Aug 17, 2020), (RELEASED: Nov 19, 2020)

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Python and C++ implementations (C++ pyclustering library) of each algorithm or
99
model. C++ pyclustering library is a part of pyclustering and supported for
1010
Linux, Windows and MacOS operating systems.
1111

12-
**Version**: 0.10.1.1
12+
**Version**: 0.10.1.2
1313

1414
**License**: The 3-Clause BSD License
1515

ccore/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
cmake_minimum_required(VERSION 3.10)
99

1010

11-
project(pyclustering VERSION 0.10.1.1 LANGUAGES CXX)
11+
project(pyclustering VERSION 0.10.1.2 LANGUAGES CXX)
1212

1313

1414
file(MAKE_DIRECTORY build)

ccore/include/pyclustering/cluster/kmedoids.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,15 @@ class kmedoids {
199199
200200
*/
201201
double calculate_swap_cost(const std::size_t p_index_candidate, const std::size_t p_index_cluster) const;
202+
203+
/*!
204+
205+
@brief Erase empty clusters and their medoids.
206+
@details Data might contain many identical points, and as a result several medoids might correspond
207+
to identical points, which can leave some clusters empty.
208+
209+
*/
210+
void erase_empty_clusters();
202211
};
203212

204213

ccore/src/cluster/kmedoids.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ void kmedoids::process(const dataset & p_data, const kmedoids_data_t p_type, kme
9595
}
9696
}
9797

98+
erase_empty_clusters();
99+
98100
m_data_ptr = nullptr;
99101
m_result_ptr = nullptr;
100102
}
@@ -222,6 +224,19 @@ double kmedoids::calculate_swap_cost(const std::size_t p_index_candidate, const
222224
}
223225

224226

227+
void kmedoids::erase_empty_clusters() {
228+
auto & clusters = m_result_ptr->clusters();
229+
auto & medoids = m_result_ptr->medoids();
230+
231+
for (std::size_t index_cluster = clusters.size() - 1; index_cluster != static_cast<std::size_t>(-1); index_cluster--) {
232+
if (clusters[index_cluster].empty()) {
233+
clusters.erase(clusters.begin() + index_cluster);
234+
medoids.erase(medoids.begin() + index_cluster);
235+
}
236+
}
237+
}
238+
239+
225240
}
226241

227242
}

ccore/src/interface/interface_property.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
const char * INTERFACE_DESCRIPTION = "pyclustering library is a C/C++ part of python pyclustering library";
14-
const char * INTERFACE_VERSION = "0.10.1.1";
14+
const char * INTERFACE_VERSION = "0.10.1.2";
1515

1616

1717
void * get_interface_description() {

pyclustering/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@
283283

284284

285285
## The current version of pyclustering library.
286-
__version__ = '0.10.1.1'
286+
__version__ = '0.10.1.2'
287287

288288
## The current root directory of pyclustering library.
289289
__PYCLUSTERING_ROOT_DIRECTORY__ = str(pathlib.Path(__file__).parent)

pyclustering/cluster/examples/kmedoids_examples.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,21 @@
1313
from pyclustering.cluster import cluster_visualizer
1414
from pyclustering.cluster.kmedoids import kmedoids
1515

16-
from pyclustering.utils import read_sample
16+
from pyclustering.utils import read_sample, calculate_distance_matrix
1717
from pyclustering.utils import timedcall, distance_metric, type_metric
1818

1919

20-
def template_clustering(start_medoids, path, tolerance=0.25, show=True, ccore=True):
21-
sample = read_sample(path)
20+
def template_clustering(start_medoids, path, tolerance=0.25, show=True, **kwargs):
21+
ccore = kwargs.get('ccore', True)
22+
data_type = kwargs.get('data_type', 'points')
23+
24+
original_data = read_sample(path)
25+
sample = original_data
26+
if data_type == 'distance_matrix':
27+
sample = calculate_distance_matrix(sample)
2228

2329
metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, data=sample)
24-
kmedoids_instance = kmedoids(sample, start_medoids, tolerance, metric=metric, ccore=ccore)
30+
kmedoids_instance = kmedoids(sample, start_medoids, tolerance, metric=metric, ccore=ccore, data_type=data_type)
2531
(ticks, result) = timedcall(kmedoids_instance.process)
2632

2733
clusters = kmedoids_instance.get_clusters()
@@ -31,12 +37,12 @@ def template_clustering(start_medoids, path, tolerance=0.25, show=True, ccore=Tr
3137

3238
if show is True:
3339
visualizer = cluster_visualizer(1)
34-
visualizer.append_clusters(clusters, sample, 0)
35-
visualizer.append_cluster([sample[index] for index in start_medoids], marker='*', markersize=15)
36-
visualizer.append_cluster(medoids, data=sample, marker='*', markersize=15)
40+
visualizer.append_clusters(clusters, original_data, 0)
41+
visualizer.append_cluster([original_data[index] for index in start_medoids], marker='*', markersize=15)
42+
visualizer.append_cluster(medoids, data=original_data, marker='*', markersize=15)
3743
visualizer.show()
3844

39-
return sample, clusters
45+
return original_data, clusters
4046

4147

4248
def cluster_sample1():
@@ -58,7 +64,8 @@ def cluster_elongate():
5864
template_clustering([8, 56], SIMPLE_SAMPLES.SAMPLE_ELONGATE)
5965

6066
def cluster_lsun():
61-
template_clustering([10, 275, 385], FCPS_SAMPLES.SAMPLE_LSUN)
67+
#template_clustering([10, 275, 385], FCPS_SAMPLES.SAMPLE_LSUN)
68+
template_clustering([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], FCPS_SAMPLES.SAMPLE_LSUN, data_type='distance_matrix')
6269

6370
def cluster_target():
6471
template_clustering([10, 160, 310, 460, 560, 700], FCPS_SAMPLES.SAMPLE_TARGET)

pyclustering/cluster/kmedoids.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,11 @@ def __init__(self, data, initial_index_medoids, tolerance=0.0001, ccore=True, **
105105
@param[in] initial_index_medoids (list): Indexes of initial medoids (indexes of points in input data).
106106
@param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance then the algorithm will stop processing.
107107
@param[in] ccore (bool): If specified then CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
108-
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type', 'itermax').
108+
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: `metric`, `data_type`, `itermax`).
109109
110110
<b>Keyword Args:</b><br>
111111
- metric (distance_metric): Metric that is used for distance calculation between two points.
112-
- data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
112+
- data_type (string): Data type of input sample `data` that is processed by the algorithm (`points`, `distance_matrix`).
113113
- itermax (uint): Maximum number of iteration for cluster analysis.
114114
115115
"""
@@ -172,6 +172,8 @@ def process(self):
172172

173173
iterations += 1
174174

175+
self.__erase_empty_clusters()
176+
175177
return self
176178

177179

@@ -316,7 +318,7 @@ def __update_clusters(self):
316318

317319
for index in range(len(self.__medoid_indexes)):
318320
dist = self.__distance_calculator(index_point, self.__medoid_indexes[index])
319-
321+
320322
if dist < dist_optim_first:
321323
dist_optim_second = dist_optim_first
322324
index_optim = index
@@ -386,3 +388,36 @@ def __calculate_swap_cost(self, index_candidate, cluster_index):
386388
cost += candidate_distance - self.__distance_first_medoid[index_point]
387389

388390
return cost - self.__distance_first_medoid[index_candidate]
391+
392+
393+
def __erase_empty_clusters(self):
    """!
    @brief Remove every empty cluster together with the medoid stored at the same index.
    @details Data might contain many identical points, and as a result several medoids might correspond
              to identical points, which can leave some clusters empty.

    """

    # Nothing to erase: keep the existing containers untouched.
    if not any(len(cluster) == 0 for cluster in self.__clusters):
        return

    kept_clusters = []
    kept_medoids = []

    # Clusters and medoids are addressed by the same position, so they are filtered together.
    for position, cluster in enumerate(self.__clusters):
        if len(cluster) != 0:
            kept_clusters.append(cluster)
            kept_medoids.append(self.__medoid_indexes[position])

    self.__clusters = kept_clusters
    self.__medoid_indexes = kept_medoids

pyclustering/cluster/tests/integration/it_kmedoids.py

Lines changed: 90 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from pyclustering.cluster.tests.kmedoids_templates import kmedoids_test_template
1919
from pyclustering.cluster.kmedoids import kmedoids
2020

21-
from pyclustering.samples.definitions import SIMPLE_SAMPLES
21+
from pyclustering.samples.definitions import SIMPLE_SAMPLES, SIMPLE_ANSWERS
2222

2323
from pyclustering.utils import read_sample
2424
from pyclustering.utils.metric import type_metric, distance_metric
@@ -139,7 +139,7 @@ def testClusterAllocationTheSameObjectsThreeInitialMedoidsByCore(self):
139139
kmedoids_test_template.templateClusterAllocationTheSameObjects(25, 3, True)
140140

141141
def testCoreInterfaceIntInputData(self):
142-
kmedoids_instance = kmedoids([ [1], [2], [3], [20], [21], [22] ], [ 2, 5 ], 0.025, True)
142+
kmedoids_instance = kmedoids([[1], [2], [3], [20], [21], [22]], [2, 5], 0.025, True)
143143
kmedoids_instance.process()
144144
assert len(kmedoids_instance.get_clusters()) == 2
145145

@@ -153,18 +153,21 @@ def testAllocatedRequestedClustersSampleSimple04ByCore(self):
153153

154154
def testAllocatedRequestedClustersWithTheSamePointsByCore(self):
155155
# Bug issue #366 - Kmedoids returns incorrect number of clusters.
156-
sample = [ [0.0, 0.0], [0.1, 0.1], [0.0, 0.0], [0.1, 0.2] ]
157-
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 4, None, True)
156+
sample = [[0.0, 0.0], [0.1, 0.1], [0.0, 0.0], [0.1, 0.2]]
157+
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 3, None, True)
158158
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 3, None, True)
159159
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 2, None, True)
160160
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 1, None, True)
161161

162+
def testAllocatedRequestedClustersWithTheSamePoints2(self):
163+
sample = [[0.23, 0.2], [-0.1, 0.1], [0.0, 0.9], [0.1, -0.2], [0.8, 0.1], [-0.1, 0.1], [-0.4, -0.2], [0.0, 0.9]]
164+
answers = [1, 2, 3, 4, 5, 6, 6, 6]
165+
for expected_amount in answers:
166+
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, expected_amount, None, True)
167+
162168
def testAllocatedRequestedClustersWithTotallyTheSamePointsByCore(self):
163169
# Bug issue #366 - Kmedoids returns incorrect number of clusters.
164-
sample = [ [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0] ]
165-
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 4, None, True)
166-
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 3, None, True)
167-
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 2, None, True)
170+
sample = [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]
168171
kmedoids_test_template.templateAllocateRequestedClusterAmount(sample, 1, None, True)
169172

170173

@@ -184,3 +187,82 @@ def testItermax10Simple01(self):
184187

185188
def testItermax10Simple02(self):
186189
kmedoids_test_template.templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, [3, 12, 20], [10, 5, 8], True, itermax=10)
190+
191+
192+
def testSimple01AnswerByCore(self):
193+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, SIMPLE_ANSWERS.ANSWER_SIMPLE1, True, random_state=1000)
194+
195+
def testSimple01AnswerDistanceMatrixByCore(self):
196+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE1, SIMPLE_ANSWERS.ANSWER_SIMPLE1, True, random_state=1000, data_type='distance_matrix')
197+
198+
def testSimple02AnswerByCore(self):
199+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, SIMPLE_ANSWERS.ANSWER_SIMPLE2, True, random_state=1000)
200+
201+
def testSimple02AnswerDistanceMatrixByCore(self):
202+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE2, SIMPLE_ANSWERS.ANSWER_SIMPLE2, True, random_state=1000, data_type='distance_matrix')
203+
204+
def testSimple03AnswerByCore(self):
205+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, SIMPLE_ANSWERS.ANSWER_SIMPLE3, True, random_state=1000)
206+
207+
def testSimple03AnswerDistanceMatrixByCore(self):
208+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE3, SIMPLE_ANSWERS.ANSWER_SIMPLE3, True, random_state=1000, data_type='distance_matrix')
209+
210+
def testSimple04AnswerByCore(self):
211+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, SIMPLE_ANSWERS.ANSWER_SIMPLE4, True, random_state=1000)
212+
213+
def testSimple04AnswerDistanceMatrixByCore(self):
214+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE4, SIMPLE_ANSWERS.ANSWER_SIMPLE4, True, random_state=1000, data_type='distance_matrix')
215+
216+
def testSimple05AnswerByCore(self):
217+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, SIMPLE_ANSWERS.ANSWER_SIMPLE5, True, random_state=1000)
218+
219+
def testSimple05AnswerDistanceMatrixByCore(self):
220+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE5, SIMPLE_ANSWERS.ANSWER_SIMPLE5, True, random_state=1000, data_type='distance_matrix')
221+
222+
def testSimple06AnswerByCore(self):
223+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE6, SIMPLE_ANSWERS.ANSWER_SIMPLE6, True, random_state=1000)
224+
225+
def testSimple06AnswerDistanceMatrixByCore(self):
226+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE6, SIMPLE_ANSWERS.ANSWER_SIMPLE6, True, random_state=1000, data_type='distance_matrix')
227+
228+
def testSimple07AnswerByCore(self):
229+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE7, SIMPLE_ANSWERS.ANSWER_SIMPLE7, True, random_state=1000)
230+
231+
def testSimple07AnswerDistanceMatrixByCore(self):
232+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE7, SIMPLE_ANSWERS.ANSWER_SIMPLE7, True, random_state=1000, data_type='distance_matrix')
233+
234+
def testSimple08AnswerByCore(self):
235+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE8, SIMPLE_ANSWERS.ANSWER_SIMPLE8, True, random_state=1000)
236+
237+
def testSimple08AnswerDistanceMatrixByCore(self):
238+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE8, SIMPLE_ANSWERS.ANSWER_SIMPLE8, True, random_state=1000, data_type='distance_matrix')
239+
240+
def testSimple09AnswerByCore(self):
241+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE9, SIMPLE_ANSWERS.ANSWER_SIMPLE9, True, random_state=1000)
242+
243+
def testSimple09AnswerDistanceMatrixByCore(self):
244+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE9, SIMPLE_ANSWERS.ANSWER_SIMPLE9, True, random_state=1000, data_type='distance_matrix')
245+
246+
def testSimple10AnswerByCore(self):
247+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE10, SIMPLE_ANSWERS.ANSWER_SIMPLE10, True, random_state=1000)
248+
249+
def testSimple10AnswerDistanceMatrixByCore(self):
250+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE10, SIMPLE_ANSWERS.ANSWER_SIMPLE10, True, random_state=1000, data_type='distance_matrix')
251+
252+
def testSimple11AnswerByCore(self):
253+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE11, SIMPLE_ANSWERS.ANSWER_SIMPLE11, True, random_state=1000)
254+
255+
def testSimple11AnswerDistanceMatrixByCore(self):
256+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE11, SIMPLE_ANSWERS.ANSWER_SIMPLE11, True, random_state=1000, data_type='distance_matrix')
257+
258+
def testSimple12AnswerByCore(self):
259+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE12, SIMPLE_ANSWERS.ANSWER_SIMPLE12, True, random_state=1000)
260+
261+
def testSimple12AnswerDistanceMatrixByCore(self):
262+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE12, SIMPLE_ANSWERS.ANSWER_SIMPLE12, True, random_state=1000, data_type='distance_matrix')
263+
264+
def testSimple13AnswerByCore(self):
265+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE13, SIMPLE_ANSWERS.ANSWER_SIMPLE13, True, random_state=1000)
266+
267+
def testSimple13AnswerDistanceMatrixByCore(self):
268+
kmedoids_test_template.clustering_with_answer(SIMPLE_SAMPLES.SAMPLE_SIMPLE13, SIMPLE_ANSWERS.ANSWER_SIMPLE13, True, random_state=1000, data_type='distance_matrix')

0 commit comments

Comments
 (0)