Skip to content

Commit 523cd86

Browse files
authored
Merge pull request #552 from luis261/master
Add parameter to DBCV which toggles the usage of mutual reachability distances
2 parents 94744a5 + 2137a20 commit 523cd86

File tree

1 file changed

+63
-31
lines changed

1 file changed

+63
-31
lines changed

hdbscan/validity.py

Lines changed: 63 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,24 @@ def all_points_core_distance(distance_matrix, d=2.0):
3636
return result
3737

3838

39-
def all_points_mutual_reachability(X, labels, cluster_id,
40-
metric='euclidean', d=None, **kwd_args):
39+
def max_ratio(stacked_distances):
    """
    Return the largest core-distance to raw-distance ratio in a cluster.

    Parameters
    ----------
    stacked_distances : array (n_samples, n_samples, k), k >= 2
        Stacked per-pair distances. Layer 0 along the last axis is read as
        the raw pairwise distance and layer 1 as the core distance.

    Returns
    -------
    ratio : float or int
        The maximum of ``core / raw`` over all pairs whose raw distance is
        non-zero. Returns 0 when no pair has a non-zero raw distance (for
        example an empty or single-point cluster) or when no ratio is
        positive.
    """
    raw = stacked_distances[:, :, 0]
    core = stacked_distances[:, :, 1]

    # Pairs at zero raw distance (e.g. the diagonal) would divide by zero,
    # so they are excluded before the ratio is formed.
    nonzero = raw != 0
    if not nonzero.any():
        return 0

    ratios = core[nonzero] / raw[nonzero]

    # The running maximum starts at 0, so the result never drops below it.
    return max(ratios.max(), 0)
50+
51+
52+
def distances_between_points(X, labels, cluster_id,
53+
metric='euclidean', d=None, no_coredist=False,
54+
print_max_raw_to_coredist_ratio=False, **kwd_args):
4155
"""
42-
Compute the all-points-mutual-reachability distances for all the points of
43-
a cluster.
56+
Compute pairwise distances for all the points of a cluster.
4457
4558
If metric is 'precomputed' then assume X is a distance matrix for the full
4659
dataset. Note that in this case you must pass in 'd' the dimension of the
@@ -58,9 +71,7 @@ def all_points_mutual_reachability(X, labels, cluster_id,
5871
cluster label to each data point, with -1 for noise points.
5972
6073
cluster_id : integer
61-
The cluster label for which to compute the all-points
62-
mutual-reachability (which should be done on a cluster
63-
by cluster basis).
74+
The cluster label for which to compute the distances
6475
6576
metric : string
6677
The metric used to compute distances for the clustering (and
@@ -80,9 +91,8 @@ def all_points_mutual_reachability(X, labels, cluster_id,
8091
Returns
8192
-------
8293
83-
mutual_reachaibility : array (n_samples, n_samples)
84-
The pairwise mutual reachability distances between all points in `X`
85-
with `label` equal to `cluster_id`.
94+
distances : array (n_samples, n_samples)
95+
The distances between all points in `X` with `label` equal to `cluster_id`.
8696
8797
core_distances : array (n_samples,)
8898
The all-points-core_distance of all points in `X` with `label` equal
@@ -104,13 +114,19 @@ def all_points_mutual_reachability(X, labels, cluster_id,
104114
**kwd_args)
105115
d = X.shape[1]
106116

107-
core_distances = all_points_core_distance(distance_matrix.copy(), d=d)
108-
core_dist_matrix = np.tile(core_distances, (core_distances.shape[0], 1))
117+
if no_coredist:
118+
return distance_matrix, None
119+
120+
else:
121+
core_distances = all_points_core_distance(distance_matrix.copy(), d=d)
122+
core_dist_matrix = np.tile(core_distances, (core_distances.shape[0], 1))
123+
stacked_distances = np.dstack(
124+
[distance_matrix, core_dist_matrix, core_dist_matrix.T])
109125

110-
result = np.dstack(
111-
[distance_matrix, core_dist_matrix, core_dist_matrix.T]).max(axis=-1)
126+
if print_max_raw_to_coredist_ratio:
127+
print("Max raw distance to coredistance ratio: " + str(max_ratio(stacked_distances)))
112128

113-
return result, core_distances
129+
return stacked_distances.max(axis=-1), core_distances
114130

115131

116132
def internal_minimum_spanning_tree(mr_distances):
@@ -181,11 +197,10 @@ def internal_minimum_spanning_tree(mr_distances):
181197
def density_separation(X, labels, cluster_id1, cluster_id2,
182198
internal_nodes1, internal_nodes2,
183199
core_distances1, core_distances2,
184-
metric='euclidean', **kwd_args):
200+
metric='euclidean', no_coredist=False, **kwd_args):
185201
"""
186202
Compute the density separation between two clusters. This is the minimum
187-
all-points mutual reachability distance between pairs of points, one from
188-
internal nodes of MSTs of each cluster.
203+
distance between pairs of points, one from internal nodes of MSTs of each cluster.
189204
190205
Parameters
191206
----------
@@ -246,20 +261,24 @@ def density_separation(X, labels, cluster_id1, cluster_id2,
246261
cluster2 = X[labels == cluster_id2][internal_nodes2]
247262
distance_matrix = cdist(cluster1, cluster2, metric, **kwd_args)
248263

249-
core_dist_matrix1 = np.tile(core_distances1[internal_nodes1],
250-
(distance_matrix.shape[1], 1)).T
251-
core_dist_matrix2 = np.tile(core_distances2[internal_nodes2],
252-
(distance_matrix.shape[0], 1))
264+
if no_coredist:
265+
return distance_matrix.min()
253266

254-
mr_dist_matrix = np.dstack([distance_matrix,
255-
core_dist_matrix1,
256-
core_dist_matrix2]).max(axis=-1)
267+
else:
268+
core_dist_matrix1 = np.tile(core_distances1[internal_nodes1],
269+
(distance_matrix.shape[1], 1)).T
270+
core_dist_matrix2 = np.tile(core_distances2[internal_nodes2],
271+
(distance_matrix.shape[0], 1))
272+
273+
mr_dist_matrix = np.dstack([distance_matrix,
274+
core_dist_matrix1,
275+
core_dist_matrix2]).max(axis=-1)
257276

258-
return mr_dist_matrix.min()
277+
return mr_dist_matrix.min()
259278

260279

261280
def validity_index(X, labels, metric='euclidean',
262-
d=None, per_cluster_scores=False, **kwd_args):
281+
d=None, per_cluster_scores=False, mst_raw_dist=False, verbose=False, **kwd_args):
263282
"""
264283
Compute the density based cluster validity index for the
265284
clustering specified by `labels` and for each cluster in `labels`.
@@ -291,6 +310,11 @@ def validity_index(X, labels, metric='euclidean',
291310
Defaults to False with the function returning a single float
292311
value for the whole clustering.
293312
313+
mst_raw_dist : optional, boolean (default False)
314+
If True, the MSTs are constructed solely from 'raw' distances (as given by the chosen metric, e.g. euclidean distances)
315+
instead of using mutual reachability distances. Thus setting this parameter to True avoids using 'all-points-core-distances' at all.
316+
This is advantageous specifically in the case of elongated clusters that lie in close proximity to each other <citation needed>.
317+
294318
**kwd_args :
295319
Extra arguments to pass to the distance computation for other
296320
metrics, such as minkowski, Mahalanobis, etc.
@@ -327,18 +351,20 @@ def validity_index(X, labels, metric='euclidean',
327351
if np.sum(labels == cluster_id) == 0:
328352
continue
329353

330-
mr_distances, core_distances[
331-
cluster_id] = all_points_mutual_reachability(
354+
distances_for_mst, core_distances[
355+
cluster_id] = distances_between_points(
332356
X,
333357
labels,
334358
cluster_id,
335359
metric,
336360
d,
361+
no_coredist=mst_raw_dist,
362+
print_max_raw_to_coredist_ratio=verbose,
337363
**kwd_args
338364
)
339365

340366
mst_nodes[cluster_id], mst_edges[cluster_id] = \
341-
internal_minimum_spanning_tree(mr_distances)
367+
internal_minimum_spanning_tree(distances_for_mst)
342368
density_sparseness[cluster_id] = mst_edges[cluster_id].T[2].max()
343369

344370
for i in range(max_cluster_id):
@@ -357,7 +383,8 @@ def validity_index(X, labels, metric='euclidean',
357383
X, labels, i, j,
358384
internal_nodes_i, internal_nodes_j,
359385
core_distances[i], core_distances[j],
360-
metric=metric, **kwd_args
386+
metric=metric, no_coredist=mst_raw_dist,
387+
**kwd_args
361388
)
362389
density_sep[j, i] = density_sep[i, j]
363390

@@ -374,6 +401,11 @@ def validity_index(X, labels, metric='euclidean',
374401
(min_density_sep - density_sparseness[i]) /
375402
max(min_density_sep, density_sparseness[i])
376403
)
404+
405+
if verbose:
406+
print("Minimum density separation: " + str(min_density_sep))
407+
print("Density sparseness: " + str(density_sparseness[i]))
408+
377409
cluster_size = np.sum(labels == i)
378410
result += (cluster_size / n_samples) * cluster_validity_indices[i]
379411

0 commit comments

Comments
 (0)