Skip to content

Commit 1f04e0b

Browse files
committed
Handle sparse matrices; add a user warning for too many missing distances
1 parent e92012d commit 1f04e0b

File tree

1 file changed

+15
-3
lines changed

1 file changed

+15
-3
lines changed

hdbscan/hdbscan_.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,10 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
7272
elif metric == 'arccos':
7373
distance_matrix = pairwise_distances(X, metric='cosine', **kwargs)
7474
elif metric == 'precomputed':
75-
# Treating this case explicitly instead of letting sklearn.metrics.pairwise_distances handle it enables
76-
# the usage of numpy.inf in the distance matrix to indicate missing information.
75+
# Treating this case explicitly, instead of letting
76+
# sklearn.metrics.pairwise_distances handle it,
77+
# enables the usage of numpy.inf in the distance
78+
# matrix to indicate missing distance information.
7779
# TODO: Check if copying is necessary
7880
distance_matrix = X.copy()
7981
else:
@@ -91,6 +93,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
9193

9294
min_spanning_tree = mst_linkage_core(mutual_reachability_)
9395

96+
# Warn if the MST couldn't be constructed around the missing distances
97+
if np.isinf(min_spanning_tree.T[2]).any():
98+
warn('The minimum spanning tree contains edge weights with value '
99+
'infinity. Potentially, you are missing too many distances '
100+
'in the initial distance matrix for the given neighborhood '
101+
'size.', UserWarning)
102+
94103
# mst_linkage_core does not generate a full minimal spanning tree
95104
# If a tree is required then we must build the edges from the information
96105
# returned by mst_linkage_core (i.e. just the order of points to be merged)
@@ -477,10 +486,13 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
477486
'Should be one of: "eom", "leaf"\n')
478487

479488
# Checks input and converts to an nd-array where possible
480-
if metric != 'precomputed':
489+
if metric != 'precomputed' or issparse(X):
481490
X = check_array(X, accept_sparse='csr')
482491
else:
492+
# Only non-sparse, precomputed distance matrices are handled here
493+
# and thereby allowed to contain numpy.inf for missing distances
483494
check_precomputed_distance_matrix(X)
495+
484496
# Python 2 and 3 compliant string_type checking
485497
if isinstance(memory, six.string_types):
486498
memory = Memory(cachedir=memory, verbose=0)

0 commit comments

Comments
 (0)