Skip to content

Commit 51fd5ed

Browse files
authored
Merge pull request #189 from LGro/missing_distances
allow np.inf for missing values in case of precomputed metric
2 parents f3769a5 + 9cdc513 commit 51fd5ed

File tree

1 file changed

+37
-2
lines changed

1 file changed

+37
-2
lines changed

hdbscan/hdbscan_.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
7171
distance_matrix = pairwise_distances(X, metric=metric, p=p)
7272
elif metric == 'arccos':
7373
distance_matrix = pairwise_distances(X, metric='cosine', **kwargs)
74+
elif metric == 'precomputed':
75+
# Treating this case explicitly, instead of letting
76+
# sklearn.metrics.pairwise_distances handle it,
77+
# enables the usage of numpy.inf in the distance
78+
# matrix to indicate missing distance information.
79+
# TODO: Check if copying is necessary
80+
distance_matrix = X.copy()
7481
else:
7582
distance_matrix = pairwise_distances(X, metric=metric, **kwargs)
7683

@@ -86,6 +93,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
8693

8794
min_spanning_tree = mst_linkage_core(mutual_reachability_)
8895

96+
# Warn if the MST couldn't be constructed around the missing distances
97+
if np.isinf(min_spanning_tree.T[2]).any():
98+
warn('The minimum spanning tree contains edge weights with value '
99+
'infinity. Potentially, you are missing too many distances '
100+
'in the initial distance matrix for the given neighborhood '
101+
'size.', UserWarning)
102+
89103
# mst_linkage_core does not generate a full minimal spanning tree
90104
# If a tree is required then we must build the edges from the information
91105
# returned by mst_linkage_core (i.e. just the order of points to be merged)
@@ -282,6 +296,14 @@ def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
282296
return single_linkage_tree, None
283297

284298

299+
def check_precomputed_distance_matrix(X):
    """Validate a dense precomputed distance matrix that may contain np.inf.

    Infinite entries are used to mark missing distances. They are replaced
    by a finite placeholder in a scratch copy, and that copy is then passed
    to ``check_array`` so that the remaining validation still runs without
    the infinite entries being rejected.

    Parameters
    ----------
    X : array-like
        Dense precomputed distance matrix. It is never modified; all work
        happens on a copy.

    Returns
    -------
    None
        The function is called for validation only. Any exception raised
        by ``check_array`` propagates to the caller.
    """
    # np.array always makes a fresh ndarray, so the caller's data stays
    # untouched and array-likes such as nested lists are accepted too
    # (a plain list has no boolean-mask assignment, and no .copy() on
    # Python 2).
    tmp = np.array(X)
    # The placeholder value is arbitrary; it only has to be finite.
    tmp[np.isinf(tmp)] = 1
    check_array(tmp)
305+
306+
285307
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
286308
metric='minkowski', p=2, leaf_size=40,
287309
algorithm='best', memory=Memory(cachedir=None, verbose=0),
@@ -464,7 +486,13 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
464486
'Should be one of: "eom", "leaf"\n')
465487

466488
# Checks input and converts to an nd-array where possible
467-
X = check_array(X, accept_sparse='csr')
489+
if metric != 'precomputed' or issparse(X):
490+
X = check_array(X, accept_sparse='csr')
491+
else:
492+
# Only non-sparse, precomputed distance matrices are handled here
493+
# and thereby allowed to contain numpy.inf for missing distances
494+
check_precomputed_distance_matrix(X)
495+
468496
# Python 2 and 3 compliant string_type checking
469497
if isinstance(memory, six.string_types):
470498
memory = Memory(cachedir=memory, verbose=0)
@@ -798,9 +826,16 @@ def fit(self, X, y=None):
798826
self : object
799827
Returns self
800828
"""
801-
X = check_array(X, accept_sparse='csr')
802829
if self.metric != 'precomputed':
830+
X = check_array(X, accept_sparse='csr')
803831
self._raw_data = X
832+
elif issparse(X):
833+
# Handle sparse precomputed distance matrices separately
834+
X = check_array(X, accept_sparse='csr')
835+
else:
836+
# Only non-sparse, precomputed distance matrices are allowed
837+
# to have numpy.inf values indicating missing distances
838+
check_precomputed_distance_matrix(X)
804839

805840
kwargs = self.get_params()
806841
# prediction data only applies to the persistent model, so remove

0 commit comments

Comments
 (0)