Skip to content

Commit e92012d

Browse files
committed
add np.inf for missing values in precomputed metric
1 parent f3769a5 commit e92012d

File tree

1 file changed

+20
-2
lines changed

1 file changed

+20
-2
lines changed

hdbscan/hdbscan_.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
7171
distance_matrix = pairwise_distances(X, metric=metric, p=p)
7272
elif metric == 'arccos':
7373
distance_matrix = pairwise_distances(X, metric='cosine', **kwargs)
74+
elif metric == 'precomputed':
75+
# Treating this case explicitly instead of letting sklearn.metrics.pairwise_distances handle it enables
76+
# the usage of numpy.inf in the distance matrix to indicate missing information.
77+
# TODO: Check if copying is necessary
78+
distance_matrix = X.copy()
7479
else:
7580
distance_matrix = pairwise_distances(X, metric=metric, **kwargs)
7681

@@ -282,6 +287,14 @@ def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
282287
return single_linkage_tree, None
283288

284289

290+
def check_precomputed_distance_matrix(X):
    """Validate a precomputed distance matrix that may contain infinities.

    Infinite entries (``numpy.inf``, used in this module to mark missing
    distance information) are replaced by the placeholder value 1 in a copy
    of ``X``; the copy is then handed to ``check_array``. The result of
    ``check_array`` is discarded, so the function returns ``None`` and is
    called only for the validation ``check_array`` performs.

    Parameters
    ----------
    X : array-like
        Precomputed distance matrix. It must provide ``copy()`` and support
        assignment through a boolean mask (presumably a numpy ndarray;
        confirm against callers).
    """
    # Work on a copy so the placeholder substitution is not applied to X
    # itself (assumes X.copy() returns an independent copy).
    finite_view = X.copy()
    infinite_mask = np.isinf(finite_view)
    # The value 1 is an arbitrary finite stand-in; only finiteness matters
    # for the check below.
    finite_view[infinite_mask] = 1
    check_array(finite_view)
296+
297+
285298
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
286299
metric='minkowski', p=2, leaf_size=40,
287300
algorithm='best', memory=Memory(cachedir=None, verbose=0),
@@ -464,7 +477,10 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
464477
'Should be one of: "eom", "leaf"\n')
465478

466479
# Checks input and converts to an nd-array where possible
467-
X = check_array(X, accept_sparse='csr')
480+
if metric != 'precomputed':
481+
X = check_array(X, accept_sparse='csr')
482+
else:
483+
check_precomputed_distance_matrix(X)
468484
# Python 2 and 3 compliant string_type checking
469485
if isinstance(memory, six.string_types):
470486
memory = Memory(cachedir=memory, verbose=0)
@@ -798,9 +814,11 @@ def fit(self, X, y=None):
798814
self : object
799815
Returns self
800816
"""
801-
X = check_array(X, accept_sparse='csr')
802817
if self.metric != 'precomputed':
818+
X = check_array(X, accept_sparse='csr')
803819
self._raw_data = X
820+
else:
821+
check_precomputed_distance_matrix(X)
804822

805823
kwargs = self.get_params()
806824
# prediction data only applies to the persistent model, so remove

0 commit comments

Comments
 (0)