@@ -72,8 +72,10 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
72
72
elif metric == 'arccos' :
73
73
distance_matrix = pairwise_distances (X , metric = 'cosine' , ** kwargs )
74
74
elif metric == 'precomputed' :
75
- # Treating this case explicitly instead of letting sklearn.metrics.pairwise_distances handle it enables
76
- # the usage of numpy.inf in the distance matrix to indicate missing information.
75
+ # Treating this case explicitly, instead of letting
76
+ # sklearn.metrics.pairwise_distances handle it,
77
+ # enables the usage of numpy.inf in the distance
78
+ # matrix to indicate missing distance information.
77
79
# TODO: Check if copying is necessary
78
80
distance_matrix = X .copy ()
79
81
else :
@@ -91,6 +93,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
91
93
92
94
min_spanning_tree = mst_linkage_core (mutual_reachability_ )
93
95
96
+ # Warn if the MST couldn't be constructed around the missing distances
97
+ if np .isinf (min_spanning_tree .T [2 ]).any ():
98
+ warn ('The minimum spanning tree contains edge weights with value '
99
+ 'infinity. Potentially, you are missing too many distances '
100
+ 'in the initial distance matrix for the given neighborhood '
101
+ 'size.' , UserWarning )
102
+
94
103
# mst_linkage_core does not generate a full minimal spanning tree
95
104
# If a tree is required then we must build the edges from the information
96
105
# returned by mst_linkage_core (i.e. just the order of points to be merged)
@@ -477,10 +486,13 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
477
486
'Should be one of: "eom", "leaf"\n ' )
478
487
479
488
# Checks input and converts to an nd-array where possible
480
- if metric != 'precomputed' :
489
+ if metric != 'precomputed' or issparse ( X ) :
481
490
X = check_array (X , accept_sparse = 'csr' )
482
491
else :
492
+ # Only non-sparse, precomputed distance matrices are handled here
493
+ # and thereby allowed to contain numpy.inf for missing distances
483
494
check_precomputed_distance_matrix (X )
495
+
484
496
# Python 2 and 3 compliant string_type checking
485
497
if isinstance (memory , six .string_types ):
486
498
memory = Memory (cachedir = memory , verbose = 0 )
0 commit comments