@@ -71,6 +71,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
71
71
distance_matrix = pairwise_distances (X , metric = metric , p = p )
72
72
elif metric == 'arccos' :
73
73
distance_matrix = pairwise_distances (X , metric = 'cosine' , ** kwargs )
74
+ elif metric == 'precomputed' :
75
+ # Treating this case explicitly, instead of letting
76
+ # sklearn.metrics.pairwise_distances handle it,
77
+ # enables the usage of numpy.inf in the distance
78
+ # matrix to indicate missing distance information.
79
+ # TODO: Check if copying is necessary
80
+ distance_matrix = X .copy ()
74
81
else :
75
82
distance_matrix = pairwise_distances (X , metric = metric , ** kwargs )
76
83
@@ -86,6 +93,13 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
86
93
87
94
min_spanning_tree = mst_linkage_core (mutual_reachability_ )
88
95
96
+ # Warn if the MST couldn't be constructed around the missing distances
97
+ if np .isinf (min_spanning_tree .T [2 ]).any ():
98
+ warn ('The minimum spanning tree contains edge weights with value '
99
+ 'infinity. Potentially, you are missing too many distances '
100
+ 'in the initial distance matrix for the given neighborhood '
101
+ 'size.' , UserWarning )
102
+
89
103
# mst_linkage_core does not generate a full minimal spanning tree
90
104
# If a tree is required then we must build the edges from the information
91
105
# returned by mst_linkage_core (i.e. just the order of points to be merged)
@@ -282,6 +296,14 @@ def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
282
296
return single_linkage_tree , None
283
297
284
298
299
def check_precomputed_distance_matrix(X):
    """Run ``check_array`` on a distance matrix while tolerating infinities.

    Infinite entries (as reported by ``numpy.isinf``) are overwritten with
    the placeholder value 1 in a scratch copy of ``X``; ``check_array`` is
    then applied to that copy. The copy is discarded afterwards and nothing
    is returned, so the caller keeps working with the original ``X``.

    Parameters
    ----------
    X : array-like
        Precomputed distance matrix. It must provide a ``copy()`` method and
        support assignment through a boolean mask.
    """
    sanitized = X.copy()
    infinite_entries = np.isinf(sanitized)
    # The replacement value is only a stand-in so that validation does not
    # trip over the infinities; it never reaches the caller.
    sanitized[infinite_entries] = 1
    check_array(sanitized)
305
+
306
+
285
307
def hdbscan (X , min_cluster_size = 5 , min_samples = None , alpha = 1.0 ,
286
308
metric = 'minkowski' , p = 2 , leaf_size = 40 ,
287
309
algorithm = 'best' , memory = Memory (cachedir = None , verbose = 0 ),
@@ -464,7 +486,13 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
464
486
'Should be one of: "eom", "leaf"\n ' )
465
487
466
488
# Checks input and converts to an nd-array where possible
467
- X = check_array (X , accept_sparse = 'csr' )
489
+ if metric != 'precomputed' or issparse (X ):
490
+ X = check_array (X , accept_sparse = 'csr' )
491
+ else :
492
+ # Only non-sparse, precomputed distance matrices are handled here
493
+ # and thereby allowed to contain numpy.inf for missing distances
494
+ check_precomputed_distance_matrix (X )
495
+
468
496
# Python 2 and 3 compliant string_type checking
469
497
if isinstance (memory , six .string_types ):
470
498
memory = Memory (cachedir = memory , verbose = 0 )
@@ -798,9 +826,16 @@ def fit(self, X, y=None):
798
826
self : object
799
827
Returns self
800
828
"""
801
- X = check_array (X , accept_sparse = 'csr' )
802
829
if self .metric != 'precomputed' :
830
+ X = check_array (X , accept_sparse = 'csr' )
803
831
self ._raw_data = X
832
+ elif issparse (X ):
833
+ # Handle sparse precomputed distance matrices separately
834
+ X = check_array (X , accept_sparse = 'csr' )
835
+ else :
836
+ # Only non-sparse, precomputed distance matrices are allowed
837
+ # to have numpy.inf values indicating missing distances
838
+ check_precomputed_distance_matrix (X )
804
839
805
840
kwargs = self .get_params ()
806
841
# prediction data only applies to the persistent model, so remove
0 commit comments