4848def _tree_to_labels (X , single_linkage_tree , min_cluster_size = 10 ,
4949 cluster_selection_method = 'eom' ,
5050 allow_single_cluster = False ,
51- match_reference_implementation = False ):
51+ match_reference_implementation = False ,
52+ cluster_selection_epsilon = 0.0 ):
5253 """Converts a pretrained tree and cluster size into a
5354 set of labels and probabilities.
5455 """
@@ -59,7 +60,8 @@ def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10,
5960 stability_dict ,
6061 cluster_selection_method ,
6162 allow_single_cluster ,
62- match_reference_implementation )
63+ match_reference_implementation ,
64+ cluster_selection_epsilon )
6365
6466 return (labels , probabilities , stabilities , condensed_tree ,
6567 single_linkage_tree )
@@ -210,9 +212,9 @@ def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
210212
211213 # Convert edge list into standard hierarchical clustering format
212214 single_linkage_tree = label (min_spanning_tree )
213-
215+
214216 if gen_min_span_tree :
215- warn ('Cannot generate Minimum Spanning Tree; '
217+ warn ('Cannot generate Minimum Spanning Tree; '
216218 'the implemented Prim\' s does not produce '
217219 'the full minimum spanning tree ' , UserWarning )
218220
@@ -246,9 +248,9 @@ def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
246248 :]
247249 # Convert edge list into standard hierarchical clustering format
248250 single_linkage_tree = label (min_spanning_tree )
249-
251+
250252 if gen_min_span_tree :
251- warn ('Cannot generate Minimum Spanning Tree; '
253+ warn ('Cannot generate Minimum Spanning Tree; '
252254 'the implemented Prim\' s does not produce '
253255 'the full minimum spanning tree ' , UserWarning )
254256
@@ -327,7 +329,7 @@ def check_precomputed_distance_matrix(X):
327329 check_array (tmp )
328330
329331
330- def hdbscan (X , min_cluster_size = 5 , min_samples = None , alpha = 1.0 ,
332+ def hdbscan (X , min_cluster_size = 5 , min_samples = None , alpha = 1.0 , cluster_selection_epsilon = 0.0 ,
331333 metric = 'minkowski' , p = 2 , leaf_size = 40 ,
332334 algorithm = 'best' , memory = Memory (cachedir = None , verbose = 0 ),
333335 approx_min_span_tree = True , gen_min_span_tree = False ,
@@ -353,6 +355,10 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
353355 to be considered as a core point. This includes the point itself.
354356 defaults to the min_cluster_size.
355357
358+ cluster_selection_epsilon : float, optional (default=0.0)
359+ A distance threshold. Clusters below this value will be merged.
360+ See [3]_ for more information.
361+
356362 alpha : float, optional (default=1.0)
357363 A distance scaling parameter as used in robust single linkage.
358364 See [2]_ for more information.
@@ -471,6 +477,8 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
471477 cluster tree. In Advances in Neural Information Processing Systems
472478 (pp. 343-351).
473479
480+ .. [3] Malzer, C., & Baum, M. (2019). HDBSCAN(ε^): An Alternative Cluster
481+ Extraction Method for HDBSCAN. arXiv preprint arXiv:1911.02282.
474482 """
475483 if min_samples is None :
476484 min_samples = min_cluster_size
@@ -485,6 +493,12 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
485493 if min_cluster_size == 1 :
486494 raise ValueError ('Min cluster size must be greater than one' )
487495
496+ if type (cluster_selection_epsilon ) is int :
497+ cluster_selection_epsilon = float (cluster_selection_epsilon )
498+
499+ if type (cluster_selection_epsilon ) is not float or cluster_selection_epsilon < 0.0 :
500+ raise ValueError ('Epsilon must be a float value greater than or equal to 0!' )
501+
488502 if not isinstance (alpha , float ) or alpha <= 0.0 :
489503 raise ValueError ('Alpha must be a positive float value greater than'
490504 ' 0!' )
@@ -528,7 +542,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
528542 if algorithm != 'best' :
529543 if metric != 'precomputed' and issparse (X ) and metric != 'generic' :
530544 raise ValueError ("Sparse data matrices only support algorithm 'generic'." )
531-
545+
532546 if algorithm == 'generic' :
533547 (single_linkage_tree ,
534548 result_min_span_tree ) = memory .cache (
@@ -616,7 +630,8 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
616630 min_cluster_size ,
617631 cluster_selection_method ,
618632 allow_single_cluster ,
619- match_reference_implementation ) + \
633+ match_reference_implementation ,
634+ cluster_selection_epsilon ) + \
620635 (result_min_span_tree ,)
621636
622637
@@ -656,6 +671,10 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
656671 A distance scaling parameter as used in robust single linkage.
657672 See [3]_ for more information.
658673
674+ cluster_selection_epsilon : float, optional (default=0.0)
675+ A distance threshold. Clusters below this value will be merged.
676+ See [5]_ for more information.
677+
659678 algorithm : string, optional (default='best')
660679 Exactly which algorithm to use; hdbscan has variants specialised
661680 for different characteristics of the data. By default this is set
@@ -813,9 +832,12 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
813832 Sander, J., 2014. Density-Based Clustering Validation. In SDM
814833 (pp. 839-847).
815834
835+ .. [5] Malzer, C., & Baum, M. (2019). HDBSCAN(ε^): An Alternative Cluster
836+ Extraction Method for HDBSCAN. arXiv preprint arXiv:1911.02282.
837+
816838 """
817839
818- def __init__ (self , min_cluster_size = 5 , min_samples = None ,
840+ def __init__ (self , min_cluster_size = 5 , min_samples = None , cluster_selection_epsilon = 0.0 ,
819841 metric = 'euclidean' , alpha = 1.0 , p = None ,
820842 algorithm = 'best' , leaf_size = 40 ,
821843 memory = Memory (cachedir = None , verbose = 0 ),
@@ -829,7 +851,7 @@ def __init__(self, min_cluster_size=5, min_samples=None,
829851 self .min_cluster_size = min_cluster_size
830852 self .min_samples = min_samples
831853 self .alpha = alpha
832-
854+ self . cluster_selection_epsilon = cluster_selection_epsilon
833855 self .metric = metric
834856 self .p = p
835857 self .algorithm = algorithm
0 commit comments