@@ -72,6 +72,7 @@ def _tree_to_labels(
7272 match_reference_implementation = False ,
7373 cluster_selection_epsilon = 0.0 ,
7474 max_cluster_size = 0 ,
75+ cluster_selection_epsilon_max = float ('inf' ),
7576):
7677 """Converts a pretrained tree and cluster size into a
7778 set of labels and probabilities.
@@ -86,6 +87,7 @@ def _tree_to_labels(
8687 match_reference_implementation ,
8788 cluster_selection_epsilon ,
8889 max_cluster_size ,
90+ cluster_selection_epsilon_max ,
8991 )
9092
9193 return (labels , probabilities , stabilities , condensed_tree , single_linkage_tree )
@@ -529,6 +531,7 @@ def hdbscan(
529531 cluster_selection_method = "eom" ,
530532 allow_single_cluster = False ,
531533 match_reference_implementation = False ,
534+ cluster_selection_epsilon_max = float ('inf' ),
532535 ** kwargs
533536):
534537 """Perform HDBSCAN clustering from a vector array or distance matrix.
@@ -555,7 +558,7 @@ def hdbscan(
555558 See [3]_ for more information. Note that this should not be used
556559 if we want to predict the cluster labels for new points in future
557560 (e.g. using approximate_predict), as the approximate_predict function
558- is not aware of this argument.
561+ is not aware of this argument. This is the minimum epsilon allowed.
559562
560563 alpha : float, optional (default=1.0)
561564 A distance scaling parameter as used in robust single linkage.
@@ -641,6 +644,16 @@ def hdbscan(
641644 performance cost, ensure that the clustering results match the
642645 reference implementation.
643646
647+ cluster_selection_epsilon_max : float, optional (default=inf)
648+ A distance threshold. Clusters above this value will be split.
649+ Has no effect when using leaf clustering (where clusters are
650+ usually small regardless) and can also be overridden in rare
651+ cases by a high value for cluster_selection_epsilon. Note that
652+ this should not be used if we want to predict the cluster labels
653+ for new points in future (e.g. using approximate_predict), as
654+ the approximate_predict function is not aware of this argument.
655+ This is the maximum epsilon allowed.
656+
644657 **kwargs : optional
645658 Arguments passed to the distance metric
646659
@@ -722,6 +735,9 @@ def hdbscan(
722735 "Minkowski metric with negative p value is not" " defined!"
723736 )
724737
738+ if cluster_selection_epsilon_max < cluster_selection_epsilon :
739+ raise ValueError ("Cluster selection epsilon max must be greater than epsilon!" )
740+
725741 if match_reference_implementation :
726742 min_samples = min_samples - 1
727743 min_cluster_size = min_cluster_size + 1
@@ -891,6 +907,7 @@ def hdbscan(
891907 match_reference_implementation ,
892908 cluster_selection_epsilon ,
893909 max_cluster_size ,
910+ cluster_selection_epsilon_max ,
894911 )
895912 + (result_min_span_tree ,)
896913 )
@@ -934,6 +951,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
934951
935952 cluster_selection_epsilon: float, optional (default=0.0)
936953 A distance threshold. Clusters below this value will be merged.
954+ This is the minimum epsilon allowed.
937955 See [5]_ for more information.
938956
939957 algorithm : string, optional (default='best')
@@ -1010,6 +1028,16 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
10101028 performance cost, ensure that the clustering results match the
10111029 reference implementation.
10121030
1031+ cluster_selection_epsilon_max : float, optional (default=inf)
1032+ A distance threshold. Clusters above this value will be split.
1033+ Has no effect when using leaf clustering (where clusters are
1034+ usually small regardless) and can also be overridden in rare
1035+ cases by a high value for cluster_selection_epsilon. Note that
1036+ this should not be used if we want to predict the cluster labels
1037+ for new points in future (e.g. using approximate_predict), as
1038+ the approximate_predict function is not aware of this argument.
1039+ This is the maximum epsilon allowed.
1040+
10131041 **kwargs : optional
10141042 Arguments passed to the distance metric
10151043
@@ -1127,6 +1155,7 @@ def __init__(
11271155 prediction_data = False ,
11281156 branch_detection_data = False ,
11291157 match_reference_implementation = False ,
1158+ cluster_selection_epsilon_max = float ('inf' ),
11301159 ** kwargs
11311160 ):
11321161 self .min_cluster_size = min_cluster_size
@@ -1147,6 +1176,7 @@ def __init__(
11471176 self .match_reference_implementation = match_reference_implementation
11481177 self .prediction_data = prediction_data
11491178 self .branch_detection_data = branch_detection_data
1179+ self .cluster_selection_epsilon_max = cluster_selection_epsilon_max
11501180
11511181 self ._metric_kwargs = kwargs
11521182
@@ -1296,7 +1326,7 @@ def generate_prediction_data(self):
12961326 def generate_branch_detection_data (self ):
12971327 """
12981328 Create data that caches intermediate results used for detecting
1299- branches within clusters. This data is only useful if you are
1329+ branches within clusters. This data is only useful if you are
13001330 intending to use functions from ``hdbscan.branches``.
13011331 """
13021332 if self .metric in FAST_METRICS :
0 commit comments