Skip to content

Commit 0b08696

Browse files
author
cmalzer
committed
Renamed the parameter epsilon to cluster_selection_epsilon and changed its default value from None to 0.0
1 parent e53b1da commit 0b08696

File tree

2 files changed

+34
-27
lines changed

2 files changed

+34
-27
lines changed

hdbscan/_hdbscan_tree.pyx

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -609,29 +609,29 @@ cpdef list get_cluster_tree_leaves(np.ndarray cluster_tree):
609609
root = cluster_tree['parent'].min()
610610
return recurse_leaf_dfs(cluster_tree, root)
611611

612-
cpdef np.intp_t traverse_upwards(np.ndarray cluster_tree, np.double_t cut_value, np.intp_t leaf):
612+
cpdef np.intp_t traverse_upwards(np.ndarray cluster_tree, np.double_t cluster_selection_epsilon, np.intp_t leaf):
613613

614614
root = cluster_tree['parent'].min()
615615
parent = cluster_tree[cluster_tree['child'] == leaf]['parent']
616616
if parent == root:
617617
return leaf #return node closest to root
618618

619619
parent_eps = 1/cluster_tree[cluster_tree['child'] == parent]['lambda_val']
620-
if parent_eps > cut_value:
620+
if parent_eps > cluster_selection_epsilon:
621621
return parent
622622
else:
623-
return traverse_upwards(cluster_tree, cut_value, parent)
623+
return traverse_upwards(cluster_tree, cluster_selection_epsilon, parent)
624624

625-
cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cut_value):
625+
cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cluster_selection_epsilon):
626626

627627
selected_clusters = list()
628628
processed = list()
629629

630630
for leaf in leaves:
631631
eps = 1/cluster_tree['lambda_val'][cluster_tree['child'] == leaf][0]
632-
if eps < cut_value:
632+
if eps < cluster_selection_epsilon:
633633
if leaf not in processed:
634-
epsilon_child = traverse_upwards(cluster_tree, cut_value, leaf)
634+
epsilon_child = traverse_upwards(cluster_tree, cluster_selection_epsilon, leaf)
635635
selected_clusters.append(epsilon_child)
636636

637637
for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
@@ -646,7 +646,7 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
646646
cluster_selection_method='eom',
647647
allow_single_cluster=False,
648648
match_reference_implementation=False,
649-
cut_value=None):
649+
cluster_selection_epsilon=0.0):
650650
"""Given a tree and stability dict, produce the cluster labels
651651
(and probabilities) for a flat clustering based on the chosen
652652
cluster selection method.
@@ -672,8 +672,8 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
672672
Whether to match the reference implementation in how to handle
673673
certain edge cases.
674674
675-
cut_value: float, optional (default None)
676-
A threshold for cluster splits
675+
cluster_selection_epsilon: float, optional (default 0.0)
676+
A distance threshold for cluster splits.
677677
678678
Returns
679679
-------
@@ -727,10 +727,10 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
727727
if sub_node != node:
728728
is_cluster[sub_node] = False
729729

730-
if cut_value is not None and cut_value != 0.0:
730+
if cluster_selection_epsilon != 0.0:
731731

732732
eom_clusters = set([c for c in is_cluster if is_cluster[c]])
733-
selected_clusters = epsilon_search(eom_clusters, cluster_tree, cut_value)
733+
selected_clusters = epsilon_search(eom_clusters, cluster_tree, cluster_selection_epsilon)
734734
for c in is_cluster:
735735
if c in selected_clusters:
736736
is_cluster[c] = True
@@ -745,8 +745,8 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
745745
is_cluster[c] = False
746746
is_cluster[tree['parent'].min()] = True
747747

748-
if cut_value is not None and cut_value != 0.0:
749-
selected_clusters = epsilon_search(leaves, cluster_tree, cut_value)
748+
if cluster_selection_epsilon != 0.0:
749+
selected_clusters = epsilon_search(leaves, cluster_tree, cluster_selection_epsilon)
750750
else:
751751
selected_clusters = leaves
752752

hdbscan/hdbscan_.py

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10,
4949
cluster_selection_method='eom',
5050
allow_single_cluster=False,
5151
match_reference_implementation=False,
52-
epsilon=None):
52+
cluster_selection_epsilon=0.0):
5353
"""Converts a pretrained tree and cluster size into a
5454
set of labels and probabilities.
5555
"""
@@ -61,7 +61,7 @@ def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10,
6161
cluster_selection_method,
6262
allow_single_cluster,
6363
match_reference_implementation,
64-
epsilon)
64+
cluster_selection_epsilon)
6565

6666
return (labels, probabilities, stabilities, condensed_tree,
6767
single_linkage_tree)
@@ -329,7 +329,7 @@ def check_precomputed_distance_matrix(X):
329329
check_array(tmp)
330330

331331

332-
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, epsilon=None,
332+
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, cluster_selection_epsilon=0.0,
333333
metric='minkowski', p=2, leaf_size=40,
334334
algorithm='best', memory=Memory(cachedir=None, verbose=0),
335335
approx_min_span_tree=True, gen_min_span_tree=False,
@@ -355,8 +355,9 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, epsilon=None,
355355
to be considered as a core point. This includes the point itself.
356356
defaults to the min_cluster_size.
357357
358-
epsilon: float, optional (default=None)
359-
A threshold for cluster splits.
358+
cluster_selection_epsilon: float, optional (default=0.0)
359+
A distance threshold. Clusters below this value will be merged.
360+
See [3]_ for more information.
360361
361362
alpha : float, optional (default=1.0)
362363
A distance scaling parameter as used in robust single linkage.
@@ -476,6 +477,8 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, epsilon=None,
476477
cluster tree. In Advances in Neural Information Processing Systems
477478
(pp. 343-351).
478479
480+
.. [3] Malzer, C., & Baum, M. (2019). HDBSCAN(ε̂): An Alternative Cluster
481+
Extraction Method for HDBSCAN. arxiv preprint 1911.02282.
479482
"""
480483
if min_samples is None:
481484
min_samples = min_cluster_size
@@ -490,13 +493,10 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, epsilon=None,
490493
if min_cluster_size == 1:
491494
raise ValueError('Min cluster size must be greater than one')
492495

493-
if epsilon is None:
494-
epsilon = 0.0
495-
496-
if type(epsilon) is int:
497-
epsilon = float(epsilon)
496+
if type(cluster_selection_epsilon) is int:
497+
cluster_selection_epsilon = float(cluster_selection_epsilon)
498498

499-
if type(epsilon) is not float or epsilon < 0.0:
499+
if type(cluster_selection_epsilon) is not float or cluster_selection_epsilon < 0.0:
500500
raise ValueError('Epsilon must be a float value greater than or equal to 0!')
501501

502502
if not isinstance(alpha, float) or alpha <= 0.0:
@@ -631,7 +631,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, epsilon=None,
631631
cluster_selection_method,
632632
allow_single_cluster,
633633
match_reference_implementation,
634-
epsilon) + \
634+
cluster_selection_epsilon) + \
635635
(result_min_span_tree,)
636636

637637

@@ -671,6 +671,10 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
671671
A distance scaling parameter as used in robust single linkage.
672672
See [3]_ for more information.
673673
674+
cluster_selection_epsilon: float, optional (default=0.0)
675+
A distance threshold. Clusters below this value will be merged.
676+
See [5]_ for more information.
677+
674678
algorithm : string, optional (default='best')
675679
Exactly which algorithm to use; hdbscan has variants specialised
676680
for different characteristics of the data. By default this is set
@@ -828,9 +832,12 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
828832
Sander, J., 2014. Density-Based Clustering Validation. In SDM
829833
(pp. 839-847).
830834
835+
.. [5] Malzer, C., & Baum, M. (2019). HDBSCAN(ε̂): An Alternative Cluster
836+
Extraction Method for HDBSCAN. arxiv preprint 1911.02282.
837+
831838
"""
832839

833-
def __init__(self, min_cluster_size=5, min_samples=None, epsilon=None,
840+
def __init__(self, min_cluster_size=5, min_samples=None, cluster_selection_epsilon=0.0,
834841
metric='euclidean', alpha=1.0, p=None,
835842
algorithm='best', leaf_size=40,
836843
memory=Memory(cachedir=None, verbose=0),
@@ -844,7 +851,7 @@ def __init__(self, min_cluster_size=5, min_samples=None, epsilon=None,
844851
self.min_cluster_size = min_cluster_size
845852
self.min_samples = min_samples
846853
self.alpha = alpha
847-
self.epsilon = epsilon
854+
self.cluster_selection_epsilon = cluster_selection_epsilon
848855
self.metric = metric
849856
self.p = p
850857
self.algorithm = algorithm

0 commit comments

Comments
 (0)