Skip to content

Commit 5a4944a

Browse files
committed
Fix prediction data not honoring cluster_selection_epsilon
1 parent e55f957 commit 5a4944a

File tree

4 files changed: 13 additions and 6 deletions.

hdbscan/_hdbscan_tree.pyx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,9 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
705705
706706
stabilities : ndarray (n_clusters,)
707707
The cluster coherence strengths of each cluster.
708+
709+
selected_clusters : ndarray (n_clusters,)
710+
The ids of the selected clusters.
708711
"""
709712
cdef list node_list
710713
cdef np.ndarray cluster_tree
@@ -803,4 +806,4 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
803806
probs = get_probabilities(tree, reverse_cluster_map, labels)
804807
stabilities = get_stability_scores(labels, clusters, stability, max_lambda)
805808

806-
return (labels, probs, stabilities)
809+
return (labels, probs, stabilities, np.array(sorted(clusters)))

hdbscan/flat.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ def HDBSCAN_flat(X, n_clusters=None,
184184
new_clusterer.probabilities_,
185185
new_clusterer.cluster_persistence_,
186186
new_clusterer._condensed_tree,
187-
new_clusterer._single_linkage_tree) = output
187+
new_clusterer._single_linkage_tree,
188+
new_clusterer._selected_clusters) = output
188189

189190
# PredictionData attached to HDBSCAN should also change.
190191
# A function re_init is defined in this module to handle this.

hdbscan/hdbscan_.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def _tree_to_labels(
6262
"""
6363
condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
6464
stability_dict = compute_stability(condensed_tree)
65-
labels, probabilities, stabilities = get_clusters(
65+
labels, probabilities, stabilities, selected_clusters = get_clusters(
6666
condensed_tree,
6767
stability_dict,
6868
cluster_selection_method,
@@ -72,7 +72,8 @@ def _tree_to_labels(
7272
max_cluster_size,
7373
)
7474

75-
return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree)
75+
return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree,
76+
selected_clusters)
7677

7778

7879
def _hdbscan_generic(
@@ -1130,6 +1131,7 @@ def __init__(
11301131
self._outlier_scores = None
11311132
self._prediction_data = None
11321133
self._relative_validity = None
1134+
self._selected_clusters = None
11331135

11341136
def fit(self, X, y=None):
11351137
"""Perform HDBSCAN clustering from features or distance matrix.
@@ -1186,6 +1188,7 @@ def fit(self, X, y=None):
11861188
self.cluster_persistence_,
11871189
self._condensed_tree,
11881190
self._single_linkage_tree,
1191+
self._selected_clusters,
11891192
self._min_spanning_tree,
11901193
) = hdbscan(clean_data, **kwargs)
11911194

@@ -1248,6 +1251,7 @@ def generate_prediction_data(self):
12481251
self._prediction_data = PredictionData(
12491252
self._raw_data,
12501253
self.condensed_tree_,
1254+
self._selected_clusters,
12511255
min_samples,
12521256
tree_type=tree_type,
12531257
metric=self.metric,

hdbscan/prediction.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,15 +95,14 @@ def _recurse_leaf_dfs(self, current_node):
9595
return sum(
9696
[recurse_leaf_dfs(self.cluster_tree, child) for child in children], [])
9797

98-
def __init__(self, data, condensed_tree, min_samples,
98+
def __init__(self, data, condensed_tree, selected_clusters, min_samples,
9999
tree_type='kdtree', metric='euclidean', **kwargs):
100100
self.raw_data = data.astype(np.float64)
101101
self.tree = self._tree_type_map[tree_type](self.raw_data,
102102
metric=metric, **kwargs)
103103
self.core_distances = self.tree.query(data, k=min_samples)[0][:, -1]
104104
self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)
105105

106-
selected_clusters = sorted(condensed_tree._select_clusters())
107106
# raw_condensed_tree = condensed_tree.to_numpy()
108107
raw_condensed_tree = condensed_tree._raw_tree
109108

0 commit comments

Comments
 (0)