30
30
except ImportError :
31
31
_HAVE_HDBSCAN = False
32
32
33
- from numba .typed import Dict
34
33
35
34
36
35
def to_numpy_rec_array (named_tuple_tree ):
@@ -137,7 +136,7 @@ def fast_hdbscan(
137
136
data ,
138
137
data_labels = None ,
139
138
semi_supervised = False ,
140
- ss_algorithm = None ,
139
+ ss_algorithm = 'bc' ,
141
140
min_samples = 10 ,
142
141
min_cluster_size = 10 ,
143
142
cluster_selection_method = "eom" ,
@@ -149,17 +148,14 @@ def fast_hdbscan(
149
148
):
150
149
data = check_array (data )
151
150
152
- if semi_supervised and data_labels is None :
153
- raise ValueError (
154
- "data_labels must not be None when semi_supervised is set to True!"
155
- )
156
-
151
+ # Detect parameter inconsistencies early.
157
152
if semi_supervised :
158
- label_indices = np .flatnonzero (data_labels > - 1 )
159
- label_values = data_labels [label_indices ]
160
- data_labels_dict = Dict ()
161
- for index , label in zip (label_indices , label_values ):
162
- data_labels_dict [index ] = label
153
+ if data_labels is None :
154
+ raise ValueError (
155
+ "data_labels must not be None when semi_supervised is set to True!"
156
+ )
157
+ if ss_algorithm not in ["bc" , "bc_simple" ]:
158
+ raise ValueError (f"Invalid ss_algorithm { ss_algorithm } " )
163
159
164
160
if (
165
161
(not (np .issubdtype (type (min_samples ), np .integer ) or min_samples is None ))
@@ -184,38 +180,61 @@ def fast_hdbscan(
184
180
min_samples = min_cluster_size if min_samples is None else min_samples ,
185
181
sample_weights = sample_weights ,
186
182
)
183
+
184
+ return fast_hdbscan_mst_edges (
185
+ edges ,
186
+ data_labels = data_labels ,
187
+ semi_supervised = semi_supervised ,
188
+ ss_algorithm = ss_algorithm ,
189
+ min_cluster_size = min_cluster_size ,
190
+ cluster_selection_method = cluster_selection_method ,
191
+ max_cluster_size = max_cluster_size ,
192
+ allow_single_cluster = allow_single_cluster ,
193
+ cluster_selection_epsilon = cluster_selection_epsilon ,
194
+ sample_weights = sample_weights ,
195
+ )[: (None if return_trees else 2 )]
196
+
197
+
198
+ def fast_hdbscan_mst_edges (
199
+ edges ,
200
+ data_labels = None ,
201
+ semi_supervised = False ,
202
+ ss_algorithm = 'bc' ,
203
+ min_cluster_size = 10 ,
204
+ cluster_selection_method = "eom" ,
205
+ max_cluster_size = np .inf ,
206
+ allow_single_cluster = False ,
207
+ cluster_selection_epsilon = 0.0 ,
208
+ sample_weights = None ,
209
+ ):
187
210
sorted_mst = edges [np .argsort (edges .T [2 ])]
188
211
if sample_weights is None :
189
212
linkage_tree = mst_to_linkage_tree (sorted_mst )
190
213
else :
191
214
linkage_tree = mst_to_linkage_tree_w_sample_weights (sorted_mst , sample_weights )
192
- condensed_tree = condense_tree (linkage_tree , min_cluster_size = min_cluster_size , sample_weights = sample_weights )
215
+ condensed_tree = condense_tree (
216
+ linkage_tree , min_cluster_size = min_cluster_size , sample_weights = sample_weights
217
+ )
193
218
if cluster_selection_epsilon > 0.0 or cluster_selection_method == "eom" :
194
219
cluster_tree = cluster_tree_from_condensed_tree (condensed_tree )
195
220
196
221
if cluster_selection_method == "eom" :
197
222
if semi_supervised :
198
- if ss_algorithm == "bc" :
199
- selected_clusters = extract_clusters_bcubed (
200
- condensed_tree ,
201
- cluster_tree ,
202
- data_labels_dict ,
203
- allow_virtual_nodes = True ,
204
- allow_single_cluster = allow_single_cluster ,
205
- )
206
- elif ss_algorithm == "bc_simple" :
207
- selected_clusters = extract_clusters_bcubed (
208
- condensed_tree ,
209
- cluster_tree ,
210
- data_labels_dict ,
211
- allow_virtual_nodes = False ,
212
- allow_single_cluster = allow_single_cluster ,
213
- )
214
- else :
215
- raise ValueError (f"Invalid ss_algorithm { ss_algorithm } " )
223
+ # Silently ignores max_cluster_size!
224
+ # Assumes ss_algorithm is either 'bc' or 'bc_simple'
225
+ selected_clusters = extract_clusters_bcubed (
226
+ condensed_tree ,
227
+ cluster_tree ,
228
+ data_labels ,
229
+ allow_virtual_nodes = True if ss_algorithm == "bc" else False ,
230
+ allow_single_cluster = allow_single_cluster ,
231
+ )
216
232
else :
217
233
selected_clusters = extract_eom_clusters (
218
- condensed_tree , cluster_tree , max_cluster_size = max_cluster_size , allow_single_cluster = allow_single_cluster
234
+ condensed_tree ,
235
+ cluster_tree ,
236
+ max_cluster_size = max_cluster_size ,
237
+ allow_single_cluster = allow_single_cluster ,
219
238
)
220
239
elif cluster_selection_method == "leaf" :
221
240
selected_clusters = extract_leaves (
@@ -235,15 +254,13 @@ def fast_hdbscan(
235
254
condensed_tree ,
236
255
selected_clusters ,
237
256
cluster_selection_epsilon ,
238
- n_samples = data .shape [0 ],
257
+ n_samples = edges .shape [0 ] + 1 ,
239
258
)
240
259
membership_strengths = get_point_membership_strength_vector (
241
260
condensed_tree , selected_clusters , clusters
242
261
)
243
262
244
- if return_trees :
245
- return clusters , membership_strengths , linkage_tree , condensed_tree , sorted_mst
246
- return clusters , membership_strengths
263
+ return clusters , membership_strengths , linkage_tree , condensed_tree , sorted_mst
247
264
248
265
249
266
class HDBSCAN (BaseEstimator , ClusterMixin ):
@@ -257,7 +274,7 @@ def __init__(
257
274
max_cluster_size = np .inf ,
258
275
cluster_selection_epsilon = 0.0 ,
259
276
semi_supervised = False ,
260
- ss_algorithm = None ,
277
+ ss_algorithm = 'bc' ,
261
278
** kwargs ,
262
279
):
263
280
self .min_cluster_size = min_cluster_size
0 commit comments