20
20
get_cluster_label_vector ,
21
21
get_point_membership_strength_vector ,
22
22
cluster_tree_from_condensed_tree ,
23
- extract_clusters_bcubed
23
+ extract_clusters_bcubed ,
24
24
)
25
25
26
26
try :
@@ -41,7 +41,7 @@ def to_numpy_rec_array(named_tuple_tree):
41
41
("parent" , np .intp ),
42
42
("child" , np .intp ),
43
43
("lambda_val" , float ),
44
- ("child_size" , np .intp ),
44
+ ("child_size" , np .float32 ),
45
45
],
46
46
)
47
47
@@ -149,14 +149,16 @@ def fast_hdbscan(
149
149
data = check_array (data )
150
150
151
151
if semi_supervised and data_labels is None :
152
- raise ValueError ("data_labels must not be None when semi_supervised is set to True!" )
152
+ raise ValueError (
153
+ "data_labels must not be None when semi_supervised is set to True!"
154
+ )
153
155
154
156
if semi_supervised :
155
157
label_indices = np .flatnonzero (data_labels > - 1 )
156
158
label_values = data_labels [label_indices ]
157
159
data_labels_dict = Dict ()
158
160
for index , label in zip (label_indices , label_values ):
159
- data_labels_dict [index ] = label
161
+ data_labels_dict [index ] = label
160
162
161
163
if (
162
164
(not (np .issubdtype (type (min_samples ), np .integer ) or min_samples is None ))
@@ -165,17 +167,21 @@ def fast_hdbscan(
165
167
or min_cluster_size <= 0
166
168
):
167
169
raise ValueError ("Min samples and min cluster size must be positive integers!" )
168
-
170
+
169
171
if (
170
172
not np .issubdtype (type (cluster_selection_epsilon ), np .floating )
171
173
or cluster_selection_epsilon < 0.0
172
174
):
173
- raise ValueError ('Cluster selection epsilon must be a positive floating point number!' )
175
+ raise ValueError (
176
+ "Cluster selection epsilon must be a positive floating point number!"
177
+ )
174
178
175
179
sklearn_tree = KDTree (data )
176
180
numba_tree = kdtree_to_numba (sklearn_tree )
177
181
edges = parallel_boruvka (
178
- numba_tree , min_samples = min_cluster_size if min_samples is None else min_samples , sample_weights = sample_weights
182
+ numba_tree ,
183
+ min_samples = min_cluster_size if min_samples is None else min_samples ,
184
+ sample_weights = sample_weights ,
179
185
)
180
186
sorted_mst = edges [np .argsort (edges .T [2 ])]
181
187
if sample_weights is None :
@@ -187,39 +193,49 @@ def fast_hdbscan(
187
193
cluster_tree = cluster_tree_from_condensed_tree (condensed_tree )
188
194
189
195
if cluster_selection_method == "eom" :
190
- if semi_supervised :
191
- if (ss_algorithm == "bc" ):
192
- selected_clusters = extract_clusters_bcubed (condensed_tree ,
193
- cluster_tree ,
194
- data_labels_dict ,
195
- allow_virtual_nodes = True ,
196
- allow_single_cluster = allow_single_cluster )
197
- elif (ss_algorithm == "bc_without_vn" ):
198
- selected_clusters = extract_clusters_bcubed (condensed_tree ,
199
- cluster_tree ,
200
- data_labels_dict ,
201
- allow_virtual_nodes = False ,
202
- allow_single_cluster = allow_single_cluster )
203
- else :
204
- raise ValueError (f"Invalid ss_algorithm { ss_algorithm } " )
205
- else :
206
- selected_clusters = extract_eom_clusters (condensed_tree ,
207
- cluster_tree ,
208
- allow_single_cluster = allow_single_cluster )
196
+ if semi_supervised :
197
+ if ss_algorithm == "bc" :
198
+ selected_clusters = extract_clusters_bcubed (
199
+ condensed_tree ,
200
+ cluster_tree ,
201
+ data_labels_dict ,
202
+ allow_virtual_nodes = True ,
203
+ allow_single_cluster = allow_single_cluster ,
204
+ )
205
+ elif ss_algorithm == "bc_without_vn" :
206
+ selected_clusters = extract_clusters_bcubed (
207
+ condensed_tree ,
208
+ cluster_tree ,
209
+ data_labels_dict ,
210
+ allow_virtual_nodes = False ,
211
+ allow_single_cluster = allow_single_cluster ,
212
+ )
213
+ else :
214
+ raise ValueError (f"Invalid ss_algorithm { ss_algorithm } " )
215
+ else :
216
+ selected_clusters = extract_eom_clusters (
217
+ condensed_tree , cluster_tree , allow_single_cluster = allow_single_cluster
218
+ )
209
219
elif cluster_selection_method == "leaf" :
210
220
selected_clusters = extract_leaves (
211
221
condensed_tree , allow_single_cluster = allow_single_cluster
212
222
)
213
223
else :
214
224
raise ValueError (f"Invalid cluster_selection_method { cluster_selection_method } " )
215
-
225
+
216
226
if len (selected_clusters ) > 1 and cluster_selection_epsilon > 0.0 :
217
227
selected_clusters = cluster_epsilon_search (
218
- selected_clusters , cluster_tree ,
228
+ selected_clusters ,
229
+ cluster_tree ,
219
230
min_persistence = cluster_selection_epsilon ,
220
231
)
221
232
222
- clusters = get_cluster_label_vector (condensed_tree , selected_clusters , cluster_selection_epsilon )
233
+ clusters = get_cluster_label_vector (
234
+ condensed_tree ,
235
+ selected_clusters ,
236
+ cluster_selection_epsilon ,
237
+ n_samples = data .shape [0 ],
238
+ )
223
239
membership_strengths = get_point_membership_strength_vector (
224
240
condensed_tree , selected_clusters , clusters
225
241
)
@@ -252,16 +268,18 @@ def __init__(
252
268
253
269
def fit (self , X , y = None , sample_weight = None , ** fit_params ):
254
270
255
- if ( self .semi_supervised ) :
271
+ if self .semi_supervised :
256
272
X , y = check_X_y (X , y , accept_sparse = "csr" , force_all_finite = False )
257
273
if sample_weight is not None :
258
274
sample_weight = _check_sample_weight (sample_weight , X , dtype = np .float32 )
259
275
self ._raw_labels = y
260
276
# Replace non-finite labels with -1 labels
261
277
y [~ np .isfinite (y )] = - 1
262
278
263
- if ~ np .any (y != - 1 ):
264
- raise ValueError ("y must contain at least one label > -1. Currently it only contains -1 and/or non-finite labels!" )
279
+ if ~ np .any (y != - 1 ):
280
+ raise ValueError (
281
+ "y must contain at least one label > -1. Currently it only contains -1 and/or non-finite labels!"
282
+ )
265
283
else :
266
284
X = check_array (X , accept_sparse = "csr" , force_all_finite = False )
267
285
if sample_weight is not None :
@@ -275,7 +293,7 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
275
293
finite_index = np .where (np .isfinite (X ).sum (axis = 1 ) == X .shape [1 ])[0 ]
276
294
clean_data = X [finite_index ]
277
295
clean_data_labels = y
278
-
296
+
279
297
if self .semi_supervised :
280
298
clean_data_labels = y [finite_index ]
281
299
@@ -295,7 +313,13 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
295
313
self ._single_linkage_tree ,
296
314
self ._condensed_tree ,
297
315
self ._min_spanning_tree ,
298
- ) = fast_hdbscan (clean_data , clean_data_labels , return_trees = True , sample_weights = sample_weight , ** kwargs )
316
+ ) = fast_hdbscan (
317
+ clean_data ,
318
+ clean_data_labels ,
319
+ return_trees = True ,
320
+ sample_weights = sample_weight ,
321
+ ** kwargs ,
322
+ )
299
323
300
324
self ._condensed_tree = to_numpy_rec_array (self ._condensed_tree )
301
325
@@ -318,7 +342,11 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
318
342
return self
319
343
320
344
def dbscan_clustering (self , epsilon ):
321
- check_is_fitted (self , "_single_linkage_tree" , msg = "You first need to fit the HDBSCAN model before picking a DBSCAN clustering" )
345
+ check_is_fitted (
346
+ self ,
347
+ "_single_linkage_tree" ,
348
+ msg = "You first need to fit the HDBSCAN model before picking a DBSCAN clustering" ,
349
+ )
322
350
return get_cluster_labelling_at_cut (
323
351
self ._single_linkage_tree ,
324
352
epsilon ,
@@ -327,7 +355,11 @@ def dbscan_clustering(self, epsilon):
327
355
328
356
@property
329
357
def condensed_tree_ (self ):
330
- check_is_fitted (self , "_condensed_tree" , msg = "You first need to fit the HDBSCAN model before accessing the condensed tree" )
358
+ check_is_fitted (
359
+ self ,
360
+ "_condensed_tree" ,
361
+ msg = "You first need to fit the HDBSCAN model before accessing the condensed tree" ,
362
+ )
331
363
if self ._condensed_tree is not None :
332
364
return CondensedTree (
333
365
self ._condensed_tree ,
@@ -341,7 +373,11 @@ def condensed_tree_(self):
341
373
342
374
@property
343
375
def single_linkage_tree_ (self ):
344
- check_is_fitted (self , "_single_linkage_tree" , msg = "You first need to fit the HDBSCAN model before accessing the single linkage tree" )
376
+ check_is_fitted (
377
+ self ,
378
+ "_single_linkage_tree" ,
379
+ msg = "You first need to fit the HDBSCAN model before accessing the single linkage tree" ,
380
+ )
345
381
if self ._single_linkage_tree is not None :
346
382
return SingleLinkageTree (self ._single_linkage_tree )
347
383
else :
@@ -351,7 +387,11 @@ def single_linkage_tree_(self):
351
387
352
388
@property
353
389
def minimum_spanning_tree_ (self ):
354
- check_is_fitted (self , "_min_spanning_tree" , msg = "You first need to fit the HDBSCAN model before accessing the minimum spanning tree" )
390
+ check_is_fitted (
391
+ self ,
392
+ "_min_spanning_tree" ,
393
+ msg = "You first need to fit the HDBSCAN model before accessing the minimum spanning tree" ,
394
+ )
355
395
if self ._min_spanning_tree is not None :
356
396
if self ._raw_data is not None :
357
397
return MinimumSpanningTree (self ._min_spanning_tree , self ._raw_data )
0 commit comments