Skip to content

Commit e5d08fa

Browse files
committed
Fix child sizes for weighted samples; fix segfault
1 parent 5540a77 commit e5d08fa

File tree

2 files changed

+88
-45
lines changed

2 files changed

+88
-45
lines changed

fast_hdbscan/cluster_trees.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
174174
parents = np.ones(root, dtype=np.int64)
175175
children = np.empty(root, dtype=np.int64)
176176
lambdas = np.empty(root, dtype=np.float32)
177-
sizes = np.ones(root, dtype=np.int64)
177+
sizes = np.ones(root, dtype=np.float32)
178178

179179
ignore = np.zeros(root + 1, dtype=np.bool_) # 'bool' is no longer an attribute of 'numpy'
180180

@@ -196,8 +196,8 @@ def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
196196
else:
197197
lambda_value = np.inf
198198

199-
left_count = np.int64(hierarchy[left - num_points, 3]) if left >= num_points else sample_weights[left]
200-
right_count = np.int64(hierarchy[right - num_points, 3]) if right >= num_points else sample_weights[left]
199+
left_count = np.float32(hierarchy[left - num_points, 3]) if left >= num_points else sample_weights[left]
200+
right_count = np.float32(hierarchy[right - num_points, 3]) if right >= num_points else sample_weights[left]
201201

202202
# The logic here is in a strange order, but it has non-trivial performance gains ...
203203
# The most common case by far is a singleton on the left; and cluster on the right take care of this separately
@@ -434,7 +434,7 @@ def extract_clusters_bcubed(condensed_tree, cluster_tree, label_indices, allow_v
434434

435435
@numba.njit()
436436
def score_condensed_tree_nodes(condensed_tree):
437-
result = {0: 0.0 for i in range(0)}
437+
result = {0: np.float32(0.0) for i in range(0)}
438438

439439
for i in range(condensed_tree.parent.shape[0]):
440440
parent = condensed_tree.parent[i]
@@ -602,13 +602,16 @@ def get_cluster_labelling_at_cut(linkage_tree, cut, min_cluster_size):
602602
def get_cluster_label_vector(
603603
tree,
604604
clusters,
605-
cluster_selection_epsilon
605+
cluster_selection_epsilon,
606+
n_samples,
606607
):
608+
if len(tree.parent) == 0:
609+
return np.full(n_samples, -1, dtype=np.intp)
607610
root_cluster = tree.parent.min()
608-
result = np.empty(root_cluster, dtype=np.intp)
611+
result = np.full(n_samples, -1, dtype=np.intp)
609612
cluster_label_map = {c: n for n, c in enumerate(np.sort(clusters))}
610613

611-
disjoint_set = ds_rank_create(tree.parent.max() + 1)
614+
disjoint_set = ds_rank_create(max(tree.parent.max() + 1, tree.child.max() + 1))
612615
clusters = set(clusters)
613616

614617
for n in range(tree.parent.shape[0]):

fast_hdbscan/hdbscan.py

Lines changed: 78 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
get_cluster_label_vector,
2121
get_point_membership_strength_vector,
2222
cluster_tree_from_condensed_tree,
23-
extract_clusters_bcubed
23+
extract_clusters_bcubed,
2424
)
2525

2626
try:
@@ -41,7 +41,7 @@ def to_numpy_rec_array(named_tuple_tree):
4141
("parent", np.intp),
4242
("child", np.intp),
4343
("lambda_val", float),
44-
("child_size", np.intp),
44+
("child_size", np.float32),
4545
],
4646
)
4747

@@ -149,14 +149,16 @@ def fast_hdbscan(
149149
data = check_array(data)
150150

151151
if semi_supervised and data_labels is None:
152-
raise ValueError("data_labels must not be None when semi_supervised is set to True!")
152+
raise ValueError(
153+
"data_labels must not be None when semi_supervised is set to True!"
154+
)
153155

154156
if semi_supervised:
155157
label_indices = np.flatnonzero(data_labels > -1)
156158
label_values = data_labels[label_indices]
157159
data_labels_dict = Dict()
158160
for index, label in zip(label_indices, label_values):
159-
data_labels_dict[index] = label
161+
data_labels_dict[index] = label
160162

161163
if (
162164
(not (np.issubdtype(type(min_samples), np.integer) or min_samples is None))
@@ -165,17 +167,21 @@ def fast_hdbscan(
165167
or min_cluster_size <= 0
166168
):
167169
raise ValueError("Min samples and min cluster size must be positive integers!")
168-
170+
169171
if (
170172
not np.issubdtype(type(cluster_selection_epsilon), np.floating)
171173
or cluster_selection_epsilon < 0.0
172174
):
173-
raise ValueError('Cluster selection epsilon must be a positive floating point number!')
175+
raise ValueError(
176+
"Cluster selection epsilon must be a positive floating point number!"
177+
)
174178

175179
sklearn_tree = KDTree(data)
176180
numba_tree = kdtree_to_numba(sklearn_tree)
177181
edges = parallel_boruvka(
178-
numba_tree, min_samples=min_cluster_size if min_samples is None else min_samples, sample_weights=sample_weights
182+
numba_tree,
183+
min_samples=min_cluster_size if min_samples is None else min_samples,
184+
sample_weights=sample_weights,
179185
)
180186
sorted_mst = edges[np.argsort(edges.T[2])]
181187
if sample_weights is None:
@@ -187,39 +193,49 @@ def fast_hdbscan(
187193
cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
188194

189195
if cluster_selection_method == "eom":
190-
if semi_supervised:
191-
if(ss_algorithm=="bc"):
192-
selected_clusters = extract_clusters_bcubed(condensed_tree,
193-
cluster_tree,
194-
data_labels_dict,
195-
allow_virtual_nodes=True,
196-
allow_single_cluster=allow_single_cluster)
197-
elif(ss_algorithm=="bc_without_vn"):
198-
selected_clusters = extract_clusters_bcubed(condensed_tree,
199-
cluster_tree,
200-
data_labels_dict,
201-
allow_virtual_nodes=False,
202-
allow_single_cluster=allow_single_cluster)
203-
else:
204-
raise ValueError(f"Invalid ss_algorithm {ss_algorithm}")
205-
else:
206-
selected_clusters = extract_eom_clusters(condensed_tree,
207-
cluster_tree,
208-
allow_single_cluster=allow_single_cluster)
196+
if semi_supervised:
197+
if ss_algorithm == "bc":
198+
selected_clusters = extract_clusters_bcubed(
199+
condensed_tree,
200+
cluster_tree,
201+
data_labels_dict,
202+
allow_virtual_nodes=True,
203+
allow_single_cluster=allow_single_cluster,
204+
)
205+
elif ss_algorithm == "bc_without_vn":
206+
selected_clusters = extract_clusters_bcubed(
207+
condensed_tree,
208+
cluster_tree,
209+
data_labels_dict,
210+
allow_virtual_nodes=False,
211+
allow_single_cluster=allow_single_cluster,
212+
)
213+
else:
214+
raise ValueError(f"Invalid ss_algorithm {ss_algorithm}")
215+
else:
216+
selected_clusters = extract_eom_clusters(
217+
condensed_tree, cluster_tree, allow_single_cluster=allow_single_cluster
218+
)
209219
elif cluster_selection_method == "leaf":
210220
selected_clusters = extract_leaves(
211221
condensed_tree, allow_single_cluster=allow_single_cluster
212222
)
213223
else:
214224
raise ValueError(f"Invalid cluster_selection_method {cluster_selection_method}")
215-
225+
216226
if len(selected_clusters) > 1 and cluster_selection_epsilon > 0.0:
217227
selected_clusters = cluster_epsilon_search(
218-
selected_clusters, cluster_tree,
228+
selected_clusters,
229+
cluster_tree,
219230
min_persistence=cluster_selection_epsilon,
220231
)
221232

222-
clusters = get_cluster_label_vector(condensed_tree, selected_clusters, cluster_selection_epsilon)
233+
clusters = get_cluster_label_vector(
234+
condensed_tree,
235+
selected_clusters,
236+
cluster_selection_epsilon,
237+
n_samples=data.shape[0],
238+
)
223239
membership_strengths = get_point_membership_strength_vector(
224240
condensed_tree, selected_clusters, clusters
225241
)
@@ -252,16 +268,18 @@ def __init__(
252268

253269
def fit(self, X, y=None, sample_weight=None, **fit_params):
254270

255-
if (self.semi_supervised):
271+
if self.semi_supervised:
256272
X, y = check_X_y(X, y, accept_sparse="csr", force_all_finite=False)
257273
if sample_weight is not None:
258274
sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
259275
self._raw_labels = y
260276
# Replace non-finite labels with -1 labels
261277
y[~np.isfinite(y)] = -1
262278

263-
if ~np.any(y !=-1):
264-
raise ValueError("y must contain at least one label > -1. Currently it only contains -1 and/or non-finite labels!")
279+
if ~np.any(y != -1):
280+
raise ValueError(
281+
"y must contain at least one label > -1. Currently it only contains -1 and/or non-finite labels!"
282+
)
265283
else:
266284
X = check_array(X, accept_sparse="csr", force_all_finite=False)
267285
if sample_weight is not None:
@@ -275,7 +293,7 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
275293
finite_index = np.where(np.isfinite(X).sum(axis=1) == X.shape[1])[0]
276294
clean_data = X[finite_index]
277295
clean_data_labels = y
278-
296+
279297
if self.semi_supervised:
280298
clean_data_labels = y[finite_index]
281299

@@ -295,7 +313,13 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
295313
self._single_linkage_tree,
296314
self._condensed_tree,
297315
self._min_spanning_tree,
298-
) = fast_hdbscan(clean_data, clean_data_labels, return_trees=True, sample_weights=sample_weight, **kwargs)
316+
) = fast_hdbscan(
317+
clean_data,
318+
clean_data_labels,
319+
return_trees=True,
320+
sample_weights=sample_weight,
321+
**kwargs,
322+
)
299323

300324
self._condensed_tree = to_numpy_rec_array(self._condensed_tree)
301325

@@ -318,7 +342,11 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
318342
return self
319343

320344
def dbscan_clustering(self, epsilon):
321-
check_is_fitted(self, "_single_linkage_tree", msg="You first need to fit the HDBSCAN model before picking a DBSCAN clustering")
345+
check_is_fitted(
346+
self,
347+
"_single_linkage_tree",
348+
msg="You first need to fit the HDBSCAN model before picking a DBSCAN clustering",
349+
)
322350
return get_cluster_labelling_at_cut(
323351
self._single_linkage_tree,
324352
epsilon,
@@ -327,7 +355,11 @@ def dbscan_clustering(self, epsilon):
327355

328356
@property
329357
def condensed_tree_(self):
330-
check_is_fitted(self, "_condensed_tree", msg="You first need to fit the HDBSCAN model before accessing the condensed tree")
358+
check_is_fitted(
359+
self,
360+
"_condensed_tree",
361+
msg="You first need to fit the HDBSCAN model before accessing the condensed tree",
362+
)
331363
if self._condensed_tree is not None:
332364
return CondensedTree(
333365
self._condensed_tree,
@@ -341,7 +373,11 @@ def condensed_tree_(self):
341373

342374
@property
343375
def single_linkage_tree_(self):
344-
check_is_fitted(self, "_single_linkage_tree", msg="You first need to fit the HDBSCAN model before accessing the single linkage tree")
376+
check_is_fitted(
377+
self,
378+
"_single_linkage_tree",
379+
msg="You first need to fit the HDBSCAN model before accessing the single linkage tree",
380+
)
345381
if self._single_linkage_tree is not None:
346382
return SingleLinkageTree(self._single_linkage_tree)
347383
else:
@@ -351,7 +387,11 @@ def single_linkage_tree_(self):
351387

352388
@property
353389
def minimum_spanning_tree_(self):
354-
check_is_fitted(self, "_min_spanning_tree", msg="You first need to fit the HDBSCAN model before accessing the minimum spanning tree")
390+
check_is_fitted(
391+
self,
392+
"_min_spanning_tree",
393+
msg="You first need to fit the HDBSCAN model before accessing the minimum spanning tree",
394+
)
355395
if self._min_spanning_tree is not None:
356396
if self._raw_data is not None:
357397
return MinimumSpanningTree(self._min_spanning_tree, self._raw_data)

0 commit comments

Comments
 (0)