
Commit c58f92d

Merge branch 'main' into sample_weights
2 parents 5d48141 + 163e167

File tree

5 files changed: +260 -14 lines changed

README.rst

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 Fast Multicore HDBSCAN
 ======================
 
-Ahe ``fast_hdbscan`` library provides a simple implementation of the HDBSCAN clustering algorithm designed specifically
+The ``fast_hdbscan`` library provides a simple implementation of the HDBSCAN clustering algorithm designed specifically
 for high performance on multicore machines with low dimensional data (2D to about 20D). The algorithm runs in parallel and can make
 effective use of as many cores as you wish to throw at a problem. It is thus ideal for large SMP systems, and even
 modern multicore laptops.
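For orientation, a minimal usage sketch of the library (not part of this commit; it assumes the package's public HDBSCAN estimator with a scikit-learn-style fit_predict, as suggested by the hdbscan.py changes below):

    import numpy as np
    import fast_hdbscan

    # Low-dimensional data, where this implementation is designed to shine.
    data = np.random.random((10000, 4))

    clusterer = fast_hdbscan.HDBSCAN(min_cluster_size=10)
    labels = clusterer.fit_predict(data)  # -1 denotes noise points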

azure-pipelines.yml

Lines changed: 15 additions & 0 deletions
@@ -17,6 +17,12 @@ pr:
     - doc/*
     - README.rst
 
+parameters:
+- name: includeReleaseCandidates
+  displayName: "Allow pre-release dependencies"
+  type: boolean
+  default: false
+
 variables:
   triggeredByPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')]

@@ -66,8 +72,17 @@ stages:
 
   - script: |
       python -m pip install --upgrade pip
+    displayName: 'Upgrade pip'
+
+  - script: |
       pip install -r requirements.txt
     displayName: 'Install dependencies'
+    condition: ${{ eq(parameters.includeReleaseCandidates, false) }}
+
+  - script: |
+      pip install --pre -r requirements.txt
+    displayName: 'Install dependencies (allow pre-releases)'
+    condition: ${{ eq(parameters.includeReleaseCandidates, true) }}
 
   - script: |
       pip install -e .

fast_hdbscan/cluster_trees.py

Lines changed: 178 additions & 1 deletion
@@ -5,6 +5,11 @@
 
 from .disjoint_set import ds_rank_create, ds_find, ds_union_by_rank
 
+from numba.typed import Dict, List
+from numba.types import int64, ListType
+
+int64_list_type = ListType(int64)
+
 LinkageMergeData = namedtuple("LinkageMergeData", ["parent", "size", "next"])
 

@@ -171,7 +176,7 @@ def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
     lambdas = np.empty(root, dtype=np.float32)
     sizes = np.ones(root, dtype=np.int64)
 
-    ignore = np.zeros(root + 1, dtype=np.bool8)
+    ignore = np.zeros(root + 1, dtype=np.bool_)  # np.bool8 is no longer an attribute of numpy
 
     if sample_weights is None:
         sample_weights = np.ones(num_points, dtype=np.float32)

@@ -255,6 +260,178 @@ def extract_leaves(condensed_tree, allow_single_cluster=True):
     return np.nonzero(leaf_indicator)[0]
 
 
+
+# The *_bcubed functions below implement the (semi-supervised) HDBSCAN*(BC) algorithm presented
+# in Castro Gertrudes, J., Zimek, A., Sander, J. et al. A unified view of density-based methods
+# for semi-supervised clustering and classification. Data Min Knowl Disc 33, 1894–1952 (2019).
+
+@numba.njit()
+def cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices):
+    # This function returns a cluster_tree with virtual nodes (if applicable).
+
+    label_indices_list = list(label_indices.keys())
+    cluster_tree_parents = list(cluster_tree.parent)
+
+    # A labeled node that has no children and whose parent is not a leaf cluster must be
+    # a noisy node (virtual node).
+    mask1 = condensed_tree.child_size > 1
+    mask2 = condensed_tree.child_size == 1
+    mask3 = np.array([child in label_indices_list for child in condensed_tree.child])
+    mask4 = np.array([parent in cluster_tree_parents for parent in condensed_tree.parent])  # check that it's not a leaf cluster
+
+    mask = (mask1 | (mask2 & mask3 & mask4))
+
+    return CondensedTree(condensed_tree.parent[mask], condensed_tree.child[mask], condensed_tree.lambda_val[mask],
+                         condensed_tree.child_size[mask])
+
+
+@numba.njit()
+def get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree=None, cluster_tree_bcubed=None, allow_virtual_nodes=False):
+
+    cluster_elements = Dict.empty(
+        key_type=int64,
+        value_type=int64_list_type,
+    )
+
+    virtual_nodes = [0 for x in range(0)]
+
+    parents_set = set(list(condensed_tree.parent))
+    for i in range(len(condensed_tree.child) - 1, -1, -1):  # Traverse tree bottom up
+        parent = condensed_tree.parent[i]
+        child = condensed_tree.child[i]
+        if child in parents_set:
+            if parent in cluster_elements:
+                cluster_elements[parent].extend(cluster_elements[child])
+            else:
+                cluster_elements[parent] = List(cluster_elements[child])
+        elif parent in cluster_elements:
+            cluster_elements[parent].append(child)
+        else:
+            cluster_elements[parent] = List.empty_list(int64)
+            cluster_elements[parent].append(child)
+
+    if allow_virtual_nodes and (cluster_tree is not None) and (cluster_tree_bcubed is not None):
+        for i in list(set(cluster_tree_bcubed.child).difference(set(cluster_tree.child))):
+            virtual_nodes.append(i)
+        for node in virtual_nodes:
+            cluster_elements[node] = List.empty_list(int64)
+            cluster_elements[node].append(node)
+
+    return cluster_elements, np.array(virtual_nodes)
+
+
+@numba.njit()
+def eom_recursion_bcubed(node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters):
+    current_score_stability_bcubed = np.array([stability_node_scores[node], bcubed_node_scores[node]], dtype=np.float32)
+
+    children = cluster_tree.child[cluster_tree.parent == node]
+    child_score_total_stability_bcubed = np.array([0.0, 0.0], dtype=np.float32)
+
+    for child_node in children:
+        child_score_total_stability_bcubed += eom_recursion_bcubed(child_node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters)
+
+    if child_score_total_stability_bcubed[1] > current_score_stability_bcubed[1]:
+        return child_score_total_stability_bcubed
+    elif child_score_total_stability_bcubed[1] < current_score_stability_bcubed[1]:
+        selected_clusters[node] = True
+        unselect_below_node(node, cluster_tree, selected_clusters)
+        return current_score_stability_bcubed
+    # BCubed scores are tied; stability scores are used to resolve the tie.
+    else:
+        if child_score_total_stability_bcubed[0] > current_score_stability_bcubed[0]:
+            return child_score_total_stability_bcubed
+        else:
+            selected_clusters[node] = True
+            unselect_below_node(node, cluster_tree, selected_clusters)
+            return current_score_stability_bcubed
+
+
+@numba.njit()
+def score_condensed_tree_nodes_bcubed(cluster_elements, label_indices):
+
+    label_values = label_indices.values()
+    label_counts = {0: 0 for i in range(0)}
+
+    for label in label_values:
+        if label in label_counts:
+            label_counts[label] += 1
+        else:
+            label_counts[label] = 1
+
+    label_counts_values = list(label_counts.values())
+    total_num_of_labeled_points = sum(label_counts_values)
+    bcubed = {0: 0.0 for i in range(0)}
+
+    for cluster, elements in cluster_elements.items():
+
+        cluster_labeled_points_dict = {0: 0 for i in range(0)}
+
+        cluster_labeled_points = list(set(elements) & set(label_indices.keys()))
+        bcubed[cluster] = 0.0
+
+        if len(cluster_labeled_points) > 0:
+
+            for p in cluster_labeled_points:
+                p_label = label_indices[p]
+                if p_label in cluster_labeled_points_dict:
+                    cluster_labeled_points_dict[p_label] += 1
+                else:
+                    cluster_labeled_points_dict[p_label] = 1
+
+            for label, num_points in cluster_labeled_points_dict.items():
+
+                total_num_of_class_label = label_counts[label]
+                num_labeled_in_node = len(cluster_labeled_points)
+
+                precision_point = (num_points / num_labeled_in_node) / total_num_of_labeled_points
+                recall_point = (num_points / total_num_of_class_label) / total_num_of_labeled_points
+
+                # BCubed F-measure
+                bcubed[cluster] += num_points * (2.0 / (1.0 / precision_point + 1.0 / recall_point))
+    return bcubed
+
+
+@numba.njit()
+def extract_clusters_bcubed(condensed_tree, cluster_tree, label_indices, allow_virtual_nodes=False, allow_single_cluster=False):
+
+    if allow_virtual_nodes:
+        cluster_tree_bcubed = cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices)
+        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree, cluster_tree_bcubed, allow_virtual_nodes)
+        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
+        for node in virtual_nodes:
+            stability_node_scores[node] = 0.0
+        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
+    else:
+        cluster_tree_bcubed = cluster_tree
+        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree)
+        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
+        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
+
+    selected_clusters = {node: False for node in bcubed_node_scores}
+
+    if len(cluster_tree_bcubed.parent) == 0:
+        return np.zeros(0, dtype=np.int64)
+
+    cluster_tree_root = cluster_tree_bcubed.parent.min()
+
+    if allow_single_cluster:
+        eom_recursion_bcubed(cluster_tree_root, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
+    elif len(bcubed_node_scores) > 1:
+        root_children = cluster_tree_bcubed.child[cluster_tree_bcubed.parent == cluster_tree_root]
+        for child_node in root_children:
+            eom_recursion_bcubed(child_node, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
+
+    return np.asarray([node for node, selected in selected_clusters.items() if (selected and (node not in virtual_nodes))])
+
+
 @numba.njit()
 def score_condensed_tree_nodes(condensed_tree):
     result = {0: 0.0 for i in range(0)}
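To make the per-cluster BCubed score concrete, here is a small plain-Python sketch of the same arithmetic as score_condensed_tree_nodes_bcubed above (illustrative numbers, not part of the commit; variable names mirror the function):

    # Suppose the data set has 5 labeled points: 3 of class 0 and 2 of class 1,
    # and the cluster under consideration contains 2 of class 0 and 1 of class 1.
    label_counts = {0: 3, 1: 2}
    total_num_of_labeled_points = sum(label_counts.values())            # 5
    cluster_labeled_points_dict = {0: 2, 1: 1}
    num_labeled_in_node = sum(cluster_labeled_points_dict.values())     # 3

    score = 0.0
    for label, num_points in cluster_labeled_points_dict.items():
        precision_point = (num_points / num_labeled_in_node) / total_num_of_labeled_points
        recall_point = (num_points / label_counts[label]) / total_num_of_labeled_points
        # Harmonic mean of per-point precision and recall, weighted by num_points.
        score += num_points * (2.0 / (1.0 / precision_point + 1.0 / recall_point))

    print(score)  # ~0.347; the bcubed[cluster] value consumed by eom_recursion_bcubed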

fast_hdbscan/hdbscan.py

Lines changed: 64 additions & 10 deletions
@@ -1,7 +1,7 @@
 import numpy as np
 
 from sklearn.base import BaseEstimator, ClusterMixin
-from sklearn.utils import check_array
+from sklearn.utils import check_array, check_X_y
 from sklearn.utils.validation import check_is_fitted, _check_sample_weight
 from sklearn.neighbors import KDTree
 

@@ -20,6 +20,7 @@
     get_cluster_label_vector,
     get_point_membership_strength_vector,
     cluster_tree_from_condensed_tree,
+    extract_clusters_bcubed,
 )
 
 try:

@@ -29,6 +30,8 @@
 except ImportError:
     _HAVE_HDBSCAN = False
 
+from numba.typed import Dict
+
 
 def to_numpy_rec_array(named_tuple_tree):
     size = named_tuple_tree.parent.shape[0]

@@ -132,6 +135,9 @@ def remap_single_linkage_tree(tree, internal_to_raw, outliers):
 
 def fast_hdbscan(
     data,
+    data_labels=None,
+    semi_supervised=False,
+    ss_algorithm=None,
     min_samples=10,
     min_cluster_size=10,
     cluster_selection_method="eom",

@@ -142,6 +148,16 @@
 ):
     data = check_array(data)
 
+    if semi_supervised and data_labels is None:
+        raise ValueError("data_labels must not be None when semi_supervised is set to True!")
+
+    if semi_supervised:
+        label_indices = np.flatnonzero(data_labels > -1)
+        label_values = data_labels[label_indices]
+        data_labels_dict = Dict()
+        for index, label in zip(label_indices, label_values):
+            data_labels_dict[index] = label
+
     if (
         (not (np.issubdtype(type(min_samples), np.integer) or min_samples is None))
         or not np.issubdtype(type(min_cluster_size), np.integer)

@@ -171,9 +187,25 @@ def fast_hdbscan(
     cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
 
     if cluster_selection_method == "eom":
-        selected_clusters = extract_eom_clusters(
-            condensed_tree, cluster_tree, allow_single_cluster=allow_single_cluster
-        )
+        if semi_supervised:
+            if ss_algorithm == "bc":
+                selected_clusters = extract_clusters_bcubed(condensed_tree,
+                                                            cluster_tree,
+                                                            data_labels_dict,
+                                                            allow_virtual_nodes=True,
+                                                            allow_single_cluster=allow_single_cluster)
+            elif ss_algorithm == "bc_without_vn":
+                selected_clusters = extract_clusters_bcubed(condensed_tree,
+                                                            cluster_tree,
+                                                            data_labels_dict,
+                                                            allow_virtual_nodes=False,
+                                                            allow_single_cluster=allow_single_cluster)
+            else:
+                raise ValueError(f"Invalid ss_algorithm {ss_algorithm}")
+        else:
+            selected_clusters = extract_eom_clusters(condensed_tree,
+                                                     cluster_tree,
+                                                     allow_single_cluster=allow_single_cluster)
     elif cluster_selection_method == "leaf":
         selected_clusters = extract_leaves(
             condensed_tree, allow_single_cluster=allow_single_cluster

@@ -206,32 +238,54 @@ def __init__(
         cluster_selection_method="eom",
         allow_single_cluster=False,
         cluster_selection_epsilon=0.0,
+        semi_supervised=False,
+        ss_algorithm=None,
         **kwargs,
     ):
         self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
         self.cluster_selection_method = cluster_selection_method
         self.allow_single_cluster = allow_single_cluster
         self.cluster_selection_epsilon = cluster_selection_epsilon
+        self.semi_supervised = semi_supervised
+        self.ss_algorithm = ss_algorithm
 
-    def fit(self, X, y=None, sample_weight=None, **fit_params):
-        X = check_array(X, accept_sparse="csr", force_all_finite=False)
-        if sample_weight is not None:
-            sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
-        self._raw_data = X
+    def fit(self, X, y=None, sample_weight=None, **fit_params):
+
+        if self.semi_supervised:
+            X, y = check_X_y(X, y, accept_sparse="csr", force_all_finite=False)
+            if sample_weight is not None:
+                sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
+            self._raw_labels = y
+            # Replace non-finite labels with -1 labels
+            y[~np.isfinite(y)] = -1
+
+            if ~np.any(y != -1):
+                raise ValueError("y must contain at least one label > -1. Currently it only contains -1 and/or non-finite labels!")
+        else:
+            X = check_array(X, accept_sparse="csr", force_all_finite=False)
+            if sample_weight is not None:
+                sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
+        self._raw_data = X
 
         self._all_finite = np.all(np.isfinite(X))
         if ~self._all_finite:
             # Pass only the purely finite indices into hdbscan
             # We will later assign all non-finite points to the background -1 cluster
             finite_index = np.where(np.isfinite(X).sum(axis=1) == X.shape[1])[0]
             clean_data = X[finite_index]
+            clean_data_labels = y
+
+            if self.semi_supervised:
+                clean_data_labels = y[finite_index]
+
             internal_to_raw = {
                 x: y for x, y in zip(range(len(finite_index)), finite_index)
             }
             outliers = list(set(range(X.shape[0])) - set(finite_index))
         else:
             clean_data = X
+            clean_data_labels = y
 
         kwargs = self.get_params()
 

@@ -241,7 +295,7 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
             self._single_linkage_tree,
             self._condensed_tree,
             self._min_spanning_tree,
-        ) = fast_hdbscan(clean_data, return_trees=True, sample_weights=sample_weight, **kwargs)
+        ) = fast_hdbscan(clean_data, clean_data_labels, return_trees=True, sample_weights=sample_weight, **kwargs)
 
         self._condensed_tree = to_numpy_rec_array(self._condensed_tree)
 