Commit 163e167

Merge pull request TutteInstitute#21 from nsakr/semi_supervised_fast_hdbscan
Add HDBSCAN*(BC) implementation.
2 parents fb1f0e5 + e0ecc01 commit 163e167
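
HDBSCAN*(BC) is the semi-supervised variant of HDBSCAN* from Castro Gertrudes et al. (2019, cited in the diff below): when a subset of points carries class labels, clusters are selected by a B-cubed F-measure computed over those labels, with the usual stability score kept as a tie-breaker, and optional "virtual nodes" that keep labeled noise points in play. A minimal usage sketch, not part of this commit, assuming the package's sklearn-style HDBSCAN estimator (whose __init__ and fit are the ones patched in hdbscan.py below) and the -1-means-unlabeled convention the diff expects:

```python
import numpy as np
from sklearn.datasets import make_blobs
from fast_hdbscan import HDBSCAN  # assumed export of the estimator patched below

X, y_true = make_blobs(n_samples=500, centers=4, random_state=0)

# Partial supervision: -1 marks unlabeled points; keep true labels for 20 points.
y = np.full(X.shape[0], -1, dtype=np.int64)
rng = np.random.default_rng(0)
labeled = rng.choice(X.shape[0], size=20, replace=False)
y[labeled] = y_true[labeled]

model = HDBSCAN(min_cluster_size=10, semi_supervised=True, ss_algorithm="bc")
model.fit(X, y)  # ss_algorithm="bc_without_vn" disables virtual nodes
```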

File tree: 2 files changed, +237 −8 lines

fast_hdbscan/cluster_trees.py

Lines changed: 178 additions & 1 deletion
@@ -5,6 +5,11 @@
 
 from .disjoint_set import ds_rank_create, ds_find, ds_union_by_rank
 
+from numba.typed import Dict, List
+from numba.types import int64, ListType
+
+int64_list_type = ListType(int64)
+
 LinkageMergeData = namedtuple("LinkageMergeData", ["parent", "size", "next"])
 
 
@@ -131,7 +136,7 @@ def condense_tree(hierarchy, min_cluster_size=10):
     lambdas = np.empty(root, dtype=np.float32)
     sizes = np.ones(root, dtype=np.int64)
 
-    ignore = np.zeros(root + 1, dtype=np.bool)
+    ignore = np.zeros(root + 1, dtype=np.bool_)  # 'bool' is no longer an attribute of 'numpy'
 
     idx = 0
 
@@ -212,6 +217,178 @@ def extract_leaves(condensed_tree, allow_single_cluster=True):
     return np.nonzero(leaf_indicator)[0]
 
 
+
+# The *_bcubed functions below implement the (semi-supervised) HDBSCAN*(BC) algorithm presented
+# in Castro Gertrudes, J., Zimek, A., Sander, J. et al. A unified view of density-based methods
+# for semi-supervised clustering and classification. Data Min Knowl Disc 33, 1894–1952 (2019).
+
+@numba.njit()
+def cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices):
+    # This function returns a cluster_tree with virtual nodes (if applicable).
+
+    label_indices_list = list(label_indices.keys())
+    cluster_tree_parents = list(cluster_tree.parent)
+
+    # A labeled node that has no children and whose parent is not a leaf cluster must be
+    # a noisy node (virtual node).
+
+    mask1 = condensed_tree.child_size > 1
+    mask2 = condensed_tree.child_size == 1
+    mask3 = np.array([child in label_indices_list for child in condensed_tree.child])
+    mask4 = np.array([parent in cluster_tree_parents for parent in condensed_tree.parent])  # check that it's not a leaf cluster
+
+    mask = (mask1 | (mask2 & mask3 & mask4))
+
+    return CondensedTree(condensed_tree.parent[mask], condensed_tree.child[mask], condensed_tree.lambda_val[mask],
+                         condensed_tree.child_size[mask])
+
+
+@numba.njit()
+def get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree=None, cluster_tree_bcubed=None, allow_virtual_nodes=False):
+
+    cluster_elements = Dict.empty(
+        key_type=int64,
+        value_type=int64_list_type,
+    )
+
+    virtual_nodes = [0 for x in range(0)]  # empty list; the comprehension fixes the int type for numba
+
+    parents_set = set(list(condensed_tree.parent))
+    for i in range(len(condensed_tree.child) - 1, -1, -1):  # traverse the tree bottom up
+        parent = condensed_tree.parent[i]
+        child = condensed_tree.child[i]
+        if child in parents_set:
+            if parent in cluster_elements:
+                cluster_elements[parent].extend(cluster_elements[child])
+            else:
+                cluster_elements[parent] = List(cluster_elements[child])
+        elif parent in cluster_elements:
+            cluster_elements[parent].append(child)
+        else:
+            cluster_elements[parent] = List.empty_list(int64)
+            cluster_elements[parent].append(child)
+
+    if allow_virtual_nodes and (cluster_tree is not None) and (cluster_tree_bcubed is not None):
+        for i in list(set(cluster_tree_bcubed.child).difference(set(cluster_tree.child))):
+            virtual_nodes.append(i)
+        for node in virtual_nodes:
+            cluster_elements[node] = List.empty_list(int64)
+            cluster_elements[node].append(node)
+
+    return cluster_elements, np.array(virtual_nodes)
+
+
+@numba.njit()
+def eom_recursion_bcubed(node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters):
+    current_score_stability_bcubed = np.array([stability_node_scores[node], bcubed_node_scores[node]], dtype=np.float32)
+
+    children = cluster_tree.child[cluster_tree.parent == node]
+    child_score_total_stability_bcubed = np.array([0.0, 0.0], dtype=np.float32)
+
+    for child_node in children:
+        child_score_total_stability_bcubed += eom_recursion_bcubed(child_node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters)
+
+    if child_score_total_stability_bcubed[1] > current_score_stability_bcubed[1]:
+        return child_score_total_stability_bcubed
+
+    elif child_score_total_stability_bcubed[1] < current_score_stability_bcubed[1]:
+        selected_clusters[node] = True
+        unselect_below_node(node, cluster_tree, selected_clusters)
+        return current_score_stability_bcubed
+
+    # Stability scores are used to resolve ties.
+    elif child_score_total_stability_bcubed[1] == current_score_stability_bcubed[1]:
+        if child_score_total_stability_bcubed[0] > current_score_stability_bcubed[0]:
+            return child_score_total_stability_bcubed
+        else:
+            selected_clusters[node] = True
+            unselect_below_node(node, cluster_tree, selected_clusters)
+            return current_score_stability_bcubed
+
+
+@numba.njit()
+def score_condensed_tree_nodes_bcubed(cluster_elements, label_indices):
+
+    label_values = label_indices.values()
+    label_counts = {0: 0 for i in range(0)}  # empty int->int dict; the comprehension fixes the types for numba
+
+    for label in label_values:
+        if label in label_counts:
+            label_counts[label] += 1
+        else:
+            label_counts[label] = 1
+
+    label_counts_values = list(label_counts.values())
+    total_num_of_labeled_points = sum(label_counts_values)
+    bcubed = {0: 0.0 for i in range(0)}
+
+    for cluster, elements in cluster_elements.items():
+
+        cluster_labeled_points_dict = {0: 0 for i in range(0)}
+
+        cluster_labeled_points = list(set(elements) & set(label_indices.keys()))
+        bcubed[cluster] = 0.0
+
+        if len(cluster_labeled_points) > 0:
+
+            for p in cluster_labeled_points:
+                p_label = label_indices[p]
+                if p_label in cluster_labeled_points_dict:
+                    cluster_labeled_points_dict[p_label] += 1
+                else:
+                    cluster_labeled_points_dict[p_label] = 1
+
+            for label, num_points in cluster_labeled_points_dict.items():
+
+                total_num_of_class_label = label_counts[label]
+                num_labeled_in_node = len(cluster_labeled_points)
+
+                precision_point = (num_points / num_labeled_in_node) / total_num_of_labeled_points
+                recall_point = (num_points / total_num_of_class_label) / total_num_of_labeled_points
+
+                # B-cubed F-measure
+                bcubed[cluster] += num_points * (2.0 / (1.0 / precision_point + 1.0 / recall_point))
+    return bcubed
+
+
+@numba.njit()
+def extract_clusters_bcubed(condensed_tree, cluster_tree, label_indices, allow_virtual_nodes=False, allow_single_cluster=False):
+
+    if allow_virtual_nodes:
+        cluster_tree_bcubed = cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices)
+        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree, cluster_tree_bcubed, allow_virtual_nodes)
+        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
+        for node in virtual_nodes:
+            stability_node_scores[node] = 0.0
+        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
+    else:
+        cluster_tree_bcubed = cluster_tree
+        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree)
+        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
+        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
+
+    selected_clusters = {node: False for node in bcubed_node_scores}
+
+    if len(cluster_tree_bcubed.parent) == 0:
+        return np.zeros(0, dtype=np.int64)
+
+    cluster_tree_root = cluster_tree_bcubed.parent.min()
+
+    if allow_single_cluster:
+        eom_recursion_bcubed(cluster_tree_root, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
+    elif len(bcubed_node_scores) > 1:
+        root_children = cluster_tree_bcubed.child[cluster_tree_bcubed.parent == cluster_tree_root]
+        for child_node in root_children:
+            eom_recursion_bcubed(child_node, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
+
+    return np.asarray([node for node, selected in selected_clusters.items() if (selected and (node not in virtual_nodes))])
+
+
 @numba.njit()
 def score_condensed_tree_nodes(condensed_tree):
     result = {0: 0.0 for i in range(0)}
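
The selection score above comes from score_condensed_tree_nodes_bcubed: each class label present among a cluster's labeled points contributes a precision term (its share of the cluster's labeled points) and a recall term (its share of the class across all labeled points), each scaled by the total number of labeled points, combined as a harmonic mean and weighted by the label's point count. A plain-Python restatement of that arithmetic, with the numba typing stripped out, plus a toy check:

```python
from collections import Counter

def bcubed_score(cluster_labels, label_counts):
    # cluster_labels: labels of the labeled points that fell into one cluster.
    # label_counts: class -> count over ALL labeled points in the dataset.
    total_labeled = sum(label_counts.values())
    num_labeled_in_node = len(cluster_labels)
    score = 0.0
    for label, num_points in Counter(cluster_labels).items():
        precision_point = (num_points / num_labeled_in_node) / total_labeled
        recall_point = (num_points / label_counts[label]) / total_labeled
        # B-cubed F-measure: harmonic mean, weighted by the label's point count
        score += num_points * (2.0 / (1.0 / precision_point + 1.0 / recall_point))
    return score

# Five labeled points overall (three of class 0, two of class 1); one cluster
# captured two 0s and one 1.
print(bcubed_score([0, 0, 1], {0: 3, 1: 2}))  # ≈ 0.3467
```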

fast_hdbscan/hdbscan.py

Lines changed: 59 additions & 7 deletions
@@ -1,7 +1,7 @@
 import numpy as np
 
 from sklearn.base import BaseEstimator, ClusterMixin
-from sklearn.utils import check_array
+from sklearn.utils import check_array, check_X_y
 from sklearn.neighbors import KDTree
 
 from warnings import warn
@@ -18,6 +18,7 @@
     get_cluster_label_vector,
     get_point_membership_strength_vector,
     cluster_tree_from_condensed_tree,
+    extract_clusters_bcubed
 )
 
 try:
@@ -27,6 +28,8 @@
 except ImportError:
     _HAVE_HDBSCAN = False
 
+from numba.typed import Dict
+
 
 def to_numpy_rec_array(named_tuple_tree):
     size = named_tuple_tree.parent.shape[0]
@@ -130,6 +133,9 @@ def remap_single_linkage_tree(tree, internal_to_raw, outliers):
 
 def fast_hdbscan(
     data,
+    data_labels=None,
+    semi_supervised=False,
+    ss_algorithm=None,
     min_samples=10,
     min_cluster_size=10,
     cluster_selection_method="eom",
@@ -139,6 +145,16 @@ def fast_hdbscan(
 ):
     data = check_array(data)
 
+    if semi_supervised and data_labels is None:
+        raise ValueError("data_labels must not be None when semi_supervised is set to True!")
+
+    if semi_supervised:
+        label_indices = np.flatnonzero(data_labels > -1)
+        label_values = data_labels[label_indices]
+        data_labels_dict = Dict()
+        for index, label in zip(label_indices, label_values):
+            data_labels_dict[index] = label
+
     if (
         (not (np.issubdtype(type(min_samples), np.integer) or min_samples is None))
         or not np.issubdtype(type(min_cluster_size), np.integer)
@@ -165,9 +181,25 @@ def fast_hdbscan(
     cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
 
     if cluster_selection_method == "eom":
-        selected_clusters = extract_eom_clusters(
-            condensed_tree, cluster_tree, allow_single_cluster=allow_single_cluster
-        )
+        if semi_supervised:
+            if ss_algorithm == "bc":
+                selected_clusters = extract_clusters_bcubed(condensed_tree,
+                                                            cluster_tree,
+                                                            data_labels_dict,
+                                                            allow_virtual_nodes=True,
+                                                            allow_single_cluster=allow_single_cluster)
+            elif ss_algorithm == "bc_without_vn":
+                selected_clusters = extract_clusters_bcubed(condensed_tree,
+                                                            cluster_tree,
+                                                            data_labels_dict,
+                                                            allow_virtual_nodes=False,
+                                                            allow_single_cluster=allow_single_cluster)
+            else:
+                raise ValueError(f"Invalid ss_algorithm {ss_algorithm}")
+        else:
+            selected_clusters = extract_eom_clusters(condensed_tree,
+                                                     cluster_tree,
+                                                     allow_single_cluster=allow_single_cluster)
     elif cluster_selection_method == "leaf":
         selected_clusters = extract_leaves(
             condensed_tree, allow_single_cluster=allow_single_cluster
@@ -200,30 +232,50 @@ def __init__(
         cluster_selection_method="eom",
         allow_single_cluster=False,
         cluster_selection_epsilon=0.0,
+        semi_supervised=False,
+        ss_algorithm=None,
         **kwargs,
     ):
         self.min_cluster_size = min_cluster_size
         self.min_samples = min_samples
         self.cluster_selection_method = cluster_selection_method
         self.allow_single_cluster = allow_single_cluster
         self.cluster_selection_epsilon = cluster_selection_epsilon
+        self.semi_supervised = semi_supervised
+        self.ss_algorithm = ss_algorithm
 
     def fit(self, X, y=None, **fit_params):
-        X = check_array(X, accept_sparse="csr", force_all_finite=False)
-        self._raw_data = X
+
+        if self.semi_supervised:
+            X, y = check_X_y(X, y, accept_sparse="csr", force_all_finite=False)
+            self._raw_labels = y
+            # Replace non-finite labels with -1 labels
+            y[~np.isfinite(y)] = -1
+
+            if ~np.any(y != -1):
+                raise ValueError("y must contain at least one label > -1. Currently it only contains -1 and/or non-finite labels!")
+        else:
+            X = check_array(X, accept_sparse="csr", force_all_finite=False)
+        self._raw_data = X
 
         self._all_finite = np.all(np.isfinite(X))
         if ~self._all_finite:
             # Pass only the purely finite indices into hdbscan
             # We will later assign all non-finite points to the background -1 cluster
            finite_index = np.where(np.isfinite(X).sum(axis=1) == X.shape[1])[0]
             clean_data = X[finite_index]
+            clean_data_labels = y
+
+            if self.semi_supervised:
+                clean_data_labels = y[finite_index]
+
             internal_to_raw = {
                 x: y for x, y in zip(range(len(finite_index)), finite_index)
             }
             outliers = list(set(range(X.shape[0])) - set(finite_index))
         else:
             clean_data = X
+            clean_data_labels = y
 
         kwargs = self.get_params()
 
@@ -233,7 +285,7 @@ def fit(self, X, y=None, **fit_params):
             self._single_linkage_tree,
             self._condensed_tree,
             self._min_spanning_tree,
-        ) = fast_hdbscan(clean_data, return_trees=True, **kwargs)
+        ) = fast_hdbscan(clean_data, clean_data_labels, return_trees=True, **kwargs)
 
         self._condensed_tree = to_numpy_rec_array(self._condensed_tree)
 
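For reference, the label preprocessing inside fast_hdbscan() above boils down to mapping labeled-point indices to their labels in a numba typed Dict, so the njit-compiled *_bcubed functions can consume them. A standalone sketch of that step with concrete values (the label array here is made up for illustration):

```python
import numpy as np
from numba.typed import Dict

data_labels = np.array([-1, 0, -1, 2, 2, -1])

label_indices = np.flatnonzero(data_labels > -1)  # indices of labeled points: [1, 3, 4]
label_values = data_labels[label_indices]         # their labels: [0, 2, 2]

data_labels_dict = Dict()  # key/value types are inferred on first insertion
for index, label in zip(label_indices, label_values):
    data_labels_dict[index] = label               # {1: 0, 3: 2, 4: 2}
```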