5 | 5 |
6 | 6 | from .disjoint_set import ds_rank_create, ds_find, ds_union_by_rank
7 | 7 |
| 8 | +from numba.typed import Dict, List
| 9 | +from numba.types import int64, ListType
| 10 | +
| 11 | +int64_list_type = ListType(int64)
| 12 | +
8 | 13 | LinkageMergeData = namedtuple("LinkageMergeData", ["parent", "size", "next"])
9 | 14 |
10 | 15 |
@@ -171,7 +176,7 @@ def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
171 | 176 |     lambdas = np.empty(root, dtype=np.float32)
172 | 177 |     sizes = np.ones(root, dtype=np.int64)
173 | 178 |
174 | | -    ignore = np.zeros(root + 1, dtype=np.bool8)
| 179 | +    ignore = np.zeros(root + 1, dtype=np.bool_)  # np.bool8 was removed in recent NumPy releases
175 | 180 |
176 | 181 |     if sample_weights is None:
177 | 182 |         sample_weights = np.ones(num_points, dtype=np.float32)
@@ -255,6 +260,178 @@ def extract_leaves(condensed_tree, allow_single_cluster=True):
255 | 260 |     return np.nonzero(leaf_indicator)[0]
256 | 261 |
257 | 262 |
| 263 | +
| 264 | +# The *_bcubed functions below implement the (semi-supervised) HDBSCAN*(BC) algorithm presented
| 265 | +# in Castro Gertrudes, J., Zimek, A., Sander, J. et al. A unified view of density-based methods
| 266 | +# for semi-supervised clustering and classification. Data Min Knowl Disc 33, 1894–1952 (2019).
| 267 | +
| 268 | +@numba.njit()
| 269 | +def cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices):
| 270 | +    # This function returns a cluster_tree with virtual nodes (if applicable).
| 271 | +
| 272 | +    label_indices_list = list(label_indices.keys())
| 273 | +    cluster_tree_parents = list(cluster_tree.parent)
| 274 | +
| 275 | +    # If a labeled node has no children and its parent is not a leaf cluster, it must be
| 276 | +    # a noisy node, so it is kept as a virtual node.
| 277 | +
| 278 | +    mask1 = condensed_tree.child_size > 1
| 279 | +    mask2 = condensed_tree.child_size == 1
| 280 | +    mask3 = np.array([child in label_indices_list for child in condensed_tree.child])
| 281 | +    mask4 = np.array([parent in cluster_tree_parents for parent in condensed_tree.parent])  # check that the parent is not a leaf cluster
| 282 | +
| 283 | +    mask = (mask1 | (mask2 & mask3 & mask4))
| 284 | +
| 285 | +    return CondensedTree(condensed_tree.parent[mask], condensed_tree.child[mask], condensed_tree.lambda_val[mask],
| 286 | +                         condensed_tree.child_size[mask])
| 287 | +
| 288 | +
| 289 | +@numba.njit()
| 290 | +def get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree=None, cluster_tree_bcubed=None, allow_virtual_nodes=False):
| 291 | +
| 292 | +    cluster_elements = Dict.empty(
| 293 | +        key_type=int64,
| 294 | +        value_type=int64_list_type,
| 295 | +    )
| 296 | +
| 297 | +    virtual_nodes = [0 for x in range(0)]  # empty int64 list (the unused literal lets numba infer the element type)
| 298 | +
| 299 | +    parents_set = set(list(condensed_tree.parent))
| 300 | +    for i in range(len(condensed_tree.child) - 1, -1, -1):  # Traverse the tree bottom up
| 301 | +        parent = condensed_tree.parent[i]
| 302 | +        child = condensed_tree.child[i]
| 303 | +        if child in parents_set:
| 304 | +            if parent in cluster_elements:
| 305 | +                cluster_elements[parent].extend(cluster_elements[child])
| 306 | +            else:
| 307 | +                cluster_elements[parent] = List(cluster_elements[child])
| 308 | +        elif parent in cluster_elements:
| 309 | +            cluster_elements[parent].append(child)
| 310 | +        else:
| 311 | +            cluster_elements[parent] = List.empty_list(int64)
| 312 | +            cluster_elements[parent].append(child)
| 313 | +
| 314 | +    if allow_virtual_nodes and (cluster_tree is not None) and (cluster_tree_bcubed is not None):
| 315 | +        for i in list(set(cluster_tree_bcubed.child).difference(set(cluster_tree.child))):
| 316 | +            virtual_nodes.append(i)
| 317 | +        for node in virtual_nodes:
| 318 | +            cluster_elements[node] = List.empty_list(int64)
| 319 | +            cluster_elements[node].append(node)
| 320 | +
| 321 | +    return cluster_elements, np.array(virtual_nodes)
| 322 | +
| 323 | +
| 324 | +@numba.njit()
| 325 | +def eom_recursion_bcubed(node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters):
| 326 | +    current_score_stability_bcubed = np.array([stability_node_scores[node], bcubed_node_scores[node]], dtype=np.float32)
| 327 | +
| 328 | +    children = cluster_tree.child[cluster_tree.parent == node]
| 329 | +    child_score_total_stability_bcubed = np.array([0.0, 0.0], dtype=np.float32)
| 330 | +
| 331 | +    for child_node in children:
| 332 | +        child_score_total_stability_bcubed += eom_recursion_bcubed(child_node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters)
| 333 | +
| 334 | +    if child_score_total_stability_bcubed[1] > current_score_stability_bcubed[1]:
| 335 | +        return child_score_total_stability_bcubed
| 336 | +
| 337 | +    elif child_score_total_stability_bcubed[1] < current_score_stability_bcubed[1]:
| 338 | +        selected_clusters[node] = True
| 339 | +        unselect_below_node(node, cluster_tree, selected_clusters)
| 340 | +        return current_score_stability_bcubed
| 341 | +
| 342 | +    # Stability scores used to resolve ties.
| 343 | +    elif child_score_total_stability_bcubed[1] == current_score_stability_bcubed[1]:
| 344 | +
| 345 | +        if child_score_total_stability_bcubed[0] > current_score_stability_bcubed[0]:
| 346 | +            return child_score_total_stability_bcubed
| 347 | +
| 348 | +        else:
| 349 | +            selected_clusters[node] = True
| 350 | +            unselect_below_node(node, cluster_tree, selected_clusters)
| 351 | +            return current_score_stability_bcubed
| 352 | +
| 353 | +@numba.njit()
| 354 | +def score_condensed_tree_nodes_bcubed(cluster_elements, label_indices):
| 355 | +
| 356 | +    label_values = label_indices.values()
| 357 | +    label_counts = {0: 0 for i in range(0)}  # empty typed dict (the unused literal fixes key/value types for numba)
| 358 | +
| 359 | +    for label in label_values:
| 360 | +        if label in label_counts:
| 361 | +            label_counts[label] += 1
| 362 | +        else:
| 363 | +            label_counts[label] = 1
| 364 | +
| 365 | +    label_counts_values = list(label_counts.values())
| 366 | +    total_num_of_labeled_points = sum(label_counts_values)
| 367 | +    bcubed = {0: 0.0 for i in range(0)}
| 368 | +
| 369 | +    for cluster, elements in cluster_elements.items():
| 370 | +
| 371 | +        cluster_labeled_points_dict = {0: 0 for i in range(0)}
| 372 | +
| 373 | +        cluster_labeled_points = list(set(elements) & set(label_indices.keys()))
| 374 | +        bcubed[cluster] = 0.0
| 375 | +
| 376 | +        if len(cluster_labeled_points) > 0:
| 377 | +
| 378 | +            for p in cluster_labeled_points:
| 379 | +                p_label = label_indices[p]
| 380 | +                if p_label in cluster_labeled_points_dict:
| 381 | +                    cluster_labeled_points_dict[p_label] += 1
| 382 | +                else:
| 383 | +                    cluster_labeled_points_dict[p_label] = 1
| 384 | +
| 385 | +            for label, num_points in cluster_labeled_points_dict.items():
| 386 | +
| 387 | +                total_num_of_class_label = label_counts[label]
| 388 | +                num_labeled_in_node = len(cluster_labeled_points)
| 389 | +
| 390 | +                precision_point = (num_points/num_labeled_in_node)/total_num_of_labeled_points
| 391 | +                recall_point = (num_points/total_num_of_class_label)/total_num_of_labeled_points
| 392 | +
| 393 | +                # Bcubed F-measure contribution of the num_points points carrying this label
| 394 | +                bcubed[cluster] += num_points*(2.0/(1.0/precision_point + 1.0/recall_point))
| 395 | +    return bcubed
| 396 | +
| 397 | +
| 398 | +@numba.njit()
| 399 | +def extract_clusters_bcubed(condensed_tree, cluster_tree, label_indices, allow_virtual_nodes=False, allow_single_cluster=False):
| 400 | +
| 401 | +    if allow_virtual_nodes:
| 402 | +
| 403 | +        cluster_tree_bcubed = cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices)
| 404 | +        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree, cluster_tree_bcubed, allow_virtual_nodes)
| 405 | +        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
| 406 | +        for node in virtual_nodes:
| 407 | +            stability_node_scores[node] = 0.0
| 408 | +        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
| 409 | +
| 410 | +    else:
| 411 | +
| 412 | +        cluster_tree_bcubed = cluster_tree
| 413 | +        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree)
| 414 | +        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
| 415 | +        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
| 416 | +
| 417 | +    selected_clusters = {node: False for node in bcubed_node_scores}
| 418 | +
| 419 | +    if len(cluster_tree_bcubed.parent) == 0:
| 420 | +        return np.zeros(0, dtype=np.int64)
| 421 | +
| 422 | +    cluster_tree_root = cluster_tree_bcubed.parent.min()
| 423 | +
| 424 | +    if allow_single_cluster:
| 425 | +        eom_recursion_bcubed(cluster_tree_root, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
| 426 | +    elif len(bcubed_node_scores) > 1:
| 427 | +        root_children = cluster_tree_bcubed.child[cluster_tree_bcubed.parent == cluster_tree_root]
| 428 | +        for child_node in root_children:
| 429 | +            eom_recursion_bcubed(child_node, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
| 430 | +
| 431 | +    return np.asarray([node for node, selected in selected_clusters.items() if (selected and (node not in virtual_nodes))])
| 432 | +
| 433 | +
| 434 | +
258 | 435 | @numba.njit()
259 | 436 | def score_condensed_tree_nodes(condensed_tree):
260 | 437 |     result = {0: 0.0 for i in range(0)}
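
For orientation, the following is a minimal usage sketch of the new entry point; it is not part of the patch. `label_indices` is a numba.typed.Dict mapping each labeled point index to an integer class label, which is what the *_bcubed functions above expect (unlabeled points are simply absent). The `hierarchy` input and the `cluster_tree_from_condensed_tree` helper are assumptions about the surrounding module; substitute whatever the existing pipeline actually provides.

from numba.typed import Dict
from numba.types import int64

# Labeled point index -> integer class label.
label_indices = Dict.empty(key_type=int64, value_type=int64)
label_indices[0] = 0    # point 0 carries class label 0
label_indices[57] = 1   # point 57 carries class label 1

condensed_tree = condense_tree(hierarchy, min_cluster_size=10)
cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)  # assumed existing helper

selected_nodes = extract_clusters_bcubed(
    condensed_tree,
    cluster_tree,
    label_indices,
    allow_virtual_nodes=True,
    allow_single_cluster=False,
)
# selected_nodes holds the condensed-tree node ids chosen by the B-cubed
# excess-of-mass search; virtual nodes are filtered out before returning.

Internally, score_condensed_tree_nodes_bcubed accumulates, for each candidate cluster, num_points times the harmonic mean of the per-point B-cubed precision and recall (each already divided by the total number of labeled points), and eom_recursion_bcubed then performs the excess-of-mass style selection on those scores, falling back to the stability scores only to break ties.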