 
 from .disjoint_set import ds_rank_create, ds_find, ds_union_by_rank
 
+from numba.typed import Dict, List
+from numba.types import int64, ListType
+
+int64_list_type = ListType(int64)
+
 LinkageMergeData = namedtuple("LinkageMergeData", ["parent", "size", "next"])
 
 
@@ -131,7 +136,7 @@ def condense_tree(hierarchy, min_cluster_size=10):
     lambdas = np.empty(root, dtype=np.float32)
     sizes = np.ones(root, dtype=np.int64)
 
-    ignore = np.zeros(root + 1, dtype=np.bool)
+    ignore = np.zeros(root + 1, dtype=np.bool_)  # 'bool' is no longer an attribute of 'numpy'
 
     idx = 0
 
@@ -212,6 +217,178 @@ def extract_leaves(condensed_tree, allow_single_cluster=True):
     return np.nonzero(leaf_indicator)[0]
 
 
+
+# The *_bcubed functions below implement the (semi-supervised) HDBSCAN*(BC) algorithm presented
+# in Castro Gertrudes, J., Zimek, A., Sander, J. et al. A unified view of density-based methods
+# for semi-supervised clustering and classification. Data Min Knowl Disc 33, 1894–1952 (2019).
+
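+# Note on inputs (an illustrative assumption, not spelled out in this commit): label_indices is
+# taken to be a numba.typed.Dict mapping the index of each labeled point to its integer class
+# label; unlabeled points are simply absent from the mapping. For example:
+#
+#     label_indices = Dict.empty(key_type=int64, value_type=int64)
+#     label_indices[17] = 0   # point 17 carries class label 0
+#     label_indices[42] = 1   # point 42 carries class label 1
+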
+@numba.njit()
+def cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices):
+    # This function returns a cluster_tree that includes virtual nodes (if applicable).
+
+    label_indices_list = list(label_indices.keys())
+    cluster_tree_parents = list(cluster_tree.parent)
+
+    # If a labeled node has no children and its parent is not a leaf cluster, then it must be
+    # a noisy node (virtual node).
+
+    mask1 = condensed_tree.child_size > 1
+    mask2 = condensed_tree.child_size == 1
+    mask3 = np.array([child in label_indices_list for child in condensed_tree.child])
+    mask4 = np.array([parent in cluster_tree_parents for parent in condensed_tree.parent])  # check that the parent is not a leaf cluster
+
+    mask = (mask1 | (mask2 & mask3 & mask4))
+
+    return CondensedTree(condensed_tree.parent[mask], condensed_tree.child[mask], condensed_tree.lambda_val[mask],
+                         condensed_tree.child_size[mask])
+
+
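+# Illustration of the mask above (an editorial example, not from the paper): a condensed-tree row
+# with child_size == 1 is normally just a point falling out of a cluster, but if that point is
+# labeled (mask3) and its parent still appears as a parent in the cluster tree, i.e. is not a leaf
+# cluster (mask4), the row is kept and the point becomes a "virtual node" that can compete for
+# selection on its own.
+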
+@numba.njit()
+def get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree=None, cluster_tree_bcubed=None, allow_virtual_nodes=False):
+
+    cluster_elements = Dict.empty(
+        key_type=int64,
+        value_type=int64_list_type,
+    )
+
+    virtual_nodes = [0 for x in range(0)]  # empty list written so numba can infer an int64 element type
+
+    parents_set = set(list(condensed_tree.parent))
+    for i in range(len(condensed_tree.child) - 1, -1, -1):  # Traverse tree bottom up
+        parent = condensed_tree.parent[i]
+        child = condensed_tree.child[i]
+        if child in parents_set:
+            if parent in cluster_elements:
+                cluster_elements[parent].extend(cluster_elements[child])
+            else:
+                cluster_elements[parent] = List(cluster_elements[child])
+        elif parent in cluster_elements:
+            cluster_elements[parent].append(child)
+        else:
+            cluster_elements[parent] = List.empty_list(int64)
+            cluster_elements[parent].append(child)
+
+    if allow_virtual_nodes and (cluster_tree is not None) and (cluster_tree_bcubed is not None):
+        for i in list(set(cluster_tree_bcubed.child).difference(set(cluster_tree.child))):
+            virtual_nodes.append(i)
+        for node in virtual_nodes:
+            cluster_elements[node] = List.empty_list(int64)
+            cluster_elements[node].append(node)
+
+    return cluster_elements, np.array(virtual_nodes)
+
+
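+# Selection rule implemented below (editorial summary): each node's B-cubed score is compared with
+# the summed B-cubed scores of its children; the larger side wins, and stability is consulted only
+# to break exact ties. For instance, a node scoring 0.30 whose two children score 0.12 and 0.15
+# (total 0.27) would be selected, and everything beneath it unselected.
+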
+@numba.njit()
+def eom_recursion_bcubed(node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters):
+    # Scores are carried as a pair [stability, bcubed]; selection is driven by the bcubed entry.
+    current_score_stability_bcubed = np.array([stability_node_scores[node], bcubed_node_scores[node]], dtype=np.float32)
+
+    children = cluster_tree.child[cluster_tree.parent == node]
+    child_score_total_stability_bcubed = np.array([0.0, 0.0], dtype=np.float32)
+
+    for child_node in children:
+        child_score_total_stability_bcubed += eom_recursion_bcubed(child_node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters)
+
+    if child_score_total_stability_bcubed[1] > current_score_stability_bcubed[1]:
+        return child_score_total_stability_bcubed
+
+    elif child_score_total_stability_bcubed[1] < current_score_stability_bcubed[1]:
+        selected_clusters[node] = True
+        unselect_below_node(node, cluster_tree, selected_clusters)
+        return current_score_stability_bcubed
+
+    # Stability scores are used to resolve ties.
+    elif child_score_total_stability_bcubed[1] == current_score_stability_bcubed[1]:
+
+        if child_score_total_stability_bcubed[0] > current_score_stability_bcubed[0]:
+            return child_score_total_stability_bcubed
+        else:
+            selected_clusters[node] = True
+            unselect_below_node(node, cluster_tree, selected_clusters)
+            return current_score_stability_bcubed
+
+
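+# Editorial note on the score computed below: for each class label l among the labeled points of a
+# cluster C, let n_l be the number of labeled points of class l in C. The code accumulates
+#
+#     precision_l = n_l / (number of labeled points in C)
+#     recall_l    = n_l / (number of labeled points of class l overall)
+#     contribution of l to bcubed[C] = n_l * F_l / N,  with F_l = 2 * precision_l * recall_l / (precision_l + recall_l)
+#
+# where N is the total number of labeled points. E.g. with 4 labeled points in C, 3 of class A,
+# 6 class-A points overall and N = 10: precision = 3/4, recall = 1/2, F = 0.6, so class A adds
+# 3 * 0.6 / 10 = 0.18 to bcubed[C].
+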
+@numba.njit()
+def score_condensed_tree_nodes_bcubed(cluster_elements, label_indices):
+
+    label_values = label_indices.values()
+    label_counts = {0: 0 for i in range(0)}  # empty dict written so numba can infer int64 keys and values
+
+    for label in label_values:
+        if label in label_counts:
+            label_counts[label] += 1
+        else:
+            label_counts[label] = 1
+
+    label_counts_values = list(label_counts.values())
+    total_num_of_labeled_points = sum(label_counts_values)
+    bcubed = {0: 0.0 for i in range(0)}
+
+    for cluster, elements in cluster_elements.items():
+
+        cluster_labeled_points_dict = {0: 0 for i in range(0)}
+
+        cluster_labeled_points = list(set(elements) & set(label_indices.keys()))
+        bcubed[cluster] = 0.0
+
+        if len(cluster_labeled_points) > 0:
+
+            for p in cluster_labeled_points:
+                p_label = label_indices[p]
+                if p_label in cluster_labeled_points_dict:
+                    cluster_labeled_points_dict[p_label] += 1
+                else:
+                    cluster_labeled_points_dict[p_label] = 1
+
+            for label, num_points in cluster_labeled_points_dict.items():
+
+                total_num_of_class_label = label_counts[label]
+                num_labeled_in_node = len(cluster_labeled_points)
+
+                precision_point = (num_points / num_labeled_in_node) / total_num_of_labeled_points
+                recall_point = (num_points / total_num_of_class_label) / total_num_of_labeled_points
+
+                # Bcubed F-measure
+                bcubed[cluster] += num_points * (2.0 / (1.0 / precision_point + 1.0 / recall_point))
+    return bcubed
+
+
+@numba.njit()
+def extract_clusters_bcubed(condensed_tree, cluster_tree, label_indices, allow_virtual_nodes=False, allow_single_cluster=False):
+
+    if allow_virtual_nodes:
+
+        cluster_tree_bcubed = cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices)
+        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree, cluster_tree_bcubed, allow_virtual_nodes)
+        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
+        for node in virtual_nodes:
+            stability_node_scores[node] = 0.0
+        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
+
+    else:
+
+        cluster_tree_bcubed = cluster_tree
+        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree)
+        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
+        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
+
+    selected_clusters = {node: False for node in bcubed_node_scores}
+
+    if len(cluster_tree_bcubed.parent) == 0:
+        return np.zeros(0, dtype=np.int64)
+
+    cluster_tree_root = cluster_tree_bcubed.parent.min()
+
+    if allow_single_cluster:
+        eom_recursion_bcubed(cluster_tree_root, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
+    elif len(bcubed_node_scores) > 1:
+        root_children = cluster_tree_bcubed.child[cluster_tree_bcubed.parent == cluster_tree_root]
+        for child_node in root_children:
+            eom_recursion_bcubed(child_node, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
+
+    return np.asarray([node for node, selected in selected_clusters.items() if (selected and (node not in virtual_nodes))])
+
+
+
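+# Illustrative usage sketch (editorial, with assumed helper names): starting from a single-linkage
+# hierarchy, B-cubed guided clusters might be obtained roughly as follows, assuming this module's
+# condense_tree and a cluster_tree_from_condensed_tree helper are available:
+#
+#     condensed_tree = condense_tree(hierarchy, min_cluster_size=10)
+#     cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
+#     selected = extract_clusters_bcubed(condensed_tree, cluster_tree, label_indices,
+#                                        allow_virtual_nodes=True, allow_single_cluster=False)
+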
 @numba.njit()
 def score_condensed_tree_nodes(condensed_tree):
     result = {0: 0.0 for i in range(0)}