
Commit 543bc23

Initial pass at max cluster size
1 parent 3263d4f commit 543bc23

3 files changed: +28 −12 lines changed

fast_hdbscan/cluster_trees.py

Lines changed: 17 additions & 11 deletions
@@ -161,7 +161,7 @@ def eliminate_branch(branch_node, parent_node, lambda_value, parents, children,
 
 
 @numba.njit(fastmath=True)
-def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
+def condense_tree(hierarchy, min_cluster_size=10, max_cluster_size=np.inf, sample_weights=None):
     root = 2 * hierarchy.shape[0]
     num_points = hierarchy.shape[0] + 1
     next_label = num_points + 1
@@ -223,7 +223,10 @@ def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
                                    hierarchy, num_points)
             idx = eliminate_branch(right, parent_node, lambda_value, parents, children, lambdas, sizes, idx, ignore,
                                    hierarchy, num_points)
-        # and finally if we actually have a legitimate cluster split, handle that correctly
+        # If both clusters are too large then relabel both
+        elif left_count > max_cluster_size and right_count > max_cluster_size:
+            relabel[left] = parent_node
+            relabel[right] = parent_node
         else:
             relabel[left] = next_label
 
@@ -471,34 +474,37 @@ def cluster_tree_from_condensed_tree(condensed_tree):
                            condensed_tree.child_size[mask])
 
 
-@numba.njit()
+#@numba.njit()
 def unselect_below_node(node, cluster_tree, selected_clusters):
     for child in cluster_tree.child[cluster_tree.parent == node]:
         unselect_below_node(child, cluster_tree, selected_clusters)
         selected_clusters[child] = False
 
 
-@numba.njit(fastmath=True)
-def eom_recursion(node, cluster_tree, node_scores, selected_clusters):
+#@numba.njit(fastmath=True)
+def eom_recursion(node, cluster_tree, node_scores, node_sizes, selected_clusters, max_cluster_size):
     current_score = node_scores[node]
+    current_size = node_sizes[node]
 
     children = cluster_tree.child[cluster_tree.parent == node]
     child_score_total = 0.0
 
     for child_node in children:
-        child_score_total += eom_recursion(child_node, cluster_tree, node_scores, selected_clusters)
+        child_score_total += eom_recursion(child_node, cluster_tree, node_scores, node_sizes, selected_clusters, max_cluster_size)
 
-    if child_score_total > current_score:
+    if child_score_total > current_score or current_size > max_cluster_size:
         return child_score_total
     else:
         selected_clusters[node] = True
         unselect_below_node(node, cluster_tree, selected_clusters)
         return current_score
 
 
-@numba.njit()
-def extract_eom_clusters(condensed_tree, cluster_tree, allow_single_cluster=False):
+#@numba.njit()
+def extract_eom_clusters(condensed_tree, cluster_tree, max_cluster_size=np.inf, allow_single_cluster=False):
     node_scores = score_condensed_tree_nodes(condensed_tree)
+    node_sizes = {node: size for node, size in zip(cluster_tree.child, cluster_tree.child_size.astype(np.float32))}
+    node_sizes[cluster_tree.parent.min()] = np.float32(cluster_tree.parent.min() - 1)
     selected_clusters = {node: False for node in node_scores}
 
     if len(cluster_tree.parent) == 0:
@@ -507,11 +513,11 @@ def extract_eom_clusters(condensed_tree, cluster_tree, allow_single_cluster=Fals
     cluster_tree_root = cluster_tree.parent.min()
 
     if allow_single_cluster:
-        eom_recursion(cluster_tree_root, cluster_tree, node_scores, selected_clusters)
+        eom_recursion(cluster_tree_root, cluster_tree, node_scores, node_sizes, selected_clusters, max_cluster_size)
     elif len(node_scores) > 1:
         root_children = cluster_tree.child[cluster_tree.parent == cluster_tree_root]
         for child_node in root_children:
-            eom_recursion(child_node, cluster_tree, node_scores, selected_clusters)
+            eom_recursion(child_node, cluster_tree, node_scores, node_sizes, selected_clusters, max_cluster_size)
 
     return np.asarray([node for node, selected in selected_clusters.items() if selected])
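The core of the change is the extra condition in eom_recursion: a node is now passed over not only when its children's combined stability beats its own score, but also whenever its size exceeds max_cluster_size, which forces selection further down the tree. Below is a minimal standalone sketch of that selection rule; the dict-based tree and the names select_eom and _unselect_below are hypothetical illustrations, not the library's numba implementation.

```python
# Standalone toy sketch of excess-of-mass selection with a size cap. The real
# eom_recursion above works on numpy-backed cluster_tree records; the dict-based
# tree and helper names here are illustrative assumptions only.

def select_eom(node, children, scores, sizes, selected, max_cluster_size=float("inf")):
    # Total stability carried by this node's children.
    child_total = 0.0
    for child in children.get(node, ()):
        child_total += select_eom(child, children, scores, sizes, selected, max_cluster_size)

    # Pass over this node if its children are jointly more stable, or if the
    # node itself is larger than the allowed maximum cluster size.
    if child_total > scores[node] or sizes[node] > max_cluster_size:
        return child_total

    # Otherwise select this node and unselect everything below it.
    selected[node] = True
    _unselect_below(node, children, selected)
    return scores[node]


def _unselect_below(node, children, selected):
    for child in children.get(node, ()):
        selected[child] = False
        _unselect_below(child, children, selected)


if __name__ == "__main__":
    # Root 0 splits into 1 and 2; node 1 splits further into 3 and 4.
    children = {0: [1, 2], 1: [3, 4]}
    scores = {0: 5.0, 1: 2.0, 2: 1.0, 3: 0.5, 4: 0.5}
    sizes = {0: 100, 1: 60, 2: 40, 3: 30, 4: 30}

    uncapped = {n: False for n in scores}
    select_eom(0, children, scores, sizes, uncapped)
    print([n for n, s in uncapped.items() if s])  # -> [0]: the large root wins on stability

    capped = {n: False for n in scores}
    select_eom(0, children, scores, sizes, capped, max_cluster_size=50)
    print([n for n, s in capped.items() if s])  # -> [2, 3, 4]: nodes 0 and 1 exceed the cap
```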

fast_hdbscan/hdbscan.py

Lines changed: 4 additions & 1 deletion
@@ -141,6 +141,7 @@ def fast_hdbscan(
     min_samples=10,
     min_cluster_size=10,
     cluster_selection_method="eom",
+    max_cluster_size=np.inf,
     allow_single_cluster=False,
     cluster_selection_epsilon=0.0,
     sample_weights=None,
@@ -214,7 +215,7 @@ def fast_hdbscan(
                raise ValueError(f"Invalid ss_algorithm {ss_algorithm}")
        else:
            selected_clusters = extract_eom_clusters(
-                condensed_tree, cluster_tree, allow_single_cluster=allow_single_cluster
+                condensed_tree, cluster_tree, max_cluster_size=max_cluster_size, allow_single_cluster=allow_single_cluster
            )
    elif cluster_selection_method == "leaf":
        selected_clusters = extract_leaves(
@@ -253,6 +254,7 @@ def __init__(
        min_samples=None,
        cluster_selection_method="eom",
        allow_single_cluster=False,
+        max_cluster_size=np.inf,
        cluster_selection_epsilon=0.0,
        semi_supervised=False,
        ss_algorithm=None,
@@ -262,6 +264,7 @@ def __init__(
        self.min_samples = min_samples
        self.cluster_selection_method = cluster_selection_method
        self.allow_single_cluster = allow_single_cluster
+        self.max_cluster_size = max_cluster_size
        self.cluster_selection_epsilon = cluster_selection_epsilon
        self.semi_supervised = semi_supervised
        self.ss_algorithm = ss_algorithm
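At the estimator level the new keyword simply threads through fast_hdbscan() into extract_eom_clusters. A hedged usage sketch follows; the make_blobs data, min_cluster_size=5, and the cap of 30 are illustrative assumptions that mirror the new test below, not values taken from the commit.

```python
# Illustrative use of the new max_cluster_size parameter; the synthetic data and
# chosen cap are assumptions for this example.
import numpy as np
from sklearn.datasets import make_blobs
from fast_hdbscan import HDBSCAN

X, _ = make_blobs(n_samples=200, centers=2, cluster_std=0.5, random_state=0)

# Without a cap, the two blobs typically come back as two large clusters.
uncapped = HDBSCAN(min_cluster_size=5).fit(X)

# With a cap, oversized clusters are passed over during excess-of-mass selection,
# so their points end up in smaller subclusters (or as noise).
capped = HDBSCAN(min_cluster_size=5, max_cluster_size=30).fit(X)

for label in set(capped.labels_):
    if label != -1:
        assert np.sum(capped.labels_ == label) <= 30
```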

fast_hdbscan/tests/test_hdbscan.py

Lines changed: 7 additions & 0 deletions
@@ -167,6 +167,13 @@ def test_fhdbscan_allow_single_cluster_with_epsilon():
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 2
 
+def test_fhdbscan_max_cluster_size():
+    model = HDBSCAN(max_cluster_size=30).fit(X)
+    assert len(set(model.labels_)) >= 3
+    for label in set(model.labels_):
+        if label != -1:
+            assert np.sum(model.labels_ == label) <= 30
+
 
 # Disable for now -- need to refactor to meet newer standards
 @pytest.mark.skip(reason="need to refactor to meet newer standards")
