vda-lab
diff --git a/doc/detecting_branches.ipynb b/doc/detecting_branches.ipynb
Lines changed: 581 additions & 0 deletions
diff --git a/doc/index.rst b/doc/index.rst
Lines changed: 1 addition & 0 deletions
diff --git a/fast_hdbscan/branches.py b/fast_hdbscan/branches.py
Lines changed: 44 additions & 36 deletions
diff --git a/fast_hdbscan/cluster_trees.py b/fast_hdbscan/cluster_trees.py
Lines changed: 16 additions & 2 deletions
diff --git a/fast_hdbscan/core_graph.py b/fast_hdbscan/core_graph.py
Lines changed: 15 additions & 8 deletions
diff --git a/fast_hdbscan/hdbscan.py b/fast_hdbscan/hdbscan.py
Lines changed: 6 additions & 14 deletions
@@ -76,6 +76,7 @@ User Guide
    basic_usage
    benchmarks
    comparable_clusterings
+   detecting_branches
 
 
 ----------
 
@@ -28,41 +28,34 @@ def apply_branch_threshold(
             labels[pts] = running_id
             probabilities[pts] = cluster_probabilities[pts]
             running_id += 1
-            continue
         else:
-            branch_labels[pts] = np.where(
-                branch_labels[pts] < 0, num_branches, branch_labels[pts]
-            )
-            labels[pts] = branch_labels[pts] + running_id
+            labels[pts] = branch_labels[pts] + has_noise + running_id
             running_id += num_branches + has_noise
 
 
 def find_branch_sub_clusters(
     clusterer,
     cluster_labels=None,
     cluster_probabilities=None,
-    *,
-    min_branch_size=None,
-    max_branch_size=None,
-    allow_single_branch=None,
-    branch_selection_method=None,
-    branch_selection_epsilon=0.0,
-    branch_selection_persistence=0.0,
     label_sides_as_branches=False,
-    propagate_labels=False,
+    min_cluster_size=None,
+    max_cluster_size=None,
+    allow_single_cluster=None,
+    cluster_selection_method=None,
+    cluster_selection_epsilon=0.0,
+    cluster_selection_persistence=0.0,
 ):
     result = find_sub_clusters(
         clusterer,
         cluster_labels,
         cluster_probabilities,
         lens_callback=compute_centrality,
-        min_cluster_size=min_branch_size,
-        max_cluster_size=max_branch_size,
-        allow_single_cluster=allow_single_branch,
-        cluster_selection_method=branch_selection_method,
-        cluster_selection_epsilon=branch_selection_epsilon,
-        cluster_selection_persistence=branch_selection_persistence,
-        propagate_labels=propagate_labels,
+        min_cluster_size=min_cluster_size,
+        max_cluster_size=max_cluster_size,
+        allow_single_cluster=allow_single_cluster,
+        cluster_selection_method=cluster_selection_method,
+        cluster_selection_epsilon=cluster_selection_epsilon,
+        cluster_selection_persistence=cluster_selection_persistence,
     )
     apply_branch_threshold(
         result[0],
@@ -95,29 +88,28 @@ class BranchDetector(SubClusterDetector):
 
     def __init__(
         self,
-        *,
-        min_branch_size=None,
-        max_branch_size=None,
-        allow_single_branch=None,
-        branch_selection_method=None,
-        branch_selection_epsilon=0.0,
-        branch_selection_persistence=0.0,
-        label_sides_as_branches=False,
+        min_cluster_size=None,  # NOTE(review): the keyword-only `*` is gone and the *_branch_* names became *_cluster_*; callers using the old keywords will break - confirm this is intended
+        max_cluster_size=None,
+        allow_single_cluster=None,
+        cluster_selection_method=None,
+        cluster_selection_epsilon=0.0,
+        cluster_selection_persistence=0.0,
         propagate_labels=False,
+        label_sides_as_branches=False,  # now listed after propagate_labels; stored on the instance below, not forwarded to super().__init__()
     ):
         super().__init__(
-            min_cluster_size=min_branch_size,
-            max_cluster_size=max_branch_size,
-            allow_single_cluster=allow_single_branch,
-            cluster_selection_method=branch_selection_method,
-            cluster_selection_epsilon=branch_selection_epsilon,
-            cluster_selection_persistence=branch_selection_persistence,
+            min_cluster_size=min_cluster_size,  # every selection option is forwarded under the same name
+            max_cluster_size=max_cluster_size,
+            allow_single_cluster=allow_single_cluster,
+            cluster_selection_method=cluster_selection_method,
+            cluster_selection_epsilon=cluster_selection_epsilon,
+            cluster_selection_persistence=cluster_selection_persistence,
             propagate_labels=propagate_labels,
         )
         self.label_sides_as_branches = label_sides_as_branches
 
-    def fit(self, clusterer, labels=None, probabilities=None):
-        super().fit(clusterer, labels, probabilities, compute_centrality)
+    def fit(self, clusterer, labels=None, probabilities=None, sample_weight=None):
+        super().fit(clusterer, labels, probabilities, sample_weight, compute_centrality)
         apply_branch_threshold(
             self.labels_,
             self.sub_cluster_labels_,
@@ -132,6 +124,22 @@ def fit(self, clusterer, labels=None, probabilities=None):
         self.centralities_ = self.lens_values_
         return self
 
+    def propagated_labels(self, label_sides_as_branches=None):  # returns the (labels, branch_labels) pair from the parent class after the branch threshold is applied to it
+        if label_sides_as_branches is None:  # None means: use the value stored on the instance
+            label_sides_as_branches = self.label_sides_as_branches
+
+        labels, branch_labels = super().propagated_labels()
+        apply_branch_threshold(  # return value is unused; NOTE(review): presumably rewrites `labels` in place - confirm against apply_branch_threshold
+            labels,
+            branch_labels,
+            np.zeros_like(self.probabilities_),  # zero-filled placeholder; this method returns no probabilities
+            np.zeros_like(self.probabilities_),  # second zero-filled placeholder of the same shape
+            self.cluster_points_,
+            self.linkage_trees_,
+            label_sides_as_branches=label_sides_as_branches,
+        )
+        return labels, branch_labels
+
     @property
     def approximation_graph_(self):
         """See :class:`~hdbscan.plots.ApproximationGraph` for documentation."""
 
@@ -257,7 +257,21 @@ def condense_tree(hierarchy, min_cluster_size=10, max_cluster_size=np.inf, sampl
 
 
 @numba.njit()
-def extract_leaves(cluster_tree, n_points):
+def extract_leaves(condensed_tree, allow_single_cluster=True):  # NOTE(review): allow_single_cluster is never read in this body - confirm whether that is intended
+    n_nodes = condensed_tree.parent.max() + 1  # one indicator slot for every id up to the largest parent id
+    n_points = condensed_tree.parent.min()  # smallest parent id; presumably the root cluster id equals the number of points - TODO confirm
+    leaf_indicator = np.ones(n_nodes, dtype=np.bool_)
+    leaf_indicator[:n_points] = False  # ids below n_points are never reported as leaves
+
+    for parent, child_size in zip(condensed_tree.parent, condensed_tree.child_size):
+        if child_size > 1:  # a parent with any child of size > 1 is not a leaf
+            leaf_indicator[parent] = False
+
+    return np.nonzero(leaf_indicator)[0]  # ids whose indicator is still True
+
+
+@numba.njit()
+def cluster_tree_leaves(cluster_tree, n_points):
     n_nodes = cluster_tree.child.max() + 1
     leaf_indicator = np.ones(n_nodes - n_points, dtype=np.bool_)
     leaf_indicator[cluster_tree.parent - n_points] = False
@@ -538,7 +552,7 @@ def simplify_hierarchy(condensed_tree, n_points, persistence_threshold):
     processed = {np.int64(0)}
     processed.clear()
     while cluster_tree.parent.shape[0] > 0:
-        leaves = set(extract_leaves(cluster_tree, n_points))
+        leaves = set(cluster_tree_leaves(cluster_tree, n_points))
         births = max_lambdas(condensed_tree, leaves)
         deaths = min_lambdas(cluster_tree, leaves)
 
 
@@ -70,15 +70,22 @@ def flatten_to_csr(graph):
 
 @numba.njit(parallel=True)
 def sort_by_lens(graph):
-    for point in numba.prange(len(graph)):
+    new_weights = np.empty_like(graph.weights)  # results go into fresh arrays; this version never writes to the input graph's arrays
+    new_distances = np.empty_like(graph.distances)
+    new_indices = np.empty_like(graph.indices)
+    for point in numba.prange(len(graph.indptr) - 1):  # one iteration per [indptr[point], indptr[point + 1]) interval
         start = graph.indptr[point]
         end = graph.indptr[point + 1]
-        weights = graph.weights[start:end]
-        order = np.argsort(weights)
-        graph.weights[start:end] = weights[order]
-        graph.distances[start:end] = graph.distances[start:end][order]
-        graph.indices[start:end] = graph.indices[start:end][order]
-    return graph
+        
+        row_weights = graph.weights[start:end]
+        row_distances = graph.distances[start:end]
+        row_indices = graph.indices[start:end]
+
+        order = np.argsort(row_weights)  # permutation that sorts this interval by weight
+        new_weights[start:end] = row_weights[order]  # the same permutation is applied to all three arrays
+        new_distances[start:end] = row_distances[order]
+        new_indices[start:end] = row_indices[order]
+    return CoreGraph(new_weights, new_distances, new_indices, graph.indptr)  # indptr is passed through unchanged
 
 
 @numba.njit(parallel=True)
@@ -173,7 +180,7 @@ def minimum_spanning_tree(graph, overwrite=False):
     return n_components, point_components, result
 
 
-# @numba.njit()
+@numba.njit()
 def core_graph_spanning_tree(neighbors, core_distances, min_spanning_tree, lens):
     graph = sort_by_lens(
         flatten_to_csr(
 
@@ -15,7 +15,7 @@
     condense_tree,
     simplify_hierarchy,
     extract_eom_clusters,
-    extract_leaves,
+    cluster_tree_leaves,
     cluster_epsilon_search,
     get_cluster_labelling_at_cut,
     get_cluster_label_vector,
@@ -266,7 +266,7 @@ def clusters_from_spanning_tree(
         if cluster_tree.parent.shape[0] == 0:
             selected_clusters = np.empty(0, dtype=np.int64)
         else:
-            selected_clusters = extract_leaves(cluster_tree, n_points)
+            selected_clusters = cluster_tree_leaves(cluster_tree, n_points)
     else:
         raise ValueError(f"Invalid cluster_selection_method {cluster_selection_method}")
 
@@ -319,8 +319,6 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
 
         if self.semi_supervised:
             X, y = check_X_y(X, y, accept_sparse="csr", force_all_finite=False)
-            if sample_weight is not None:
-                sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
             self._raw_labels = y
             # Replace non-finite labels with -1 labels
             y[~np.isfinite(y)] = -1
@@ -331,20 +329,18 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
                 )
         else:
             X = check_array(X, accept_sparse="csr", force_all_finite=False)
-            if sample_weight is not None:
-                sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
             self._raw_data = X
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
 
         self._all_finite = np.all(np.isfinite(X))
         if ~self._all_finite:
             # Pass only the purely finite indices into hdbscan
             # We will later assign all non-finite points to the background -1 cluster
             finite_index = np.where(np.isfinite(X).sum(axis=1) == X.shape[1])[0]
             clean_data = X[finite_index]
-            clean_data_labels = y
-
-            if self.semi_supervised:
-                clean_data_labels = y[finite_index]
+            clean_data_labels = y[finite_index] if self.semi_supervised else None
+            sample_weight = sample_weight[finite_index] if sample_weight is not None else None
 
             internal_to_raw = {
                 x: y for x, y in zip(range(len(finite_index)), finite_index)
@@ -392,10 +388,6 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
 
         return self
 
-    def fit_predict(self, X, y=None, sample_weight=None, **fit_params):
-        self.fit(X, y, sample_weight, **fit_params)
-        return self.labels_
-
     def dbscan_clustering(self, epsilon):
         check_is_fitted(
             self,