Allow sample weights in HDBSCAN

lmcinnes · lmcinnes · commit 147b48909ceb · 2024-05-24T13:51:24.000-04:00
diff --git a/fast_hdbscan/boruvka.py b/fast_hdbscan/boruvka.py
@@ -247,22 +247,44 @@ def initialize_boruvka_from_knn(knn_indices, knn_distances, core_distances, disj
     return result[:result_idx]
 
 
-def parallel_boruvka(tree, min_samples=10):
+@numba.njit(parallel=True)
+def sample_weight_core_distance(distances, neighbors, sample_weights, min_samples):
+    """Return, per row, the distance at the column where cumulative neighbor weight first reaches min_samples."""
+    result = np.zeros(distances.shape[0], dtype=np.float32)
+    for row in numba.prange(distances.shape[0]):
+        accumulated = 0.0
+        col = 0
+        # Sum weights column by column; stops at the last column if min_samples is never reached.
+        while col < neighbors.shape[1] and accumulated < min_samples:
+            accumulated += sample_weights[neighbors[row, col]]
+            col += 1
+        result[row] = distances[row, col - 1]
+    return result
+
+def parallel_boruvka(tree, min_samples=10, sample_weights=None):
     components_disjoint_set = ds_rank_create(tree.data.shape[0])
     point_components = np.arange(tree.data.shape[0])
     node_components = np.full(tree.node_data.shape[0], -1)
     n_components = point_components.shape[0]
 
-    if min_samples > 1:
-        distances, neighbors = parallel_tree_query(tree, tree.data, k=min_samples + 1, output_rdist=True)
-        core_distances = distances.T[-1]
+    if sample_weights is not None:
+        mean_sample_weight = np.mean(sample_weights)
+        expected_neighbors = min_samples / mean_sample_weight
+        distances, neighbors = parallel_tree_query(tree, tree.data, k=min(max(int(2 * expected_neighbors), 2), tree.data.shape[0]))  # clamp k to [2, n_points]
+        core_distances = sample_weight_core_distance(distances, neighbors, sample_weights, min_samples)
         edges = initialize_boruvka_from_knn(neighbors, distances, core_distances, components_disjoint_set)
         update_component_vectors(tree, components_disjoint_set, node_components, point_components)
     else:
-        core_distances = np.zeros(tree.data.shape[0], dtype=np.float32)
-        distances, neighbors = parallel_tree_query(tree, tree.data, k=2)
-        edges = initialize_boruvka_from_knn(neighbors, distances, core_distances, components_disjoint_set)
-        update_component_vectors(tree, components_disjoint_set, node_components, point_components)
+        if min_samples > 1:
+            distances, neighbors = parallel_tree_query(tree, tree.data, k=min_samples + 1, output_rdist=True)
+            core_distances = distances.T[-1]
+            edges = initialize_boruvka_from_knn(neighbors, distances, core_distances, components_disjoint_set)
+            update_component_vectors(tree, components_disjoint_set, node_components, point_components)
+        else:
+            core_distances = np.zeros(tree.data.shape[0], dtype=np.float32)
+            distances, neighbors = parallel_tree_query(tree, tree.data, k=2)
+            edges = initialize_boruvka_from_knn(neighbors, distances, core_distances, components_disjoint_set)
+            update_component_vectors(tree, components_disjoint_set, node_components, point_components)
 
     while n_components > 1:
         candidate_distances, candidate_indices = boruvka_tree_query(tree, node_components, point_components,
diff --git a/fast_hdbscan/cluster_trees.py b/fast_hdbscan/cluster_trees.py
@@ -17,6 +17,16 @@ def create_linkage_merge_data(base_size):
     return LinkageMergeData(parent, size, next_parent)
 
 
+@numba.njit()
+def create_linkage_merge_data_w_sample_weights(sample_weights):
+    """Create LinkageMergeData sized for n = len(sample_weights): the first n sizes are the weights, the rest zero."""
+    n_points = sample_weights.shape[0]
+    trailing_sizes = np.zeros(n_points - 1, dtype=np.float32)
+    size = np.concatenate((sample_weights, trailing_sizes))
+    parent = np.full(2 * n_points - 1, -1, dtype=np.intp)
+    return LinkageMergeData(parent, size, np.array([n_points], dtype=np.intp))
+
+
 @numba.njit()
 def linkage_merge_find(linkage_merge, node):
     relabel = node
@@ -43,11 +53,14 @@ def linkage_merge_join(linkage_merge, left, right):
 
 
 @numba.njit()
-def mst_to_linkage_tree(sorted_mst):
+def mst_to_linkage_tree(sorted_mst, sample_weights=None):
     result = np.empty((sorted_mst.shape[0], sorted_mst.shape[1] + 1))
 
     n_samples = sorted_mst.shape[0] + 1
-    linkage_merge = create_linkage_merge_data(n_samples)
+    if sample_weights is None:
+        linkage_merge = create_linkage_merge_data(n_samples)
+    else:
+        linkage_merge = create_linkage_merge_data_w_sample_weights(sample_weights)
 
     for index in range(sorted_mst.shape[0]):
 
@@ -116,7 +129,7 @@ def eliminate_branch(branch_node, parent_node, lambda_value, parents, children,
 
 
 @numba.njit(fastmath=True)
-def condense_tree(hierarchy, min_cluster_size=10):
+def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
     root = 2 * hierarchy.shape[0]
     num_points = hierarchy.shape[0] + 1
     next_label = num_points + 1
@@ -133,6 +146,9 @@ def condense_tree(hierarchy, min_cluster_size=10):
 
     ignore = np.zeros(root + 1, dtype=np.bool8)
 
+    if sample_weights is None:
+        sample_weights = np.ones(num_points, dtype=np.float32)
+
     idx = 0
 
     for node in node_list:
@@ -148,8 +164,8 @@ def condense_tree(hierarchy, min_cluster_size=10):
         else:
             lambda_value = np.inf
 
-        left_count = np.int64(hierarchy[left - num_points, 3]) if left >= num_points else 1
-        right_count = np.int64(hierarchy[right - num_points, 3]) if right >= num_points else 1
+        left_count = np.int64(hierarchy[left - num_points, 3]) if left >= num_points else sample_weights[left]
+        right_count = np.int64(hierarchy[right - num_points, 3]) if right >= num_points else sample_weights[right]
 
         # The logic here is in a strange order, but it has non-trivial performance gains ...
         # The most common case by far is a singleton on the left; and cluster on the right take care of this separately
diff --git a/fast_hdbscan/hdbscan.py b/fast_hdbscan/hdbscan.py
@@ -2,6 +2,7 @@
 
 from sklearn.base import BaseEstimator, ClusterMixin
 from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted, _check_sample_weight
 from sklearn.neighbors import KDTree
 
 from warnings import warn
@@ -135,6 +136,7 @@ def fast_hdbscan(
     cluster_selection_method="eom",
     allow_single_cluster=False,
     cluster_selection_epsilon=0.0,
+    sample_weights=None,
     return_trees=False,
 ):
     data = check_array(data)
@@ -156,10 +158,10 @@ def fast_hdbscan(
     sklearn_tree = KDTree(data)
     numba_tree = kdtree_to_numba(sklearn_tree)
     edges = parallel_boruvka(
-        numba_tree, min_samples=min_cluster_size if min_samples is None else min_samples
+        numba_tree, min_samples=min_cluster_size if min_samples is None else min_samples, sample_weights=sample_weights
     )
     sorted_mst = edges[np.argsort(edges.T[2])]
-    linkage_tree = mst_to_linkage_tree(sorted_mst)
+    linkage_tree = mst_to_linkage_tree(sorted_mst, sample_weights=sample_weights)
     condensed_tree = condense_tree(linkage_tree, min_cluster_size=min_cluster_size)
     if cluster_selection_epsilon > 0.0 or cluster_selection_method == "eom":
         cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
@@ -208,8 +210,10 @@ def __init__(
         self.allow_single_cluster = allow_single_cluster
         self.cluster_selection_epsilon = cluster_selection_epsilon
 
-    def fit(self, X, y=None, **fit_params):
+    def fit(self, X, y=None, sample_weight=None, **fit_params):
         X = check_array(X, accept_sparse="csr", force_all_finite=False)
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
         self._raw_data = X
 
         self._all_finite = np.all(np.isfinite(X))
@@ -233,7 +237,7 @@ def fit(self, X, y=None, **fit_params):
             self._single_linkage_tree,
             self._condensed_tree,
             self._min_spanning_tree,
-        ) = fast_hdbscan(clean_data, return_trees=True, **kwargs)
+        ) = fast_hdbscan(clean_data, return_trees=True, sample_weights=sample_weight, **kwargs)
 
         self._condensed_tree = to_numpy_rec_array(self._condensed_tree)
 
@@ -256,6 +260,7 @@ def fit(self, X, y=None, **fit_params):
         return self
 
     def dbscan_clustering(self, epsilon):
+        check_is_fitted(self, "_single_linkage_tree", msg="You first need to fit the HDBSCAN model before picking a DBSCAN clustering")
         return get_cluster_labelling_at_cut(
             self._single_linkage_tree,
             epsilon,
@@ -264,6 +269,7 @@ def dbscan_clustering(self, epsilon):
 
     @property
     def condensed_tree_(self):
+        check_is_fitted(self, "_condensed_tree", msg="You first need to fit the HDBSCAN model before accessing the condensed tree")
         if self._condensed_tree is not None:
             return CondensedTree(
                 self._condensed_tree,
@@ -277,6 +283,7 @@ def condensed_tree_(self):
 
     @property
     def single_linkage_tree_(self):
+        check_is_fitted(self, "_single_linkage_tree", msg="You first need to fit the HDBSCAN model before accessing the single linkage tree")
         if self._single_linkage_tree is not None:
             return SingleLinkageTree(self._single_linkage_tree)
         else:
@@ -286,6 +293,7 @@ def single_linkage_tree_(self):
 
     @property
     def minimum_spanning_tree_(self):
+        check_is_fitted(self, "_min_spanning_tree", msg="You first need to fit the HDBSCAN model before accessing the minimum spanning tree")
         if self._min_spanning_tree is not None:
             if self._raw_data is not None:
                 return MinimumSpanningTree(self._min_spanning_tree, self._raw_data)