Skip to content

Commit e5d08fa

Browse files
committed
Fix child sizes for weighted samples; fix segfault
1 parent 5540a77 commit e5d08fa

File tree

2 files changed

+88
-45
lines changed

2 files changed

+88
-45
lines changed

fast_hdbscan/cluster_trees.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
174174
parents = np.ones(root, dtype=np.int64)
175175
children = np.empty(root, dtype=np.int64)
176176
lambdas = np.empty(root, dtype=np.float32)
177-
sizes = np.ones(root, dtype=np.int64)
177+
sizes = np.ones(root, dtype=np.float32)
178178

179179
ignore = np.zeros(root + 1, dtype=np.bool_) # 'bool' is no longer an attribute of 'numpy'
180180

@@ -196,8 +196,8 @@ def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
196196
else:
197197
lambda_value = np.inf
198198

199-
left_count = np.int64(hierarchy[left - num_points, 3]) if left >= num_points else sample_weights[left]
200-
right_count = np.int64(hierarchy[right - num_points, 3]) if right >= num_points else sample_weights[left]
199+
left_count = np.float32(hierarchy[left - num_points, 3]) if left >= num_points else sample_weights[left]
200+
right_count = np.float32(hierarchy[right - num_points, 3]) if right >= num_points else sample_weights[left]
201201

202202
# The logic here is in a strange order, but it has non-trivial performance gains ...
203203
# The most common case by far is a singleton on the left; and cluster on the right take care of this separately
@@ -434,7 +434,7 @@ def extract_clusters_bcubed(condensed_tree, cluster_tree, label_indices, allow_v
434434

435435
@numba.njit()
436436
def score_condensed_tree_nodes(condensed_tree):
437-
result = {0: 0.0 for i in range(0)}
437+
result = {0: np.float32(0.0) for i in range(0)}
438438

439439
for i in range(condensed_tree.parent.shape[0]):
440440
parent = condensed_tree.parent[i]
@@ -602,13 +602,16 @@ def get_cluster_labelling_at_cut(linkage_tree, cut, min_cluster_size):
602602
def get_cluster_label_vector(
603603
tree,
604604
clusters,
605-
cluster_selection_epsilon
605+
cluster_selection_epsilon,
606+
n_samples,
606607
):
608+
if len(tree.parent) == 0:
609+
return np.full(n_samples, -1, dtype=np.intp)
607610
root_cluster = tree.parent.min()
608-
result = np.empty(root_cluster, dtype=np.intp)
611+
result = np.full(n_samples, -1, dtype=np.intp)
609612
cluster_label_map = {c: n for n, c in enumerate(np.sort(clusters))}
610613

611-
disjoint_set = ds_rank_create(tree.parent.max() + 1)
614+
disjoint_set = ds_rank_create(max(tree.parent.max() + 1, tree.child.max() + 1))
612615
clusters = set(clusters)
613616

614617
for n in range(tree.parent.shape[0]):

fast_hdbscan/hdbscan.py

Lines changed: 78 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
get_cluster_label_vector,
2121
get_point_membership_strength_vector,
2222
cluster_tree_from_condensed_tree,
23-
extract_clusters_bcubed
23+
extract_clusters_bcubed,
2424
)
2525

2626
try:
@@ -41,7 +41,7 @@ def to_numpy_rec_array(named_tuple_tree):
4141
("parent", np.intp),
4242
("child", np.intp),
4343
("lambda_val", float),
44-
("child_size", np.intp),
44+
("child_size", np.float32),
4545
],
4646
)
4747

@@ -149,14 +149,16 @@ def fast_hdbscan(
149149
data = check_array(data)
150150

151151
if semi_supervised and data_labels is None:
152-
raise ValueError("data_labels must not be None when semi_supervised is set to True!")
152+
raise ValueError(
153+
"data_labels must not be None when semi_supervised is set to True!"
154+
)
153155

154156
if semi_supervised:
155157
label_indices = np.flatnonzero(data_labels > -1)
156158
label_values = data_labels[label_indices]
157159
data_labels_dict = Dict()
158160
for index, label in zip(label_indices, label_values):
159-
data_labels_dict[index] = label
161+
data_labels_dict[index] = label
160162

161163
if (
162164
(not (np.issubdtype(type(min_samples), np.integer) or min_samples is None))
@@ -165,17 +167,21 @@ def fast_hdbscan(
165167
or min_cluster_size <= 0
166168
):
167169
raise ValueError("Min samples and min cluster size must be positive integers!")
168-
170+
169171
if (
170172
not np.issubdtype(type(cluster_selection_epsilon), np.floating)
171173
or cluster_selection_epsilon < 0.0
172174
):
173-
raise ValueError('Cluster selection epsilon must be a positive floating point number!')
175+
raise ValueError(
176+
"Cluster selection epsilon must be a positive floating point number!"
177+
)
174178

175179
sklearn_tree = KDTree(data)
176180
numba_tree = kdtree_to_numba(sklearn_tree)
177181
edges = parallel_boruvka(
178-
numba_tree, min_samples=min_cluster_size if min_samples is None else min_samples, sample_weights=sample_weights
182+
numba_tree,
183+
min_samples=min_cluster_size if min_samples is None else min_samples,
184+
sample_weights=sample_weights,
179185
)
180186
sorted_mst = edges[np.argsort(edges.T[2])]
181187
if sample_weights is None:
@@ -187,39 +193,49 @@ def fast_hdbscan(
187193
cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
188194

189195
if cluster_selection_method == "eom":
190-
if semi_supervised:
191-
if(ss_algorithm=="bc"):
192-
selected_clusters = extract_clusters_bcubed(condensed_tree,
193-
cluster_tree,
194-
data_labels_dict,
195-
allow_virtual_nodes=True,
196-
allow_single_cluster=allow_single_cluster)
197-
elif(ss_algorithm=="bc_without_vn"):
198-
selected_clusters = extract_clusters_bcubed(condensed_tree,
199-
cluster_tree,
200-
data_labels_dict,
201-
allow_virtual_nodes=False,
202-
allow_single_cluster=allow_single_cluster)
203-
else:
204-
raise ValueError(f"Invalid ss_algorithm {ss_algorithm}")
205-
else:
206-
selected_clusters = extract_eom_clusters(condensed_tree,
207-
cluster_tree,
208-
allow_single_cluster=allow_single_cluster)
196+
if semi_supervised:
197+
if ss_algorithm == "bc":
198+
selected_clusters = extract_clusters_bcubed(
199+
condensed_tree,
200+
cluster_tree,
201+
data_labels_dict,
202+
allow_virtual_nodes=True,
203+
allow_single_cluster=allow_single_cluster,
204+
)
205+
elif ss_algorithm == "bc_without_vn":
206+
selected_clusters = extract_clusters_bcubed(
207+
condensed_tree,
208+
cluster_tree,
209+
data_labels_dict,
210+
allow_virtual_nodes=False,
211+
allow_single_cluster=allow_single_cluster,
212+
)
213+
else:
214+
raise ValueError(f"Invalid ss_algorithm {ss_algorithm}")
215+
else:
216+
selected_clusters = extract_eom_clusters(
217+
condensed_tree, cluster_tree, allow_single_cluster=allow_single_cluster
218+
)
209219
elif cluster_selection_method == "leaf":
210220
selected_clusters = extract_leaves(
211221
condensed_tree, allow_single_cluster=allow_single_cluster
212222
)
213223
else:
214224
raise ValueError(f"Invalid cluster_selection_method {cluster_selection_method}")
215-
225+
216226
if len(selected_clusters) > 1 and cluster_selection_epsilon > 0.0:
217227
selected_clusters = cluster_epsilon_search(
218-
selected_clusters, cluster_tree,
228+
selected_clusters,
229+
cluster_tree,
219230
min_persistence=cluster_selection_epsilon,
220231
)
221232

222-
clusters = get_cluster_label_vector(condensed_tree, selected_clusters, cluster_selection_epsilon)
233+
clusters = get_cluster_label_vector(
234+
condensed_tree,
235+
selected_clusters,
236+
cluster_selection_epsilon,
237+
n_samples=data.shape[0],
238+
)
223239
membership_strengths = get_point_membership_strength_vector(
224240
condensed_tree, selected_clusters, clusters
225241
)
@@ -252,16 +268,18 @@ def __init__(
252268

253269
def fit(self, X, y=None, sample_weight=None, **fit_params):
254270

255-
if (self.semi_supervised):
271+
if self.semi_supervised:
256272
X, y = check_X_y(X, y, accept_sparse="csr", force_all_finite=False)
257273
if sample_weight is not None:
258274
sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
259275
self._raw_labels = y
260276
# Replace non-finite labels with -1 labels
261277
y[~np.isfinite(y)] = -1
262278

263-
if ~np.any(y !=-1):
264-
raise ValueError("y must contain at least one label > -1. Currently it only contains -1 and/or non-finite labels!")
279+
if ~np.any(y != -1):
280+
raise ValueError(
281+
"y must contain at least one label > -1. Currently it only contains -1 and/or non-finite labels!"
282+
)
265283
else:
266284
X = check_array(X, accept_sparse="csr", force_all_finite=False)
267285
if sample_weight is not None:
@@ -275,7 +293,7 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
275293
finite_index = np.where(np.isfinite(X).sum(axis=1) == X.shape[1])[0]
276294
clean_data = X[finite_index]
277295
clean_data_labels = y
278-
296+
279297
if self.semi_supervised:
280298
clean_data_labels = y[finite_index]
281299

@@ -295,7 +313,13 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
295313
self._single_linkage_tree,
296314
self._condensed_tree,
297315
self._min_spanning_tree,
298-
) = fast_hdbscan(clean_data, clean_data_labels, return_trees=True, sample_weights=sample_weight, **kwargs)
316+
) = fast_hdbscan(
317+
clean_data,
318+
clean_data_labels,
319+
return_trees=True,
320+
sample_weights=sample_weight,
321+
**kwargs,
322+
)
299323

300324
self._condensed_tree = to_numpy_rec_array(self._condensed_tree)
301325

@@ -318,7 +342,11 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
318342
return self
319343

320344
def dbscan_clustering(self, epsilon):
321-
check_is_fitted(self, "_single_linkage_tree", msg="You first need to fit the HDBSCAN model before picking a DBSCAN clustering")
345+
check_is_fitted(
346+
self,
347+
"_single_linkage_tree",
348+
msg="You first need to fit the HDBSCAN model before picking a DBSCAN clustering",
349+
)
322350
return get_cluster_labelling_at_cut(
323351
self._single_linkage_tree,
324352
epsilon,
@@ -327,7 +355,11 @@ def dbscan_clustering(self, epsilon):
327355

328356
@property
329357
def condensed_tree_(self):
330-
check_is_fitted(self, "_condensed_tree", msg="You first need to fit the HDBSCAN model before accessing the condensed tree")
358+
check_is_fitted(
359+
self,
360+
"_condensed_tree",
361+
msg="You first need to fit the HDBSCAN model before accessing the condensed tree",
362+
)
331363
if self._condensed_tree is not None:
332364
return CondensedTree(
333365
self._condensed_tree,
@@ -341,7 +373,11 @@ def condensed_tree_(self):
341373

342374
@property
343375
def single_linkage_tree_(self):
344-
check_is_fitted(self, "_single_linkage_tree", msg="You first need to fit the HDBSCAN model before accessing the single linkage tree")
376+
check_is_fitted(
377+
self,
378+
"_single_linkage_tree",
379+
msg="You first need to fit the HDBSCAN model before accessing the single linkage tree",
380+
)
345381
if self._single_linkage_tree is not None:
346382
return SingleLinkageTree(self._single_linkage_tree)
347383
else:
@@ -351,7 +387,11 @@ def single_linkage_tree_(self):
351387

352388
@property
353389
def minimum_spanning_tree_(self):
354-
check_is_fitted(self, "_min_spanning_tree", msg="You first need to fit the HDBSCAN model before accessing the minimum spanning tree")
390+
check_is_fitted(
391+
self,
392+
"_min_spanning_tree",
393+
msg="You first need to fit the HDBSCAN model before accessing the minimum spanning tree",
394+
)
355395
if self._min_spanning_tree is not None:
356396
if self._raw_data is not None:
357397
return MinimumSpanningTree(self._min_spanning_tree, self._raw_data)

0 commit comments

Comments
 (0)