
Commit c58f92d

Merge branch 'main' into sample_weights
2 parents 5d48141 + 163e167

File tree

5 files changed: +260 -14 lines changed

README.rst

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 Fast Multicore HDBSCAN
 ======================
 
-Ahe ``fast_hdbscan`` library provides a simple implementation of the HDBSCAN clustering algorithm designed specifically
+The ``fast_hdbscan`` library provides a simple implementation of the HDBSCAN clustering algorithm designed specifically
 for high performance on multicore machines with low dimensional data (2D to about 20D). The algorithm runs in parallel and can make
 effective use of as many cores as you wish to throw at a problem. It is thus ideal for large SMP systems, and even
 modern multicore laptops.
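For orientation, a minimal usage sketch of the library (not part of this commit; it assumes the package's public HDBSCAN estimator with a scikit-learn-style fit_predict, as suggested by the hdbscan.py changes below):

    import numpy as np
    import fast_hdbscan

    # Low-dimensional data, where this implementation is designed to shine.
    data = np.random.random((10000, 4))

    clusterer = fast_hdbscan.HDBSCAN(min_cluster_size=10)
    labels = clusterer.fit_predict(data)  # -1 denotes noise points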

azure-pipelines.yml

Lines changed: 15 additions & 0 deletions
@@ -17,6 +17,12 @@ pr:
     - doc/*
     - README.rst
 
+parameters:
+- name: includeReleaseCandidates
+  displayName: "Allow pre-release dependencies"
+  type: boolean
+  default: false
+
 variables:
   triggeredByPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')]

@@ -66,8 +72,17 @@ stages:
 
   - script: |
       python -m pip install --upgrade pip
+    displayName: 'Upgrade pip'
+
+  - script: |
       pip install -r requirements.txt
     displayName: 'Install dependencies'
+    condition: ${{ eq(parameters.includeReleaseCandidates, false) }}
+
+  - script: |
+      pip install --pre -r requirements.txt
+    displayName: 'Install dependencies (allow pre-releases)'
+    condition: ${{ eq(parameters.includeReleaseCandidates, true) }}
 
   - script: |
       pip install -e .

fast_hdbscan/cluster_trees.py

Lines changed: 178 additions & 1 deletion
@@ -5,6 +5,11 @@
 
 from .disjoint_set import ds_rank_create, ds_find, ds_union_by_rank
 
+from numba.typed import Dict, List
+from numba.types import int64, ListType
+
+int64_list_type = ListType(int64)
+
 LinkageMergeData = namedtuple("LinkageMergeData", ["parent", "size", "next"])
 

@@ -171,7 +176,7 @@ def condense_tree(hierarchy, min_cluster_size=10, sample_weights=None):
     lambdas = np.empty(root, dtype=np.float32)
     sizes = np.ones(root, dtype=np.int64)
 
-    ignore = np.zeros(root + 1, dtype=np.bool8)
+    ignore = np.zeros(root + 1, dtype=np.bool_)  # np.bool8 is no longer an attribute of numpy
 
     if sample_weights is None:
         sample_weights = np.ones(num_points, dtype=np.float32)

@@ -255,6 +260,178 @@ def extract_leaves(condensed_tree, allow_single_cluster=True):
     return np.nonzero(leaf_indicator)[0]
 
 
+
+# The *_bcubed functions below implement the (semi-supervised) HDBSCAN*(BC) algorithm presented
+# in Castro Gertrudes, J., Zimek, A., Sander, J. et al. A unified view of density-based methods
+# for semi-supervised clustering and classification. Data Min Knowl Disc 33, 1894–1952 (2019).
+
+@numba.njit()
+def cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices):
+    # This function returns a cluster_tree with virtual nodes (if applicable).
+
+    label_indices_list = list(label_indices.keys())
+    cluster_tree_parents = list(cluster_tree.parent)
+
+    # A labeled node that has no children and whose parent is not a leaf cluster must be
+    # a noisy node (virtual node).
+    mask1 = condensed_tree.child_size > 1
+    mask2 = condensed_tree.child_size == 1
+    mask3 = np.array([child in label_indices_list for child in condensed_tree.child])
+    mask4 = np.array([parent in cluster_tree_parents for parent in condensed_tree.parent])  # check that it's not a leaf cluster
+
+    mask = (mask1 | (mask2 & mask3 & mask4))
+
+    return CondensedTree(condensed_tree.parent[mask], condensed_tree.child[mask], condensed_tree.lambda_val[mask],
+                         condensed_tree.child_size[mask])
+
+
+@numba.njit()
+def get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree=None, cluster_tree_bcubed=None, allow_virtual_nodes=False):
+
+    cluster_elements = Dict.empty(
+        key_type=int64,
+        value_type=int64_list_type,
+    )
+
+    virtual_nodes = [0 for x in range(0)]
+
+    parents_set = set(list(condensed_tree.parent))
+    for i in range(len(condensed_tree.child) - 1, -1, -1):  # Traverse tree bottom up
+        parent = condensed_tree.parent[i]
+        child = condensed_tree.child[i]
+        if child in parents_set:
+            if parent in cluster_elements:
+                cluster_elements[parent].extend(cluster_elements[child])
+            else:
+                cluster_elements[parent] = List(cluster_elements[child])
+        elif parent in cluster_elements:
+            cluster_elements[parent].append(child)
+        else:
+            cluster_elements[parent] = List.empty_list(int64)
+            cluster_elements[parent].append(child)
+
+    if allow_virtual_nodes and (cluster_tree is not None) and (cluster_tree_bcubed is not None):
+        for i in list(set(cluster_tree_bcubed.child).difference(set(cluster_tree.child))):
+            virtual_nodes.append(i)
+        for node in virtual_nodes:
+            cluster_elements[node] = List.empty_list(int64)
+            cluster_elements[node].append(node)
+
+    return cluster_elements, np.array(virtual_nodes)
+
+
+@numba.njit()
+def eom_recursion_bcubed(node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters):
+    current_score_stability_bcubed = np.array([stability_node_scores[node], bcubed_node_scores[node]], dtype=np.float32)
+
+    children = cluster_tree.child[cluster_tree.parent == node]
+    child_score_total_stability_bcubed = np.array([0.0, 0.0], dtype=np.float32)
+
+    for child_node in children:
+        child_score_total_stability_bcubed += eom_recursion_bcubed(child_node, cluster_tree, stability_node_scores, bcubed_node_scores, selected_clusters)
+
+    if child_score_total_stability_bcubed[1] > current_score_stability_bcubed[1]:
+        return child_score_total_stability_bcubed
+    elif child_score_total_stability_bcubed[1] < current_score_stability_bcubed[1]:
+        selected_clusters[node] = True
+        unselect_below_node(node, cluster_tree, selected_clusters)
+        return current_score_stability_bcubed
+    # BCubed scores are tied; stability scores are used to resolve the tie.
+    else:
+        if child_score_total_stability_bcubed[0] > current_score_stability_bcubed[0]:
+            return child_score_total_stability_bcubed
+        else:
+            selected_clusters[node] = True
+            unselect_below_node(node, cluster_tree, selected_clusters)
+            return current_score_stability_bcubed
+
+
+@numba.njit()
+def score_condensed_tree_nodes_bcubed(cluster_elements, label_indices):
+
+    label_values = label_indices.values()
+    label_counts = {0: 0 for i in range(0)}
+
+    for label in label_values:
+        if label in label_counts:
+            label_counts[label] += 1
+        else:
+            label_counts[label] = 1
+
+    label_counts_values = list(label_counts.values())
+    total_num_of_labeled_points = sum(label_counts_values)
+    bcubed = {0: 0.0 for i in range(0)}
+
+    for cluster, elements in cluster_elements.items():
+
+        cluster_labeled_points_dict = {0: 0 for i in range(0)}
+
+        cluster_labeled_points = list(set(elements) & set(label_indices.keys()))
+        bcubed[cluster] = 0.0
+
+        if len(cluster_labeled_points) > 0:
+
+            for p in cluster_labeled_points:
+                p_label = label_indices[p]
+                if p_label in cluster_labeled_points_dict:
+                    cluster_labeled_points_dict[p_label] += 1
+                else:
+                    cluster_labeled_points_dict[p_label] = 1
+
+            for label, num_points in cluster_labeled_points_dict.items():
+
+                total_num_of_class_label = label_counts[label]
+                num_labeled_in_node = len(cluster_labeled_points)
+
+                precision_point = (num_points / num_labeled_in_node) / total_num_of_labeled_points
+                recall_point = (num_points / total_num_of_class_label) / total_num_of_labeled_points
+
+                # BCubed F-measure
+                bcubed[cluster] += num_points * (2.0 / (1.0 / precision_point + 1.0 / recall_point))
+    return bcubed
+
+
+@numba.njit()
+def extract_clusters_bcubed(condensed_tree, cluster_tree, label_indices, allow_virtual_nodes=False, allow_single_cluster=False):
+
+    if allow_virtual_nodes:
+        cluster_tree_bcubed = cluster_tree_from_condensed_tree_bcubed(condensed_tree, cluster_tree, label_indices)
+        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree, cluster_tree, cluster_tree_bcubed, allow_virtual_nodes)
+        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
+        for node in virtual_nodes:
+            stability_node_scores[node] = 0.0
+        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
+    else:
+        cluster_tree_bcubed = cluster_tree
+        cluster_elements, virtual_nodes = get_condensed_tree_clusters_bcubed(condensed_tree)
+        stability_node_scores = score_condensed_tree_nodes(condensed_tree)
+        bcubed_node_scores = score_condensed_tree_nodes_bcubed(cluster_elements, label_indices)
+
+    selected_clusters = {node: False for node in bcubed_node_scores}
+
+    if len(cluster_tree_bcubed.parent) == 0:
+        return np.zeros(0, dtype=np.int64)
+
+    cluster_tree_root = cluster_tree_bcubed.parent.min()
+
+    if allow_single_cluster:
+        eom_recursion_bcubed(cluster_tree_root, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
+    elif len(bcubed_node_scores) > 1:
+        root_children = cluster_tree_bcubed.child[cluster_tree_bcubed.parent == cluster_tree_root]
+        for child_node in root_children:
+            eom_recursion_bcubed(child_node, cluster_tree_bcubed, stability_node_scores, bcubed_node_scores, selected_clusters)
+
+    return np.asarray([node for node, selected in selected_clusters.items() if (selected and (node not in virtual_nodes))])
+
+
 @numba.njit()
 def score_condensed_tree_nodes(condensed_tree):
     result = {0: 0.0 for i in range(0)}
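To make the per-cluster BCubed score concrete, here is a small plain-Python sketch of the same arithmetic as score_condensed_tree_nodes_bcubed above (illustrative numbers, not part of the commit; variable names mirror the function):

    # Suppose the data set has 5 labeled points: 3 of class 0 and 2 of class 1,
    # and the cluster under consideration contains 2 of class 0 and 1 of class 1.
    label_counts = {0: 3, 1: 2}
    total_num_of_labeled_points = sum(label_counts.values())            # 5
    cluster_labeled_points_dict = {0: 2, 1: 1}
    num_labeled_in_node = sum(cluster_labeled_points_dict.values())     # 3

    score = 0.0
    for label, num_points in cluster_labeled_points_dict.items():
        precision_point = (num_points / num_labeled_in_node) / total_num_of_labeled_points
        recall_point = (num_points / label_counts[label]) / total_num_of_labeled_points
        # Harmonic mean of per-point precision and recall, weighted by num_points.
        score += num_points * (2.0 / (1.0 / precision_point + 1.0 / recall_point))

    print(score)  # ~0.347; the bcubed[cluster] value consumed by eom_recursion_bcubed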

fast_hdbscan/hdbscan.py

Lines changed: 64 additions & 10 deletions
@@ -1,7 +1,7 @@
 import numpy as np
 
 from sklearn.base import BaseEstimator, ClusterMixin
-from sklearn.utils import check_array
+from sklearn.utils import check_array, check_X_y
 from sklearn.utils.validation import check_is_fitted, _check_sample_weight
 from sklearn.neighbors import KDTree
 

@@ -20,6 +20,7 @@
     get_cluster_label_vector,
     get_point_membership_strength_vector,
     cluster_tree_from_condensed_tree,
+    extract_clusters_bcubed,
 )
 
 try:

@@ -29,6 +30,8 @@
 except ImportError:
     _HAVE_HDBSCAN = False
 
+from numba.typed import Dict
+
 
 def to_numpy_rec_array(named_tuple_tree):
     size = named_tuple_tree.parent.shape[0]

@@ -132,6 +135,9 @@ def remap_single_linkage_tree(tree, internal_to_raw, outliers):
 
 def fast_hdbscan(
     data,
+    data_labels=None,
+    semi_supervised=False,
+    ss_algorithm=None,
     min_samples=10,
     min_cluster_size=10,
     cluster_selection_method="eom",

@@ -142,6 +148,16 @@
 ):
     data = check_array(data)
 
+    if semi_supervised and data_labels is None:
+        raise ValueError("data_labels must not be None when semi_supervised is set to True!")
+
+    if semi_supervised:
+        label_indices = np.flatnonzero(data_labels > -1)
+        label_values = data_labels[label_indices]
+        data_labels_dict = Dict()
+        for index, label in zip(label_indices, label_values):
+            data_labels_dict[index] = label
+
     if (
         (not (np.issubdtype(type(min_samples), np.integer) or min_samples is None))
         or not np.issubdtype(type(min_cluster_size), np.integer)

@@ -171,9 +187,25 @@ def fast_hdbscan(
     cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
 
     if cluster_selection_method == "eom":
-        selected_clusters = extract_eom_clusters(
-            condensed_tree, cluster_tree, allow_single_cluster=allow_single_cluster
-        )
+        if semi_supervised:
+            if ss_algorithm == "bc":
+                selected_clusters = extract_clusters_bcubed(condensed_tree,
+                                                            cluster_tree,
+                                                            data_labels_dict,
+                                                            allow_virtual_nodes=True,
+                                                            allow_single_cluster=allow_single_cluster)
+            elif ss_algorithm == "bc_without_vn":
+                selected_clusters = extract_clusters_bcubed(condensed_tree,
+                                                            cluster_tree,
+                                                            data_labels_dict,
+                                                            allow_virtual_nodes=False,
+                                                            allow_single_cluster=allow_single_cluster)
+            else:
+                raise ValueError(f"Invalid ss_algorithm {ss_algorithm}")
+        else:
+            selected_clusters = extract_eom_clusters(condensed_tree,
+                                                     cluster_tree,
+                                                     allow_single_cluster=allow_single_cluster)
     elif cluster_selection_method == "leaf":
         selected_clusters = extract_leaves(
             condensed_tree, allow_single_cluster=allow_single_cluster

@@ -206,32 +238,54 @@ def __init__(
         cluster_selection_method="eom",
         allow_single_cluster=False,
         cluster_selection_epsilon=0.0,
+        semi_supervised=False,
+        ss_algorithm=None,
         **kwargs,
     ):
         self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
         self.cluster_selection_method = cluster_selection_method
         self.allow_single_cluster = allow_single_cluster
         self.cluster_selection_epsilon = cluster_selection_epsilon
+        self.semi_supervised = semi_supervised
+        self.ss_algorithm = ss_algorithm
 
-    def fit(self, X, y=None, sample_weight=None, **fit_params):
-        X = check_array(X, accept_sparse="csr", force_all_finite=False)
-        if sample_weight is not None:
-            sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
-        self._raw_data = X
+    def fit(self, X, y=None, sample_weight=None, **fit_params):
+
+        if self.semi_supervised:
+            X, y = check_X_y(X, y, accept_sparse="csr", force_all_finite=False)
+            if sample_weight is not None:
+                sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
+            self._raw_labels = y
+            # Replace non-finite labels with -1 labels
+            y[~np.isfinite(y)] = -1
+
+            if ~np.any(y != -1):
+                raise ValueError("y must contain at least one label > -1. Currently it only contains -1 and/or non-finite labels!")
+        else:
+            X = check_array(X, accept_sparse="csr", force_all_finite=False)
+            if sample_weight is not None:
+                sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
+        self._raw_data = X
 
         self._all_finite = np.all(np.isfinite(X))
         if ~self._all_finite:
             # Pass only the purely finite indices into hdbscan
             # We will later assign all non-finite points to the background -1 cluster
             finite_index = np.where(np.isfinite(X).sum(axis=1) == X.shape[1])[0]
             clean_data = X[finite_index]
+            clean_data_labels = y
+
+            if self.semi_supervised:
+                clean_data_labels = y[finite_index]
+
             internal_to_raw = {
                 x: y for x, y in zip(range(len(finite_index)), finite_index)
             }
             outliers = list(set(range(X.shape[0])) - set(finite_index))
         else:
             clean_data = X
+            clean_data_labels = y
 
         kwargs = self.get_params()
 

@@ -241,7 +295,7 @@ def fit(self, X, y=None, sample_weight=None, **fit_params):
             self._single_linkage_tree,
             self._condensed_tree,
             self._min_spanning_tree,
-        ) = fast_hdbscan(clean_data, return_trees=True, sample_weights=sample_weight, **kwargs)
+        ) = fast_hdbscan(clean_data, clean_data_labels, return_trees=True, sample_weights=sample_weight, **kwargs)
 
         self._condensed_tree = to_numpy_rec_array(self._condensed_tree)
 