Update

Krsto Proroković · Krsto Proroković · commit 960d13b25019 · 2025-05-13T19:15:39.000+02:00
diff --git a/unsupervised_bias_detection/cluster/__init__.py b/unsupervised_bias_detection/cluster/__init__.py
@@ -1,9 +1,11 @@
 """The :mod:`unsupervised_bias_detection.cluster` module implements bias-aware clustering algorithms."""
 
+from ._bahc import BiasAwareHierarchicalClustering
 from ._kmeans import BiasAwareHierarchicalKMeans
 from ._kmodes import BiasAwareHierarchicalKModes
 
 __all__ = [
+    "BiasAwareHierarchicalClustering",
     "BiasAwareHierarchicalKMeans",
     "BiasAwareHierarchicalKModes",
 ]
diff --git a/unsupervised_bias_detection/cluster/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
@@ -1,3 +1,5 @@
+from ._cluster_node import ClusterNode
+from collections import deque
 import heapq
 from numbers import Integral
 import numpy as np
@@ -58,22 +60,25 @@ def fit(self, X, y):
             order="C",
         )
         n_samples, _ = X.shape
-        # We start with all samples in a single cluster
+        # We start with all samples being in a single cluster
         self.n_clusters_ = 1
         # We assign all samples a label of zero
         labels = np.zeros(n_samples, dtype=np.uint32)
-        clusters = []
+        leaves = []
         scores = []
         label = 0
+        root = ClusterNode(label)
+        self.cluster_tree_ = root
         # The entire dataset has a discrimination score of zero
         score = 0
-        heap = [(None, label, score)]
+        heap = [(None, root, score)]
         for _ in range(self.bahc_max_iter):
             if not heap:
                 # If the heap is empty we stop iterating
                 break
             # Take the cluster with the highest standard deviation of metric y
-            _, label, score = heapq.heappop(heap)
+            _, node, score = heapq.heappop(heap)
+            label = node.label
             cluster_indices = np.nonzero(labels == label)[0]
             cluster = X[cluster_indices]
 
@@ -97,32 +102,73 @@ def fit(self, X, y):
                 mask1[indices1] = False
                 score1 = np.mean(y[mask1]) - np.mean(y[indices1])
                 if max(score0, score1) >= score:
+                    std0 = np.std(y[indices0])
+                    node0 = ClusterNode(label)
                     # heapq implements min-heap
                     # so we have to negate std before pushing
-                    std0 = np.std(y[indices0])
-                    heapq.heappush(heap, (-std0, label, score0))
+                    heapq.heappush(heap, (-std0, node0, score0))
                     std1 = np.std(y[indices1])
-                    heapq.heappush(heap, (-std1, self.n_clusters_, score1))
+                    node1 = ClusterNode(self.n_clusters_)
+                    heapq.heappush(heap, (-std1, node1, score1))
                     labels[indices1] = self.n_clusters_
+                    # TODO: Increase n_clusters_ by clustering_model.n_clusters_ - 1
                     self.n_clusters_ += 1
+                    children = [node0, node1]
+                    node.split(clustering_model, children)
                 else:
-                    clusters.append(label)
+                    leaves.append(node)
                     scores.append(score)
             else:
-                clusters.append(label)
+                leaves.append(node)
                 scores.append(score)
         if heap:
-            clusters = np.concatenate([clusters, [label for _, label, _ in heap]])
+            # TODO: Check if this can be made more efficient
+            leaves.extend((node for _, node, _ in heap))
             scores = np.concatenate([scores, [score for _, _, score in heap]])
         else:
-            clusters = np.array(clusters)
             scores = np.array(scores)
 
         # We sort clusters by decreasing scores
         indices = np.argsort(-scores)
-        clusters = clusters[indices]
         self.scores_ = scores[indices]
-        mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
-        mapping[clusters] = np.arange(self.n_clusters_, dtype=np.uint32)
-        self.labels_ = mapping[labels]
+        leaf_labels = np.array([leaf.label for leaf in leaves])
+        leaf_labels = leaf_labels[indices]
+        # FIXME: leaves[i] receives the old label of the rank-i leaf, not its own rank (label_mapping[old label])
+        for i, leaf in enumerate(leaves):
+            leaf.label = leaf_labels[i]
+        label_mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
+        label_mapping[leaf_labels] = np.arange(self.n_clusters_, dtype=np.uint32)
+        self.labels_ = label_mapping[labels]
         return self
+    
+    def predict(self, X):
+        """Predict the cluster labels for the given data.
+
+        Samples are routed down ``cluster_tree_`` until they reach a leaf.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,), dtype uint32
+        """
+        # TODO: Assert that fit has been called
+        # TODO: Assert that clustering_model has predict method
+        # TODO: Validate X (incl. same number of features as the data used to fit)
+        n_samples, _ = X.shape
+        labels = np.zeros(n_samples, dtype=np.uint32)
+        queue = deque([(self.cluster_tree_, np.arange(n_samples))])
+        while queue:
+            node, indices = queue.popleft()
+            if node.is_leaf:
+                labels[indices] = node.label
+                continue
+            cluster_labels = node.clustering_model.predict(X[indices])
+            # Walk children, not labels seen in this batch, so no sample is dropped
+            for i, child in enumerate(node.children):
+                child_indices = indices[np.nonzero(cluster_labels == i)[0]]
+                if child_indices.size > 0:
+                    queue.append((child, child_indices))
+        return labels
diff --git a/unsupervised_bias_detection/cluster/_cluster_node.py b/unsupervised_bias_detection/cluster/_cluster_node.py
@@ -4,10 +4,10 @@
 class ClusterNode:
     def __init__(self, label: int):
         """
-        Initialize a node in the cluster tree
+        Initialize a node in the cluster tree.
         
-        Parameters:
-        -----------
+        Parameters
+        ----------
         label : int
             The cluster label for this node (required as all nodes start as leaves)
         """
@@ -21,12 +21,12 @@ def is_leaf(self):
     
     def split(self, clustering_model: ClusterMixin, children: list[Self]):
         """
-        Split this node by setting its clustering model and adding children
+        Split this node by setting its clustering model and adding children.
         
         This converts the node to an internal node and removes its label
         
-        Parameters:
-        -----------
+        Parameters
+        ----------
         clustering_model : ClusterMixin
             The clustering model used to split this node
         children : list of ClusterNode
@@ -38,10 +38,10 @@ def split(self, clustering_model: ClusterMixin, children: list[Self]):
     
     def get_leaves(self) -> list[Self]:
         """
-        Get all leaf nodes in the subtree rooted at this node
+        Get all leaf nodes in the subtree rooted at this node.
         
-        Returns:
-        --------
+        Returns
+        -------
         list of ClusterNode
             All leaf nodes in the subtree
         """
diff --git a/unsupervised_bias_detection/cluster/_kmeans.py b/unsupervised_bias_detection/cluster/_kmeans.py
@@ -8,9 +8,9 @@ class BiasAwareHierarchicalKMeans(BaseEstimator, ClusterMixin):
 
     Parameters
     ----------
-    hbac_max_iter : int
+    bahc_max_iter : int
         Maximum number of iterations.
-    hbac_min_cluster_size : int
+    bahc_min_cluster_size : int
         Minimum size of a cluster.
     kmeans_params : dict
         k-means parameters
@@ -48,6 +48,7 @@ def __init__(
         bahc_min_cluster_size,
         **kmeans_params,
     ):
+        # TODO: Remove this once we have a better way to handle the number of clusters
         if "n_clusters" in kmeans_params and kmeans_params["n_clusters"] != 2:
             raise ValueError(
                 f"The parameter `n_clusters` should be 2, got {kmeans_params['n_clusters']}."
@@ -60,16 +61,20 @@ def __init__(
 
         self.bahc_max_iter = bahc_max_iter
         self.bahc_min_cluster_size = bahc_min_cluster_size
-        self._hbac = BiasAwareHierarchicalClustering(
+        self._bahc = BiasAwareHierarchicalClustering(
             KMeans,
             bahc_max_iter,
             bahc_min_cluster_size,
             **kmeans_params,
         )
 
     def fit(self, X, y):
-        self._hbac.fit(X, y)
-        self.n_clusters_ = self._hbac.n_clusters_
-        self.labels_ = self._hbac.labels_
-        self.scores_ = self._hbac.scores_
+        self._bahc.fit(X, y)  # fit the wrapped BiasAwareHierarchicalClustering
+        self.n_clusters_ = self._bahc.n_clusters_  # mirror its fitted attributes
+        self.labels_ = self._bahc.labels_
+        self.scores_ = self._bahc.scores_
+        self.cluster_tree_ = self._bahc.cluster_tree_  # root ClusterNode, walked by predict
         return self
+    
+    def predict(self, X):
+        return self._bahc.predict(X)  # delegate to the wrapped BiasAwareHierarchicalClustering
diff --git a/unsupervised_bias_detection/cluster/_kmodes.py b/unsupervised_bias_detection/cluster/_kmodes.py
@@ -43,6 +43,7 @@ class BiasAwareHierarchicalKModes(BaseEstimator, ClusterMixin):
     """
 
     def __init__(self, bahc_max_iter, bahc_min_cluster_size, **kmodes_params):
+        # TODO: Remove this once we have a better way to handle the number of clusters
         if "n_clusters" in kmodes_params and kmodes_params["n_clusters"] != 2:
             raise ValueError(
                 f"The parameter `n_clusters` should be 2, got {kmodes_params['n_clusters']}."
@@ -61,4 +62,8 @@ def fit(self, X, y):
         self.n_clusters_ = self._hbac.n_clusters_
         self.labels_ = self._hbac.labels_
         self.scores_ = self._hbac.scores_
+        self.cluster_tree_ = self._hbac.cluster_tree_
         return self
+
+    def predict(self, X):
+        # NOTE(review): presumably the wrapped BAHC estimator; still `_hbac` here, `_kmeans.py` uses `_bahc` — confirm
+        return self._hbac.predict(X)