Merge pull request #19 from krstopro/master

krstopro · web-flow · commit f57d6d743db0 · 2025-05-14T20:11:26.000+02:00
Add `predict` to BAHC and `get_column_dtypes` to utils
diff --git a/tests/test_bahc.py b/tests/test_bahc.py
@@ -3,7 +3,7 @@
 
 
 def test_shapes():
-    # Checks that labels and biases have the right shapes
+    # Checks that labels and scores have the right shapes
     rng = np.random.RandomState(12)
     X = rng.rand(20, 10)
     y = rng.rand(20)
@@ -23,11 +23,40 @@ def test_labels():
     assert np.array_equal(np.unique(bahc.labels_), np.arange(bahc.n_clusters_))
 
 
-def test_biases():
-    # Checks that biases are sorted in descending order
+# TODO: Add test_cluster_sizes, which should check that every cluster has at
+# least bahc_min_cluster_size samples
+
+
+def test_scores():
+    # Checks that each cluster's score equals the mean of y outside the
+    # cluster minus the mean of y inside the cluster
+    rng = np.random.RandomState(12)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
+    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
+    bahc.fit(X, y)
+    # TODO: Check this!!!
+    for i in range(bahc.n_clusters_):
+        # One boolean mask selects the cluster, its negation the complement
+        in_cluster = bahc.labels_ == i
+        score = np.mean(y[~in_cluster]) - np.mean(y[in_cluster])
+        # Compare with a tolerance: exact equality on floats would tie the
+        # test to the order in which fit happens to do its arithmetic
+        assert np.isclose(bahc.scores_[i], score)
+
+
+def test_scores_are_sorted():
+    # Checks that scores are sorted in descending order
     rng = np.random.RandomState(12)
     X = rng.rand(20, 10)
     y = rng.rand(20)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
+    # Every score must be at least as large as the one that follows it
     assert np.all(bahc.scores_[:-1] >= bahc.scores_[1:])
+
+
+def test_predict():
+    # Predicting on the training data must reproduce the labels found by fit
+    random_state = np.random.RandomState(12)
+    features = random_state.rand(20, 10)
+    targets = random_state.rand(20)
+    model = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
+    model.fit(features, targets)
+    predicted_labels = model.predict(features)
+    assert np.array_equal(predicted_labels, model.labels_)
diff --git a/unsupervised_bias_detection/__init__.py b/unsupervised_bias_detection/__init__.py
@@ -1 +1 @@
-"""unsupervised-bias-detection."""
+"""unsupervised-bias-detection."""
diff --git a/unsupervised_bias_detection/cluster/__init__.py b/unsupervised_bias_detection/cluster/__init__.py
@@ -1,9 +1,11 @@
 """The :mod:`unsupervised_bias_detection.cluster` module implements bias-aware clustering algorithms."""
 
+from ._bahc import BiasAwareHierarchicalClustering
 from ._kmeans import BiasAwareHierarchicalKMeans
 from ._kmodes import BiasAwareHierarchicalKModes
 
 __all__ = [
+    "BiasAwareHierarchicalClustering",
     "BiasAwareHierarchicalKMeans",
     "BiasAwareHierarchicalKModes",
 ]
diff --git a/unsupervised_bias_detection/cluster/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
@@ -1,3 +1,5 @@
+from ._cluster_node import ClusterNode
+from collections import deque
 import heapq
 from numbers import Integral
 import numpy as np
@@ -58,22 +60,25 @@ def fit(self, X, y):
             order="C",
         )
         n_samples, _ = X.shape
-        # We start with all samples in a single cluster
+        # We start with all samples being in a single cluster
         self.n_clusters_ = 1
         # We assign all samples a label of zero
         labels = np.zeros(n_samples, dtype=np.uint32)
-        clusters = []
+        leaves = []
         scores = []
         label = 0
+        root = ClusterNode(label)
+        self.cluster_tree_ = root
         # The entire dataset has a discrimination score of zero
         score = 0
-        heap = [(None, label, score)]
+        heap = [(None, root, score)]
         for _ in range(self.bahc_max_iter):
             if not heap:
                 # If the heap is empty we stop iterating
                 break
             # Take the cluster with the highest standard deviation of metric y
-            _, label, score = heapq.heappop(heap)
+            _, node, score = heapq.heappop(heap)
+            label = node.label
             cluster_indices = np.nonzero(labels == label)[0]
             cluster = X[cluster_indices]
 
@@ -90,39 +95,82 @@ def fit(self, X, y):
                 and len(indices1) >= self.bahc_min_cluster_size
             ):
                 # We calculate the discrimination scores using formula (1) in [1]
+                # TODO: Move y[indices0] and y[indices1] into separate variables
+                # to avoid recomputing them
+                # Maybe create a function to compute the score
                 mask0 = np.ones(n_samples, dtype=bool)
                 mask0[indices0] = False
                 score0 = np.mean(y[mask0]) - np.mean(y[indices0])
                 mask1 = np.ones(n_samples, dtype=bool)
                 mask1[indices1] = False
                 score1 = np.mean(y[mask1]) - np.mean(y[indices1])
                 if max(score0, score1) >= score:
+                    std0 = np.std(y[indices0])
+                    node0 = ClusterNode(label)
                     # heapq implements min-heap
                     # so we have to negate std before pushing
-                    std0 = np.std(y[indices0])
-                    heapq.heappush(heap, (-std0, label, score0))
+                    heapq.heappush(heap, (-std0, node0, score0))
                     std1 = np.std(y[indices1])
-                    heapq.heappush(heap, (-std1, self.n_clusters_, score1))
+                    node1 = ClusterNode(self.n_clusters_)
+                    heapq.heappush(heap, (-std1, node1, score1))
                     labels[indices1] = self.n_clusters_
+                    # TODO: Increase n_clusters_ by clustering_model.n_clusters_ - 1
                     self.n_clusters_ += 1
+                    children = [node0, node1]
+                    node.split(clustering_model, children)
                 else:
-                    clusters.append(label)
+                    leaves.append(node)
                     scores.append(score)
             else:
-                clusters.append(label)
+                leaves.append(node)
                 scores.append(score)
         if heap:
-            clusters = np.concatenate([clusters, [label for _, label, _ in heap]])
+            # TODO: Check if this can be made more efficient
+            leaves.extend((node for _, node, _ in heap))
             scores = np.concatenate([scores, [score for _, _, score in heap]])
         else:
-            clusters = np.array(clusters)
             scores = np.array(scores)
 
         # We sort clusters by decreasing scores
-        indices = np.argsort(-scores)
-        clusters = clusters[indices]
-        self.scores_ = scores[indices]
-        mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
-        mapping[clusters] = np.arange(self.n_clusters_, dtype=np.uint32)
-        self.labels_ = mapping[labels]
+        sorted_indices = np.argsort(-scores)
+        self.scores_ = scores[sorted_indices]
+        leaf_labels = np.array([leaf.label for leaf in leaves])
+        leaf_labels = leaf_labels[sorted_indices]
+        label_mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
+        label_mapping[leaf_labels] = np.arange(self.n_clusters_, dtype=np.uint32)
+        self.labels_ = label_mapping[labels]
+        for leaf in leaves:
+            leaf.label = label_mapping[leaf.label]
         return self
+    
+    def predict(self, X):
+        """Predict the cluster labels for the given data.
+
+        Each sample is routed from the root of `cluster_tree_` down to a leaf:
+        at every internal node the node's clustering model assigns the sample
+        to one of the node's children, and the label of the leaf that is
+        reached becomes the predicted label.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples to assign to clusters. It must expose `shape` and support
+            indexing with an integer array.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,), dtype=uint32
+            Label of the leaf cluster each sample ends up in.
+        """
+        # TODO: Assert that fit has been called
+        # TODO: Assert that X has the same number of features as the data used to fit
+        # TODO: Assert that clustering_model has predict method
+        # TODO: Validate X
+        n_samples, _ = X.shape
+        labels = np.zeros(n_samples, dtype=np.uint32)
+        # Breadth-first traversal; every queue entry pairs a node with the
+        # indices of the samples that were routed to it
+        queue = deque([(self.cluster_tree_, np.arange(n_samples))])
+        while queue:
+            node, indices = queue.popleft()
+            if node.is_leaf:
+                labels[indices] = node.label
+                continue
+            cluster_labels = np.asarray(node.clustering_model.predict(X[indices]))
+            # Iterate over the node's children rather than over the labels
+            # that happen to occur in this batch: inferring the number of
+            # clusters from np.unique(cluster_labels) skips every child whose
+            # index is >= the number of distinct labels seen (e.g. a batch
+            # assigned entirely to child 1), leaving those samples at label 0
+            for i, child in enumerate(node.children):
+                child_indices = indices[cluster_labels == i]
+                if child_indices.size > 0:
+                    queue.append((child, child_indices))
+        return labels
diff --git a/unsupervised_bias_detection/cluster/_cluster_node.py b/unsupervised_bias_detection/cluster/_cluster_node.py
@@ -0,0 +1,54 @@
+from sklearn.base import ClusterMixin
+from typing import Self
+
+class ClusterNode:
+    """Node of a cluster tree.
+
+    A node starts as a leaf that carries a cluster label. Calling `split`
+    turns it into an internal node that stores the clustering model used for
+    the split together with the resulting child nodes.
+    """
+
+    def __init__(self, label: int):
+        """
+        Initialize a node in the cluster tree.
+
+        Parameters
+        ----------
+        label : int
+            The cluster label for this node (required as all nodes start as leaves)
+        """
+        self.label = label
+        self.clustering_model = None
+        self.children = []
+
+    def __repr__(self):
+        return f"{type(self).__name__}(label={self.label!r}, n_children={len(self.children)})"
+
+    def __lt__(self, other):
+        """
+        Order nodes by their label.
+
+        Nodes are stored inside tuples on a heap; when the leading tuple
+        entries tie, the heap falls back to comparing the nodes themselves,
+        which would raise TypeError without an ordering. Only labelled nodes
+        (leaves) can be compared this way.
+        """
+        if not isinstance(other, ClusterNode):
+            return NotImplemented
+        return self.label < other.label
+
+    @property
+    def is_leaf(self):
+        """Whether this node has no children."""
+        return len(self.children) == 0
+
+    def split(self, clustering_model: "ClusterMixin", children: list[Self]):
+        """
+        Split this node by setting its clustering model and adding children.
+
+        This converts the node to an internal node and removes its label
+
+        Parameters
+        ----------
+        clustering_model : ClusterMixin
+            The clustering model used to split this node
+        children : list of ClusterNode
+            The child nodes resulting from the split
+        """
+        self.label = None
+        self.clustering_model = clustering_model
+        self.children = children
+
+    def get_leaves(self) -> list[Self]:
+        """
+        Get all leaf nodes in the subtree rooted at this node.
+
+        Returns
+        -------
+        list of ClusterNode
+            All leaf nodes in the subtree, in depth-first, left-to-right order
+        """
+        # Explicit stack instead of recursion so that a deep tree cannot hit
+        # the interpreter's recursion limit
+        leaves = []
+        stack = [self]
+        while stack:
+            node = stack.pop()
+            if node.is_leaf:
+                leaves.append(node)
+            else:
+                # Push children reversed so the leftmost child is visited first
+                stack.extend(reversed(node.children))
+        return leaves
diff --git a/unsupervised_bias_detection/cluster/_kmeans.py b/unsupervised_bias_detection/cluster/_kmeans.py
@@ -8,9 +8,9 @@ class BiasAwareHierarchicalKMeans(BaseEstimator, ClusterMixin):
 
     Parameters
     ----------
-    hbac_max_iter : int
+    bahc_max_iter : int
         Maximum number of iterations.
-    hbac_min_cluster_size : int
+    bahc_min_cluster_size : int
         Minimum size of a cluster.
     kmeans_params : dict
         k-means parameters
@@ -48,6 +48,7 @@ def __init__(
         bahc_min_cluster_size,
         **kmeans_params,
     ):
+        # TODO: Remove this once we have a better way to handle the number of clusters
         if "n_clusters" in kmeans_params and kmeans_params["n_clusters"] != 2:
             raise ValueError(
                 f"The parameter `n_clusters` should be 2, got {kmeans_params['n_clusters']}."
@@ -60,16 +61,20 @@ def __init__(
 
         self.bahc_max_iter = bahc_max_iter
         self.bahc_min_cluster_size = bahc_min_cluster_size
-        self._hbac = BiasAwareHierarchicalClustering(
+        self._bahc = BiasAwareHierarchicalClustering(
             KMeans,
             bahc_max_iter,
             bahc_min_cluster_size,
             **kmeans_params,
         )
 
     def fit(self, X, y):
+        """Fit the wrapped clustering on (X, y) and copy its fitted attributes.
+
+        After fitting, `n_clusters_`, `labels_`, `scores_` and `cluster_tree_`
+        of the wrapped estimator are exposed on this estimator.
+
+        Returns
+        -------
+        self
+            This estimator.
+        """
-        self._hbac.fit(X, y)
-        self.n_clusters_ = self._hbac.n_clusters_
-        self.labels_ = self._hbac.labels_
-        self.scores_ = self._hbac.scores_
+        self._bahc.fit(X, y)
+        self.n_clusters_ = self._bahc.n_clusters_
+        self.labels_ = self._bahc.labels_
+        self.scores_ = self._bahc.scores_
+        self.cluster_tree_ = self._bahc.cluster_tree_
         return self
+    
+    def predict(self, X):
+        """Delegate prediction to the wrapped `_bahc` estimator and return its labels."""
+        predicted_labels = self._bahc.predict(X)
+        return predicted_labels
diff --git a/unsupervised_bias_detection/cluster/_kmodes.py b/unsupervised_bias_detection/cluster/_kmodes.py
@@ -43,6 +43,7 @@ class BiasAwareHierarchicalKModes(BaseEstimator, ClusterMixin):
     """
 
     def __init__(self, bahc_max_iter, bahc_min_cluster_size, **kmodes_params):
+        # TODO: Remove this once we have a better way to handle the number of clusters
         if "n_clusters" in kmodes_params and kmodes_params["n_clusters"] != 2:
             raise ValueError(
                 f"The parameter `n_clusters` should be 2, got {kmodes_params['n_clusters']}."
@@ -61,4 +62,8 @@ def fit(self, X, y):
         self.n_clusters_ = self._hbac.n_clusters_
         self.labels_ = self._hbac.labels_
         self.scores_ = self._hbac.scores_
+        self.cluster_tree_ = self._hbac.cluster_tree_
         return self
+
+    def predict(self, X):
+        """Delegate prediction to the wrapped `_hbac` estimator and return its labels."""
+        predicted_labels = self._hbac.predict(X)
+        return predicted_labels
diff --git a/unsupervised_bias_detection/utils/__init__.py b/unsupervised_bias_detection/utils/__init__.py
@@ -0,0 +1,7 @@
+"""The :mod:`unsupervised_bias_detection.utils` module implements utility functions."""
+
+from ._get_column_dtypes import get_column_dtypes
+
+__all__ = [
+    "get_column_dtypes",
+]
diff --git a/unsupervised_bias_detection/utils/_get_column_dtypes.py b/unsupervised_bias_detection/utils/_get_column_dtypes.py
@@ -0,0 +1,33 @@
+import numpy as np
+import pandas as pd
+
+
+def get_column_dtypes(data) -> dict:
+    """
+    Map every column of `data` to an abstract data type name.
+
+    The abstract type is derived from the string form of the column dtype:
+    - "float64", "float32", "int64", "int32" -> "numerical"
+    - "bool" -> "boolean"
+    - any dtype whose name contains "datetime" -> "datetime"
+    - any dtype whose name contains "timedelta" -> "timedelta"
+    - everything else (e.g., object) -> "categorical"
+
+    `data` must be a pandas DataFrame or a structured numpy array; any other
+    input raises TypeError.
+    """
+    numerical_names = ("float64", "float32", "int64", "int32")
+
+    def classify(dtype_name: str) -> str:
+        if dtype_name in numerical_names:
+            return "numerical"
+        if dtype_name == "bool":
+            return "boolean"
+        # The abstract name equals the keyword found in the dtype name
+        for keyword in ("datetime", "timedelta"):
+            if keyword in dtype_name:
+                return keyword
+        return "categorical"
+
+    if isinstance(data, pd.DataFrame):
+        return {column: classify(str(dtype)) for column, dtype in data.dtypes.items()}
+
+    if isinstance(data, np.ndarray) and data.dtype.names is not None:
+        fields = data.dtype.fields
+        # Each field entry starts with the dtype of that field
+        return {name: classify(str(fields[name][0])) for name in data.dtype.names}
+
+    raise TypeError("Data must be a pandas DataFrame or a structured numpy array.")

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-"""unsupervised-bias-detection."""`
	`1`	`+"""unsupervised-bias-detection."""`