Lint and add __init__.py to utils

Krsto Proroković · Krsto Proroković · commit a2f71582bf37 · 2025-05-14T20:08:58.000+02:00
diff --git a/tests/test_bahc.py b/tests/test_bahc.py
@@ -3,7 +3,7 @@
 
 
 def test_shapes():
-    # Checks that labels and biases have the right shapes
+    # Checks that labels and scores have the right shapes
     rng = np.random.RandomState(12)
     X = rng.rand(20, 10)
     y = rng.rand(20)
@@ -23,11 +23,40 @@ def test_labels():
     assert np.array_equal(np.unique(bahc.labels_), np.arange(bahc.n_clusters_))
 
 
-def test_biases():
-    # Checks that biases are sorted in descending order
+# TODO: add test_cluster_sizes()
+# It should check that cluster sizes are at least bahc_min_cluster_size
+
+
+def test_scores():
+    # Checks that each score is mean(y outside cluster) - mean(y inside cluster)
+    rng = np.random.RandomState(12)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
+    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
+    bahc.fit(X, y)
+    # TODO: confirm this expected-score formula against fit()
+    for i in range(bahc.n_clusters_):
+        in_cluster = bahc.labels_ == i
+        # Boolean masks avoid hard-coding the sample count into the test
+        score = np.mean(y[~in_cluster]) - np.mean(y[in_cluster])
+        assert np.isclose(bahc.scores_[i], score)  # tolerate float rounding
+
+
+def test_scores_are_sorted():
+    # Checks that scores_ is in non-increasing order (equal neighbours allowed)
     rng = np.random.RandomState(12)
     X = rng.rand(20, 10)
     y = rng.rand(20)
     bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
     bahc.fit(X, y)
     assert np.all(bahc.scores_[:-1] >= bahc.scores_[1:])
+
+
+def test_predict():
+    # Predicting on the training data must reproduce the labels found by fit
+    seeded = np.random.RandomState(12)
+    features, targets = seeded.rand(20, 10), seeded.rand(20)
+    model = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
+    model.fit(features, targets)
+    predicted_labels = model.predict(features)
+    assert np.array_equal(predicted_labels, model.labels_)
diff --git a/unsupervised_bias_detection/cluster/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
@@ -97,6 +97,7 @@ def fit(self, X, y):
                 # We calculate the discrimination scores using formula (1) in [1]
                 # TODO: Move y[indices0] and y[indices1] into separate variables
                 # to avoid recomputing them
+                # TODO: Maybe create a function to compute the score
                 mask0 = np.ones(n_samples, dtype=bool)
                 mask0[indices0] = False
                 score0 = np.mean(y[mask0]) - np.mean(y[indices0])
diff --git a/unsupervised_bias_detection/utils/__init__.py b/unsupervised_bias_detection/utils/__init__.py
@@ -0,0 +1,7 @@
+"""The :mod:`unsupervised_bias_detection.utils` module implements utility functions."""
+
+from ._get_column_dtypes import get_column_dtypes
+
+__all__ = [
+    "get_column_dtypes",
+]
diff --git a/unsupervised_bias_detection/utils/_get_column_dtypes.py b/unsupervised_bias_detection/utils/_get_column_dtypes.py
@@ -0,0 +1,33 @@
+import numpy as np
+import pandas as pd
+
+
+def get_column_dtypes(data) -> dict:
+    """Map each column of ``data`` to an abstract data type for the processor.
+
+    ``data`` must be a pandas DataFrame or a structured numpy array; anything
+    else raises ``TypeError``. Each column (field) name is mapped as follows:
+
+    - float64, float32, int64, int32 -> "numerical"
+    - bool -> "boolean"
+    - datetime64[...] -> "datetime"
+    - timedelta64[...] -> "timedelta"
+    - All others (e.g., object) -> "categorical"
+    """
+    numerical_names = ("float64", "float32", "int64", "int32")
+
+    def label_for(dtype_name: str) -> str:
+        if dtype_name in numerical_names:
+            return "numerical"
+        if dtype_name == "bool":
+            return "boolean"
+        for keyword in ("datetime", "timedelta"):  # substring match, in order
+            if keyword in dtype_name:
+                return keyword
+        return "categorical"
+
+    if isinstance(data, pd.DataFrame):
+        return {col: label_for(str(kind)) for col, kind in data.dtypes.items()}
+    if isinstance(data, np.ndarray) and data.dtype.names is not None:
+        return {name: label_for(str(data.dtype[name])) for name in data.dtype.names}
+    raise TypeError("Data must be a pandas DataFrame or a structured numpy array.")