NGO-Algorithm-Audit
diff --git a/tests/test_bahc.py b/tests/test_bahc.py
Lines changed: 13 additions & 12 deletions
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
Lines changed: 2 additions & 1 deletion
diff --git a/tests/test_validation.py b/tests/test_validation.py
Lines changed: 49 additions & 11 deletions
diff --git a/unsupervised_bias_detection/__init__.py b/unsupervised_bias_detection/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/unsupervised_bias_detection/clustering/__init__.py b/unsupervised_bias_detection/cluster/__init__.py
rename from unsupervised_bias_detection/clustering/__init__.py
rename to unsupervised_bias_detection/cluster/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/unsupervised_bias_detection/clustering/_bahc.py b/unsupervised_bias_detection/cluster/_bahc.py
rename from unsupervised_bias_detection/clustering/_bahc.py
rename to unsupervised_bias_detection/cluster/_bahc.py
Lines changed: 49 additions & 34 deletions
@@ -1,32 +1,33 @@
 import numpy as np
-
-from unsupervised_bias_detection.clustering import BiasAwareHierarchicalKMeans
+from unsupervised_bias_detection.cluster import BiasAwareHierarchicalKMeans
 
 
 def test_shapes():
     # Checks that labels and biases have the right shapes
     rng = np.random.RandomState(12)
     X = rng.rand(20, 10)
     y = rng.rand(20)
-    hbac = BiasAwareHierarchicalKMeans(n_iter=5, min_cluster_size=2)
-    hbac.fit(X, y)
-    assert len(hbac.labels_) == len(X)
-    assert len(hbac.scores_) == hbac.n_clusters_
+    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
+    bahc.fit(X, y)
+    assert len(bahc.labels_) == len(X)
+    assert len(bahc.scores_) == bahc.n_clusters_
+
 
 def test_labels():
     # Checks that label values are between 0 and n_clusters
     rng = np.random.RandomState(12)
     X = rng.rand(20, 10)
     y = rng.rand(20)
-    hbac = BiasAwareHierarchicalKMeans(n_iter=5, min_cluster_size=2)
-    hbac.fit(X, y)
-    assert np.array_equal(np.unique(hbac.labels_), np.arange(hbac.n_clusters_))
+    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
+    bahc.fit(X, y)
+    assert np.array_equal(np.unique(bahc.labels_), np.arange(bahc.n_clusters_))
+
 
 def test_biases():
     # Checks that biases are sorted in descending order
     rng = np.random.RandomState(12)
     X = rng.rand(20, 10)
     y = rng.rand(20)
-    hbac = BiasAwareHierarchicalKMeans(n_iter=5, min_cluster_size=2)
-    hbac.fit(X, y)
-    assert np.all(hbac.scores_[:-1] >= hbac.scores_[1:])
+    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
+    bahc.fit(X, y)
+    assert np.all(bahc.scores_[:-1] >= bahc.scores_[1:])
@@ -9,7 +9,8 @@ def test_loading_dataset_passes():
     data, true_labels = load_default_dataset()
     assert data is not None and true_labels is not None
 
+
 @pytest.mark.xfail
 def test_unneeded_argument():
     """Checks that no argument is necessary for the function call."""
-    assert load_default_dataset(False) is TypeError
+    assert load_default_dataset(False) is TypeError
@@ -8,75 +8,113 @@
 
 def test_always_passes():
     """Test0: all numerical and good (no errors expected)."""
-    dict0 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [0, 1, 1], 'true_labels': [0, 0, 1]}
+    dict0 = {
+        "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
+        "preds": [0, 1, 1],
+        "true_labels": [0, 0, 1],
+    }
     df_test0 = pd.DataFrame(data=dict0)
     assert not run_checks(df_test0) is ValueError
 
+
 @pytest.mark.xfail
 def test_not_binary_y():
     """Test1: all numerical BUT predictions and labels are not binary."""
-    dict1 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [6, 7, 8], 'true_labels': [11, 0, 2]}
+    dict1 = {
+        "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
+        "preds": [6, 7, 8],
+        "true_labels": [11, 0, 2],
+    }
     df_test1 = pd.DataFrame(data=dict1)
     assert run_checks(df_test1) is ValueError
 
+
 @pytest.mark.xfail
 def test_categorical_preds():
     """Test2: all numerical BUT predictions are categorical."""
-    dict2 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': ['yellow', 'yellow', 'blue'], 'true_labels': [0, 1, 1]}
+    dict2 = {
+        "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
+        "preds": ["yellow", "yellow", "blue"],
+        "true_labels": [0, 1, 1],
+    }
     df_test2 = pd.DataFrame(data=dict2)
     assert run_checks(df_test2) is ValueError
 
+
 @pytest.mark.xfail
 def test_categorical_true_labels():
     """Test3: all numerical BUT true labels are categorical."""
-    dict3 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [0, 1, 0], 'true_labels':  ['red', 'red', 'yellow']}
+    dict3 = {
+        "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
+        "preds": [0, 1, 0],
+        "true_labels": ["red", "red", "yellow"],
+    }
     df_test3 = pd.DataFrame(data=dict3)
     assert run_checks(df_test3) is ValueError
 
+
 @pytest.mark.xfail
 def test_multiclass_preds():
     """Test4: all numerical BUT predictions are multi-class."""
-    dict4 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [0,1,2], 'true_labels': [0, 1, 1]}
+    dict4 = {
+        "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
+        "preds": [0, 1, 2],
+        "true_labels": [0, 1, 1],
+    }
     df_test4 = pd.DataFrame(data=dict4)
     assert run_checks(df_test4) is ValueError
 
+
 @pytest.mark.xfail
 def test_multiclass_true_labels():
     """Test5: all numerical BUT true labels are multi-class."""
-    dict5 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [0, 1, 1], 'true_labels': [0,1,2]}
+    dict5 = {
+        "x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
+        "preds": [0, 1, 1],
+        "true_labels": [0, 1, 2],
+    }
     df_test5 = pd.DataFrame(data=dict5)
     assert run_checks(df_test5) is ValueError
 
+
 @pytest.mark.xfail
 def test_features_nonnumerical():
     """Test6: x includes categorical values."""
-    dict6 = {'x': [[1, 'three', 2], ['blue', 100, 0], [0, 0, 0]], 'preds': [0, 1, 1], 'true_labels': [1, 1, 1]}
+    dict6 = {
+        "x": [[1, "three", 2], ["blue", 100, 0], [0, 0, 0]],
+        "preds": [0, 1, 1],
+        "true_labels": [1, 1, 1],
+    }
     df_test6 = pd.DataFrame(data=dict6)
     assert run_checks(df_test6) is ValueError
 
+
 @pytest.mark.xfail
 def test_two_missing_columns():
     """Test7: only features present, missing predictions and true labels."""
-    dict7 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]]}
+    dict7 = {"x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]]}
     df_test7 = pd.DataFrame(data=dict7)
     assert run_checks(df_test7) is IndexError
 
+
 @pytest.mark.xfail
 def test_missing_true_labels():
     """Test8: true labels column missing."""
-    dict8 = {'x': [[1, 'three', 2], ['blue', 100, 0], [0, 0, 0]], 'preds': [0, 1, 1]}
+    dict8 = {"x": [[1, "three", 2], ["blue", 100, 0], [0, 0, 0]], "preds": [0, 1, 1]}
     df_test8 = pd.DataFrame(data=dict8)
     assert run_checks(df_test8) is IndexError
 
+
 @pytest.mark.xfail
 def test_missing_features():
     """Test9: features missing."""
-    dict9 = {'preds': [0, 1, 1], 'true_labels': [0, 1, 1]}
+    dict9 = {"preds": [0, 1, 1], "true_labels": [0, 1, 1]}
     df_test9 = pd.DataFrame(data=dict9)
     assert run_checks(df_test9) is IndexError
 
+
 @pytest.mark.xfail
 def test_not_pandas_type():
     """Test10: the data is not of type pandas."""
     array10 = np.array([[1, 2, 3, 0, 1], [4, 5, 6, 0, 0], [7, 8, 9, 1, 1]])
-    assert run_checks(array10) is TypeError
+    assert run_checks(array10) is TypeError
@@ -1 +1 @@
-"""unsupervised-bias-detection."""
+"""unsupervised-bias-detection."""
@@ -1,4 +1,4 @@
-"""The :mod:`unsupervised_bias_detection.clustering` module implements bias-aware clustering algorithms."""
+"""The :mod:`unsupervised_bias_detection.cluster` module implements bias-aware clustering algorithms."""
 
 from ._kmeans import BiasAwareHierarchicalKMeans
 from ._kmodes import BiasAwareHierarchicalKModes
 
@@ -1,24 +1,37 @@
-import numpy as np
 import heapq
-from abc import ABC, abstractmethod
+from numbers import Integral
+import numpy as np
 from sklearn.base import BaseEstimator, ClusterMixin
+from sklearn.utils._param_validation import Interval
+from sklearn.utils.validation import validate_data
+from typing import Any, Type
 
 
-class BiasAwareHierarchicalClustering(ABC, BaseEstimator, ClusterMixin):
-    """
-    Base class for Bias-Aware Hierarchical Clustering.
-
-    This abstract class specifies an interface for all bias-aware hierarchical clustering classes.
+class BiasAwareHierarchicalClustering(BaseEstimator, ClusterMixin):
+    """Bias-aware hierarchical clustering that splits clusters with a pluggable estimator (TODO: document parameters).
 
     References
     ----------
     .. [1] J. Misztal-Radecka, B. Indurkhya, "Bias-Aware Hierarchical Clustering for detecting the discriminated
            groups of users in recommendation systems", Information Processing & Management, vol. 58, no. 3, May. 2021.
     """
 
-    def __init__(self, n_iter, min_cluster_size):
-        self.n_iter = n_iter
-        self.min_cluster_size = min_cluster_size
+    _parameter_constraints: dict = {
+        "bahc_max_iter": [Interval(Integral, 1, None, closed="left")],
+        "bahc_min_cluster_size": [Interval(Integral, 1, None, closed="left")],
+    }
+
+    def __init__(
+        self,
+        clustering_cls: Type[ClusterMixin],
+        bahc_max_iter: int,
+        bahc_min_cluster_size: int,
+        **clustering_params: Any,
+    ):
+        self.clustering_cls = clustering_cls
+        self.bahc_max_iter = bahc_max_iter
+        self.bahc_min_cluster_size = bahc_min_cluster_size
+        self.clustering_params = clustering_params
 
     def fit(self, X, y):
         """Compute bias-aware hierarchical clustering.
@@ -36,8 +49,13 @@ def fit(self, X, y):
         self : object
             Fitted estimator.
         """
-        X, y = self._validate_data(
-            X, y, reset=False, accept_large_sparse=False, dtype=self._dtype, order="C"
+        X, y = validate_data(
+            self,
+            X,
+            y,
+            reset=False,
+            accept_large_sparse=False,
+            order="C",
         )
         n_samples, _ = X.shape
         # We start with all samples in a single cluster
@@ -50,20 +68,26 @@ def fit(self, X, y):
         # The entire dataset has a discrimination score of zero
         score = 0
         heap = [(None, label, score)]
-        for _ in range(self.n_iter):
+        for _ in range(self.bahc_max_iter):
             if not heap:
                 # If the heap is empty we stop iterating
                 break
             # Take the cluster with the highest standard deviation of metric y
             _, label, score = heapq.heappop(heap)
             cluster_indices = np.nonzero(labels == label)[0]
             cluster = X[cluster_indices]
-            cluster_labels = self._split(cluster)
+
+            clustering_model = self.clustering_cls(**self.clustering_params)
+            cluster_labels = clustering_model.fit_predict(cluster)
+
+            # TODO: Generalize for more than 2 clusters
+            # Can do this by checking clustering_model.n_clusters_ (if it exists)
+            # or by checking the number of unique values in cluster_labels
             indices0 = cluster_indices[np.nonzero(cluster_labels == 0)[0]]
             indices1 = cluster_indices[np.nonzero(cluster_labels == 1)[0]]
             if (
-                len(indices0) >= self.min_cluster_size
-                and len(indices1) >= self.min_cluster_size
+                len(indices0) >= self.bahc_min_cluster_size
+                and len(indices1) >= self.bahc_min_cluster_size
             ):
                 # We calculate the discrimination scores using formula (1) in [1]
                 mask0 = np.ones(n_samples, dtype=bool)
@@ -87,8 +111,15 @@ def fit(self, X, y):
             else:
                 clusters.append(label)
                 scores.append(score)
-        clusters = np.array(clusters + [label for _, label, _ in heap])
-        scores = np.array(scores + [score for _, _, score in heap])
+        # clusters = np.array(clusters + [label for _, label, _ in heap])
+        # scores = np.array(scores + [score for _, _, score in heap])
+        if heap:
+            clusters = np.concatenate([clusters, [label for _, label, _ in heap]])
+            scores = np.concatenate([scores, [score for _, _, score in heap]])
+        else:
+            clusters = np.array(clusters)
+            scores = np.array(scores)
+
         # We sort clusters by decreasing scores
         indices = np.argsort(-scores)
         clusters = clusters[indices]
@@ -97,19 +128,3 @@ def fit(self, X, y):
         mapping[clusters] = np.arange(self.n_clusters_, dtype=np.uint32)
         self.labels_ = mapping[labels]
         return self
-
-    @abstractmethod
-    def _split(self, X):
-        """Split the data into two clusters.
-
-        Parameters
-        ----------
-        X : ndarray of shape (n_samples, n_features)
-
-        Returns
-        -------
-        labels : ndarray of shape (n_samples,)
-            Cluster labels for each point. Every label is either 0 or 1 indicating
-            that the point belongs to the first or the second cluster, respectively.
-        """
-        pass
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-"""unsupervised-bias-detection."""`
	`1`	`+"""unsupervised-bias-detection."""`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		-"""The :mod:`unsupervised_bias_detection.clustering` module implements bias-aware clustering algorithms."""
	`1`	+"""The :mod:`unsupervised_bias_detection.cluster` module implements bias-aware clustering algorithms."""
`2`	`2`
`3`	`3`	`from ._kmeans import BiasAwareHierarchicalKMeans`
`4`	`4`	`from ._kmodes import BiasAwareHierarchicalKModes`