Merge pull request NGO-Algorithm-Audit#5 from krstopro/master

krstopro · web-flow · commit 0bc19df637bd · 2024-04-16T11:41:59.000+02:00
Docs, more tests, release workflow, new name
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,22 +1,16 @@
 name: Continuous Integration Workflow
 
 on:
-    workflow_dispatch:
-    push:
-      branches:
-          - master
-          - dev
-    pull_request:
-      branches:
-          - master
-          - dev
+  workflow_dispatch:
+  push:
+  pull_request:
 
 jobs:
   main:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.11']
+        python-version: ["3.11"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,23 @@
+name: Release Workflow
+
+on:
+  release:
+      types: [created]
+
+jobs:
+  main:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Continuous integration
+        uses: ./.github/workflows/ci.yml
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      
+      - name: Install poetry
+        run: |
+          pipx install poetry
+          # poetry config virtualenvs.path .virtualenvs
+      
+      - name: Publish to PyPI
+        run: poetry publish --build --username "__token__" --password ${{ secrets.PIPY_TOKEN }}
diff --git a/bias_detection_tool/__init__.py b/bias_detection_tool/__init__.py
@@ -0,0 +1 @@
+"""bias-detection-tool."""
diff --git a/bias_detection_tool/clustering/__init__.py b/bias_detection_tool/clustering/__init__.py
@@ -1,3 +1,5 @@
+"""The :mod:`bias_detection_tool.clustering` module implements bias-aware clustering algorithms."""
+
 from ._kmeans import BiasAwareHierarchicalKMeans
 from ._kmodes import BiasAwareHierarchicalKModes
 
diff --git a/bias_detection_tool/clustering/_bahc.py b/bias_detection_tool/clustering/_bahc.py
@@ -8,8 +8,7 @@ class BiasAwareHierarchicalClustering(ABC, BaseEstimator, ClusterMixin):
     """
     Base class for Bias-Aware Hierarchical Clustering.
 
-    This abstract class specifies an interface for all bias-aware hierarchical
-    clustering classes.
+    This abstract class specifies an interface for all bias-aware hierarchical clustering classes.
     """
 
     def __init__(self, max_iter, min_cluster_size):
@@ -32,6 +31,9 @@ def fit(self, X, y):
         self : object
             Fitted estimator.
         """
+        X, y = self._validate_data(
+            X, y, reset=False, accept_large_sparse=False, dtype=[np.float32, np.float64], order="C"
+        )
         n_samples, _ = X.shape
         self.n_clusters_ = 1
         labels = np.zeros(n_samples, dtype=np.uint32)
@@ -40,15 +42,13 @@ def fit(self, X, y):
         label = 0
         bias = -np.mean(y)
         heap = [(None, label, bias)]
-        print(labels)
         for _ in range(self.max_iter):
             if not heap:
                 break
             _, label, bias = heapq.heappop(heap)
             cluster_indices = np.nonzero(labels == label)[0]
             cluster = X[cluster_indices]
             cluster_labels = self._split(cluster)
-            # TODO: Maybe check if cluster_labels are 0s and 1s
             indices0 = cluster_indices[np.nonzero(cluster_labels == 0)[0]]
             indices1 = cluster_indices[np.nonzero(cluster_labels == 1)[0]]
             if (
@@ -74,13 +74,8 @@ def fit(self, X, y):
             else:
                 clusters.append(label)
                 biases.append(bias)
-            print(labels)
-            print(heap)
-            print(clusters)
         clusters = np.array(clusters + [label for _, label, _ in heap])
         biases = np.array(biases + [bias for _, _, bias in heap])
-        print(clusters)
-        print(biases)
         indices = np.argsort(-biases)
         clusters = clusters[indices]
         self.biases_ = biases[indices]
@@ -91,15 +86,16 @@ def fit(self, X, y):
 
     @abstractmethod
     def _split(self, X):
-        """Splits the data into two clusters.
+        """Split the data into two clusters.
 
         Parameters
         ----------
-        X : array-like of shape  (n_samples, n_features)
+        X : ndarray of shape (n_samples, n_features)
 
         Returns
         -------
-        labels : (n_samples)
-            ndarray of shape (n_samples,)
+        labels : ndarray of shape (n_samples,)
+            Cluster labels for each point. Every label is either 0 or 1 indicating
+            that the point belongs to the first or the second cluster, respectively.
         """
         pass
diff --git a/bias_detection_tool/clustering/_kmeans.py b/bias_detection_tool/clustering/_kmeans.py
@@ -0,0 +1,65 @@
+from ._bahc import BiasAwareHierarchicalClustering
+from sklearn.cluster import KMeans
+
+
+class BiasAwareHierarchicalKMeans(BiasAwareHierarchicalClustering):
+    """Bias-Aware Hierarchical k-Means Clustering.
+
+    Parameters
+    ----------
+    max_iter : int
+        Maximum number of cluster-splitting iterations.
+    min_cluster_size : int
+        Minimum size of a cluster.
+    kmeans_params : dict
+        Keyword arguments forwarded to ``sklearn.cluster.KMeans``; ``n_clusters``, if given, must be 2.
+    
+    Attributes
+    ----------
+    n_clusters_ : int
+        The number of clusters found by the algorithm.
+    labels_ : ndarray of shape (n_samples,)
+        Cluster labels for each point.
+    biases_ : ndarray of shape (n_clusters_,)
+        Bias values for each cluster.
+    
+    References
+    ----------
+    .. [1] J. Misztal-Radecka, B. Indurkhya, "Bias-Aware Hierarchical Clustering for detecting the discriminated
+           groups of users in recommendation systems", Information Processing & Management, vol. 58, no. 3, May 2021.
+    
+    Examples
+    --------
+    >>> from bias_detection_tool.clustering import BiasAwareHierarchicalKMeans
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
+    >>> y = np.array([0, 0, 0, 10, 10, 10])
+    >>> bias_aware_kmeans = BiasAwareHierarchicalKMeans(max_iter=1, min_cluster_size=1, random_state=12).fit(X, y)
+    >>> bias_aware_kmeans.labels_
+    array([0, 0, 0, 1, 1, 1], dtype=uint32)
+    >>> bias_aware_kmeans.biases_
+    array([ 10., -10.])
+    """
+
+    def __init__(
+        self,
+        max_iter,
+        min_cluster_size,
+        **kmeans_params,
+    ):
+        super().__init__(max_iter, min_cluster_size)
+
+        if "n_clusters" in kmeans_params and kmeans_params["n_clusters"] != 2:
+            raise ValueError(
+                f"The parameter `n_clusters` should be 2, got {kmeans_params['n_clusters']}."
+            )
+        else:
+            kmeans_params["n_clusters"] = 2
+        
+        if "n_init" not in kmeans_params:
+            kmeans_params["n_init"] = "auto"
+        
+        self.kmeans = KMeans(**kmeans_params)
+
+    def _split(self, X):
+        return self.kmeans.fit_predict(X)
diff --git a/bias_detection_tool/clustering/_kmodes.py b/bias_detection_tool/clustering/_kmodes.py
@@ -0,0 +1,57 @@
+from ._bahc import BiasAwareHierarchicalClustering
+from kmodes.kmodes import KModes
+
+
+class BiasAwareHierarchicalKModes(BiasAwareHierarchicalClustering):
+    """Bias-Aware Hierarchical k-Modes Clustering.
+
+    Parameters
+    ----------
+    max_iter : int
+        Maximum number of cluster-splitting iterations.
+    min_cluster_size : int
+        Minimum size of a cluster.
+    kmodes_params : dict
+        Keyword arguments forwarded to ``kmodes.kmodes.KModes``; ``n_clusters``, if given, must be 2.
+    
+    Attributes
+    ----------
+    n_clusters_ : int
+        The number of clusters found by the algorithm.
+    labels_ : ndarray of shape (n_samples,)
+        Cluster labels for each point.
+    biases_ : ndarray of shape (n_clusters_,)
+        Bias values for each cluster.
+    
+    References
+    ----------
+    .. [1] J. Misztal-Radecka, B. Indurkhya, "Bias-Aware Hierarchical Clustering for detecting the discriminated
+           groups of users in recommendation systems", Information Processing & Management, vol. 58, no. 3, May 2021.
+    
+    Examples
+    --------
+    >>> from bias_detection_tool.clustering import BiasAwareHierarchicalKModes
+    >>> import numpy as np
+    >>> X = np.array([[0, 1], [0, 2], [0, 0], [1, 4], [1, 5], [1, 3]])
+    >>> y = np.array([0, 0, 0, 10, 10, 10])
+    >>> bias_aware_kmodes = BiasAwareHierarchicalKModes(max_iter=1, min_cluster_size=1, random_state=12).fit(X, y)
+    >>> bias_aware_kmodes.labels_
+    array([0, 0, 0, 1, 1, 1], dtype=uint32)
+    >>> bias_aware_kmodes.biases_
+    array([ 10., -10.])
+    """
+
+    def __init__(self, max_iter, min_cluster_size, **kmodes_params):
+        super().__init__(max_iter, min_cluster_size)
+
+        if "n_clusters" in kmodes_params and kmodes_params["n_clusters"] != 2:
+            raise ValueError(
+                f"The parameter `n_clusters` should be 2, got {kmodes_params['n_clusters']}."
+            )
+        else:
+            kmodes_params["n_clusters"] = 2
+
+        self.kmodes = KModes(**kmodes_params)
+
+    def _split(self, X):
+        return self.kmodes.fit_predict(X)
diff --git a/bias_scan/clustering/_kmeans.py b/bias_scan/clustering/_kmeans.py
diff --git a/bias_scan/clustering/_kmodes.py b/bias_scan/clustering/_kmodes.py
diff --git a/bias_scan/clustering/utils.py b/bias_scan/clustering/utils.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "bias-scan"
+name = "bias-detection-tool"
 version = "0.1.0"
 description = ""
 authors = []
@@ -19,6 +19,12 @@ kmodes = "^0.12.2"
 ruff = "^0.2.2"
 pytest = "^8.0.2"
 
+[tool.ruff.lint]
+select = ["D"]
+
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
diff --git a/tests/test_bahc.py b/tests/test_bahc.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from bias_scan.clustering import BiasAwareHierarchicalKMeans
+from bias_detection_tool.clustering import BiasAwareHierarchicalKMeans
 
 
 def test_shapes():
@@ -10,14 +10,23 @@ def test_shapes():
     y = rng.rand(20)
     algo = BiasAwareHierarchicalKMeans(max_iter=5, min_cluster_size=2)
     algo.fit(X, y)
-    assert len(algo.labels_) == 20
+    assert len(algo.labels_) == len(X)
     assert len(algo.biases_) == algo.n_clusters_
 
-def test_clusters():
+def test_labels():
     # Checks that label values are between 0 and n_clusters
     rng = np.random.RandomState(12)
     X = rng.rand(20, 10)
     y = rng.rand(20)
     algo = BiasAwareHierarchicalKMeans(max_iter=5, min_cluster_size=2)
     algo.fit(X, y)
     assert np.array_equal(np.unique(algo.labels_), np.arange(algo.n_clusters_))
+
+def test_biases():
+    # Checks that biases are sorted in descending order
+    rng = np.random.RandomState(12)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
+    algo = BiasAwareHierarchicalKMeans(max_iter=5, min_cluster_size=2)
+    algo.fit(X, y)
+    assert np.all(algo.biases_[:-1] >= algo.biases_[1:])

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	+"""The :mod:`bias_detection_tool.clustering` module implements bias-aware clustering algorithms."""
	`2`	`+`
`1`	`3`	`from ._kmeans import BiasAwareHierarchicalKMeans`
`2`	`4`	`from ._kmodes import BiasAwareHierarchicalKModes`
`3`	`5`