Skip to content

Commit a947851

Browse files
authored
Merge pull request #16 from krstopro/master
Restructure BAHC, rename `clustering` to `cluster`, add parameter constraints
2 parents 60c59f0 + 1e6617b commit a947851

File tree

10 files changed: 188 additions and 114 deletions.

tests/test_bahc.py

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,32 +1,33 @@
11
import numpy as np
2-
3-
from unsupervised_bias_detection.clustering import BiasAwareHierarchicalKMeans
2+
from unsupervised_bias_detection.cluster import BiasAwareHierarchicalKMeans
43

54

65
def test_shapes():
76
# Checks that labels and biases have the right shapes
87
rng = np.random.RandomState(12)
98
X = rng.rand(20, 10)
109
y = rng.rand(20)
11-
hbac = BiasAwareHierarchicalKMeans(n_iter=5, min_cluster_size=2)
12-
hbac.fit(X, y)
13-
assert len(hbac.labels_) == len(X)
14-
assert len(hbac.scores_) == hbac.n_clusters_
10+
bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
11+
bahc.fit(X, y)
12+
assert len(bahc.labels_) == len(X)
13+
assert len(bahc.scores_) == bahc.n_clusters_
14+
1515

1616
def test_labels():
1717
# Checks that label values are between 0 and n_clusters
1818
rng = np.random.RandomState(12)
1919
X = rng.rand(20, 10)
2020
y = rng.rand(20)
21-
hbac = BiasAwareHierarchicalKMeans(n_iter=5, min_cluster_size=2)
22-
hbac.fit(X, y)
23-
assert np.array_equal(np.unique(hbac.labels_), np.arange(hbac.n_clusters_))
21+
bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
22+
bahc.fit(X, y)
23+
assert np.array_equal(np.unique(bahc.labels_), np.arange(bahc.n_clusters_))
24+
2425

2526
def test_biases():
2627
# Checks that biases are sorted in descending order
2728
rng = np.random.RandomState(12)
2829
X = rng.rand(20, 10)
2930
y = rng.rand(20)
30-
hbac = BiasAwareHierarchicalKMeans(n_iter=5, min_cluster_size=2)
31-
hbac.fit(X, y)
32-
assert np.all(hbac.scores_[:-1] >= hbac.scores_[1:])
31+
bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
32+
bahc.fit(X, y)
33+
assert np.all(bahc.scores_[:-1] >= bahc.scores_[1:])

tests/test_dataset.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,8 @@ def test_loading_dataset_passes():
99
data, true_labels = load_default_dataset()
1010
assert data is not None and true_labels is not None
1111

12+
1213
@pytest.mark.xfail
1314
def test_unneeded_argument():
1415
"""Checks that no argument is necessary for the function call."""
15-
assert load_default_dataset(False) is TypeError
16+
assert load_default_dataset(False) is TypeError

tests/test_validation.py

Lines changed: 49 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -8,75 +8,113 @@
88

99
def test_always_passes():
1010
"""Test0: all numerical and good (no errors expected)."""
11-
dict0 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [0, 1, 1], 'true_labels': [0, 0, 1]}
11+
dict0 = {
12+
"x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
13+
"preds": [0, 1, 1],
14+
"true_labels": [0, 0, 1],
15+
}
1216
df_test0 = pd.DataFrame(data=dict0)
1317
assert not run_checks(df_test0) is ValueError
1418

19+
1520
@pytest.mark.xfail
1621
def test_not_binary_y():
1722
"""Test1: all numerical BUT predictions and labels are not binary."""
18-
dict1 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [6, 7, 8], 'true_labels': [11, 0, 2]}
23+
dict1 = {
24+
"x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
25+
"preds": [6, 7, 8],
26+
"true_labels": [11, 0, 2],
27+
}
1928
df_test1 = pd.DataFrame(data=dict1)
2029
assert run_checks(df_test1) is ValueError
2130

31+
2232
@pytest.mark.xfail
2333
def test_categorical_preds():
2434
"""Test2: all numerical BUT predictions are categorical."""
25-
dict2 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': ['yellow', 'yellow', 'blue'], 'true_labels': [0, 1, 1]}
35+
dict2 = {
36+
"x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
37+
"preds": ["yellow", "yellow", "blue"],
38+
"true_labels": [0, 1, 1],
39+
}
2640
df_test2 = pd.DataFrame(data=dict2)
2741
assert run_checks(df_test2) is ValueError
2842

43+
2944
@pytest.mark.xfail
3045
def test_categorical_true_labels():
3146
"""Test3: all numerical BUT true labels are categorical."""
32-
dict3 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [0, 1, 0], 'true_labels': ['red', 'red', 'yellow']}
47+
dict3 = {
48+
"x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
49+
"preds": [0, 1, 0],
50+
"true_labels": ["red", "red", "yellow"],
51+
}
3352
df_test3 = pd.DataFrame(data=dict3)
3453
assert run_checks(df_test3) is ValueError
3554

55+
3656
@pytest.mark.xfail
3757
def test_multiclass_preds():
3858
"""Test4: all numerical BUT predictions are multi-class."""
39-
dict4 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [0,1,2], 'true_labels': [0, 1, 1]}
59+
dict4 = {
60+
"x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
61+
"preds": [0, 1, 2],
62+
"true_labels": [0, 1, 1],
63+
}
4064
df_test4 = pd.DataFrame(data=dict4)
4165
assert run_checks(df_test4) is ValueError
4266

67+
4368
@pytest.mark.xfail
4469
def test_multiclass_true_labels():
4570
"""Test5: all numerical BUT true labels are multi-class."""
46-
dict5 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]], 'preds': [0, 1, 1], 'true_labels': [0,1,2]}
71+
dict5 = {
72+
"x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]],
73+
"preds": [0, 1, 1],
74+
"true_labels": [0, 1, 2],
75+
}
4776
df_test5 = pd.DataFrame(data=dict5)
4877
assert run_checks(df_test5) is ValueError
4978

79+
5080
@pytest.mark.xfail
5181
def test_features_nonnumerical():
5282
"""Test6: x includes categorical values."""
53-
dict6 = {'x': [[1, 'three', 2], ['blue', 100, 0], [0, 0, 0]], 'preds': [0, 1, 1], 'true_labels': [1, 1, 1]}
83+
dict6 = {
84+
"x": [[1, "three", 2], ["blue", 100, 0], [0, 0, 0]],
85+
"preds": [0, 1, 1],
86+
"true_labels": [1, 1, 1],
87+
}
5488
df_test6 = pd.DataFrame(data=dict6)
5589
assert run_checks(df_test6) is ValueError
5690

91+
5792
@pytest.mark.xfail
5893
def test_two_missing_columns():
5994
"""Test7: only features present, missing predictions and true labels."""
60-
dict7 = {'x': [[1, 2, 3], [3, 2, 1],[4, 5, 6]]}
95+
dict7 = {"x": [[1, 2, 3], [3, 2, 1], [4, 5, 6]]}
6196
df_test7 = pd.DataFrame(data=dict7)
6297
assert run_checks(df_test7) is IndexError
6398

99+
64100
@pytest.mark.xfail
65101
def test_missing_true_labels():
66102
"""Test8: true labels column missing."""
67-
dict8 = {'x': [[1, 'three', 2], ['blue', 100, 0], [0, 0, 0]], 'preds': [0, 1, 1]}
103+
dict8 = {"x": [[1, "three", 2], ["blue", 100, 0], [0, 0, 0]], "preds": [0, 1, 1]}
68104
df_test8 = pd.DataFrame(data=dict8)
69105
assert run_checks(df_test8) is IndexError
70106

107+
71108
@pytest.mark.xfail
72109
def test_missing_features():
73110
"""Test9: features missing."""
74-
dict9 = {'preds': [0, 1, 1], 'true_labels': [0, 1, 1]}
111+
dict9 = {"preds": [0, 1, 1], "true_labels": [0, 1, 1]}
75112
df_test9 = pd.DataFrame(data=dict9)
76113
assert run_checks(df_test9) is IndexError
77114

115+
78116
@pytest.mark.xfail
79117
def test_not_pandas_type():
80118
"""Test10: the data is not of type pandas."""
81119
array10 = np.array([[1, 2, 3, 0, 1], [4, 5, 6, 0, 0], [7, 8, 9, 1, 1]])
82-
assert run_checks(array10) is TypeError
120+
assert run_checks(array10) is TypeError
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
"""unsupervised-bias-detection."""
1+
"""unsupervised-bias-detection."""

unsupervised_bias_detection/clustering/__init__.py renamed to unsupervised_bias_detection/cluster/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
"""The :mod:`unsupervised_bias_detection.clustering` module implements bias-aware clustering algorithms."""
1+
"""The :mod:`unsupervised_bias_detection.cluster` module implements bias-aware clustering algorithms."""
22

33
from ._kmeans import BiasAwareHierarchicalKMeans
44
from ._kmodes import BiasAwareHierarchicalKModes

unsupervised_bias_detection/clustering/_bahc.py renamed to unsupervised_bias_detection/cluster/_bahc.py

Lines changed: 49 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,37 @@
1-
import numpy as np
21
import heapq
3-
from abc import ABC, abstractmethod
2+
from numbers import Integral
3+
import numpy as np
44
from sklearn.base import BaseEstimator, ClusterMixin
5+
from sklearn.utils._param_validation import Interval
6+
from sklearn.utils.validation import validate_data
7+
from typing import Any, Type
58

69

7-
class BiasAwareHierarchicalClustering(ABC, BaseEstimator, ClusterMixin):
8-
"""
9-
Base class for Bias-Aware Hierarchical Clustering.
10-
11-
This abstract class specifies an interface for all bias-aware hierarchical clustering classes.
10+
class BiasAwareHierarchicalClustering(BaseEstimator, ClusterMixin):
11+
"""TODO: Add docstring
1212
1313
References
1414
----------
1515
.. [1] J. Misztal-Radecka, B. Indurkhya, "Bias-Aware Hierarchical Clustering for detecting the discriminated
1616
groups of users in recommendation systems", Information Processing & Management, vol. 58, no. 3, May. 2021.
1717
"""
1818

19-
def __init__(self, n_iter, min_cluster_size):
20-
self.n_iter = n_iter
21-
self.min_cluster_size = min_cluster_size
19+
_parameter_constraints: dict = {
20+
"bahc_max_iter": [Interval(Integral, 1, None, closed="left")],
21+
"bahc_min_cluster_size": [Interval(Integral, 1, None, closed="left")],
22+
}
23+
24+
def __init__(
25+
self,
26+
clustering_cls: Type[ClusterMixin],
27+
bahc_max_iter: int,
28+
bahc_min_cluster_size: int,
29+
**clustering_params: Any,
30+
):
31+
self.clustering_cls = clustering_cls
32+
self.bahc_max_iter = bahc_max_iter
33+
self.bahc_min_cluster_size = bahc_min_cluster_size
34+
self.clustering_params = clustering_params
2235

2336
def fit(self, X, y):
2437
"""Compute bias-aware hierarchical clustering.
@@ -36,8 +49,13 @@ def fit(self, X, y):
3649
self : object
3750
Fitted estimator.
3851
"""
39-
X, y = self._validate_data(
40-
X, y, reset=False, accept_large_sparse=False, dtype=self._dtype, order="C"
52+
X, y = validate_data(
53+
self,
54+
X,
55+
y,
56+
reset=False,
57+
accept_large_sparse=False,
58+
order="C",
4159
)
4260
n_samples, _ = X.shape
4361
# We start with all samples in a single cluster
@@ -50,20 +68,26 @@ def fit(self, X, y):
5068
# The entire dataset has a discrimination score of zero
5169
score = 0
5270
heap = [(None, label, score)]
53-
for _ in range(self.n_iter):
71+
for _ in range(self.bahc_max_iter):
5472
if not heap:
5573
# If the heap is empty we stop iterating
5674
break
5775
# Take the cluster with the highest standard deviation of metric y
5876
_, label, score = heapq.heappop(heap)
5977
cluster_indices = np.nonzero(labels == label)[0]
6078
cluster = X[cluster_indices]
61-
cluster_labels = self._split(cluster)
79+
80+
clustering_model = self.clustering_cls(**self.clustering_params)
81+
cluster_labels = clustering_model.fit_predict(cluster)
82+
83+
# TODO: Generalize for more than 2 clusters
84+
# Can do this by checking clustering_model.n_clusters_ (if it exists)
85+
# or by checking the number of unique values in cluster_labels
6286
indices0 = cluster_indices[np.nonzero(cluster_labels == 0)[0]]
6387
indices1 = cluster_indices[np.nonzero(cluster_labels == 1)[0]]
6488
if (
65-
len(indices0) >= self.min_cluster_size
66-
and len(indices1) >= self.min_cluster_size
89+
len(indices0) >= self.bahc_min_cluster_size
90+
and len(indices1) >= self.bahc_min_cluster_size
6791
):
6892
# We calculate the discrimination scores using formula (1) in [1]
6993
mask0 = np.ones(n_samples, dtype=bool)
@@ -87,8 +111,15 @@ def fit(self, X, y):
87111
else:
88112
clusters.append(label)
89113
scores.append(score)
90-
clusters = np.array(clusters + [label for _, label, _ in heap])
91-
scores = np.array(scores + [score for _, _, score in heap])
114+
# clusters = np.array(clusters + [label for _, label, _ in heap])
115+
# scores = np.array(scores + [score for _, _, score in heap])
116+
if heap:
117+
clusters = np.concatenate([clusters, [label for _, label, _ in heap]])
118+
scores = np.concatenate([scores, [score for _, _, score in heap]])
119+
else:
120+
clusters = np.array(clusters)
121+
scores = np.array(scores)
122+
92123
# We sort clusters by decreasing scores
93124
indices = np.argsort(-scores)
94125
clusters = clusters[indices]
@@ -97,19 +128,3 @@ def fit(self, X, y):
97128
mapping[clusters] = np.arange(self.n_clusters_, dtype=np.uint32)
98129
self.labels_ = mapping[labels]
99130
return self
100-
101-
@abstractmethod
102-
def _split(self, X):
103-
"""Split the data into two clusters.
104-
105-
Parameters
106-
----------
107-
X : ndarray of shape (n_samples, n_features)
108-
109-
Returns
110-
-------
111-
labels : ndarray of shape (n_samples,)
112-
Cluster labels for each point. Every label is either 0 or 1 indicating
113-
that the point belongs to the first or the second cluster, respectively.
114-
"""
115-
pass

0 commit comments

Comments
 (0)