Skip to content

Commit f57d6d7

Browse files
authored
Merge pull request #19 from krstopro/master
Add `predict` to BAHC and `get_column_dtypes` to utils
2 parents 9310cb6 + a2f7158 commit f57d6d7

File tree

9 files changed

+211
-28
lines changed

9 files changed

+211
-28
lines changed

tests/test_bahc.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44

55
def test_shapes():
6-
# Checks that labels and biases have the right shapes
6+
# Checks that labels and scores have the right shapes
77
rng = np.random.RandomState(12)
88
X = rng.rand(20, 10)
99
y = rng.rand(20)
@@ -23,11 +23,40 @@ def test_labels():
2323
assert np.array_equal(np.unique(bahc.labels_), np.arange(bahc.n_clusters_))
2424

2525

26-
def test_biases():
27-
# Checks that biases are sorted in descending order
26+
# def test_cluster_sizes():
27+
# Checks that cluster sizes are at least bahc_min_cluster_size
28+
29+
30+
def test_scores():
    # Checks that scores are computed correctly
    rng = np.random.RandomState(12)
    X = rng.rand(20, 10)
    y = rng.rand(20)
    bahc = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
    bahc.fit(X, y)
    # TODO: Check this!!!
    # The score of a cluster is the mean of y outside the cluster minus the
    # mean of y inside the cluster.
    for i in range(bahc.n_clusters_):
        in_cluster = bahc.labels_ == i
        expected = np.mean(y[~in_cluster]) - np.mean(y[in_cluster])
        # Compare with a tolerance: the expected value is recomputed here
        # independently, so exact floating-point equality is too strict.
        assert np.isclose(bahc.scores_[i], expected)
43+
44+
45+
def test_scores_are_sorted():
    # Scores exposed after fit must be in non-increasing order
    rng = np.random.RandomState(12)
    features = rng.rand(20, 10)
    targets = rng.rand(20)
    model = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
    model.fit(features, targets)
    # Pair every score with its successor and require that it is not smaller
    leading = model.scores_[:-1]
    trailing = model.scores_[1:]
    assert np.all(leading >= trailing)
53+
54+
55+
def test_predict():
    # Predicting on the training data must reproduce the labels found by fit
    rng = np.random.RandomState(12)
    features = rng.rand(20, 10)
    targets = rng.rand(20)
    model = BiasAwareHierarchicalKMeans(bahc_max_iter=5, bahc_min_cluster_size=2)
    model.fit(features, targets)
    predicted = model.predict(features)
    assert np.array_equal(predicted, model.labels_)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
"""unsupervised-bias-detection."""
1+
"""unsupervised-bias-detection."""
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
"""The :mod:`unsupervised_bias_detection.cluster` module implements bias-aware clustering algorithms."""
22

3+
from ._bahc import BiasAwareHierarchicalClustering
34
from ._kmeans import BiasAwareHierarchicalKMeans
45
from ._kmodes import BiasAwareHierarchicalKModes
56

67
__all__ = [
8+
"BiasAwareHierarchicalClustering",
79
"BiasAwareHierarchicalKMeans",
810
"BiasAwareHierarchicalKModes",
911
]

unsupervised_bias_detection/cluster/_bahc.py

Lines changed: 65 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from ._cluster_node import ClusterNode
2+
from collections import deque
13
import heapq
24
from numbers import Integral
35
import numpy as np
@@ -58,22 +60,25 @@ def fit(self, X, y):
5860
order="C",
5961
)
6062
n_samples, _ = X.shape
61-
# We start with all samples in a single cluster
63+
# We start with all samples being in a single cluster
6264
self.n_clusters_ = 1
6365
# We assign all samples a label of zero
6466
labels = np.zeros(n_samples, dtype=np.uint32)
65-
clusters = []
67+
leaves = []
6668
scores = []
6769
label = 0
70+
root = ClusterNode(label)
71+
self.cluster_tree_ = root
6872
# The entire dataset has a discrimination score of zero
6973
score = 0
70-
heap = [(None, label, score)]
74+
heap = [(None, root, score)]
7175
for _ in range(self.bahc_max_iter):
7276
if not heap:
7377
# If the heap is empty we stop iterating
7478
break
7579
# Take the cluster with the highest standard deviation of metric y
76-
_, label, score = heapq.heappop(heap)
80+
_, node, score = heapq.heappop(heap)
81+
label = node.label
7782
cluster_indices = np.nonzero(labels == label)[0]
7883
cluster = X[cluster_indices]
7984

@@ -90,39 +95,82 @@ def fit(self, X, y):
9095
and len(indices1) >= self.bahc_min_cluster_size
9196
):
9297
# We calculate the discrimination scores using formula (1) in [1]
98+
# TODO: Move y[indices0] and y[indices1] into separate variables
99+
# to avoid recomputing them
100+
# Maybe create a function to compute the score
93101
mask0 = np.ones(n_samples, dtype=bool)
94102
mask0[indices0] = False
95103
score0 = np.mean(y[mask0]) - np.mean(y[indices0])
96104
mask1 = np.ones(n_samples, dtype=bool)
97105
mask1[indices1] = False
98106
score1 = np.mean(y[mask1]) - np.mean(y[indices1])
99107
if max(score0, score1) >= score:
108+
std0 = np.std(y[indices0])
109+
node0 = ClusterNode(label)
100110
# heapq implements min-heap
101111
# so we have to negate std before pushing
102-
std0 = np.std(y[indices0])
103-
heapq.heappush(heap, (-std0, label, score0))
112+
heapq.heappush(heap, (-std0, node0, score0))
104113
std1 = np.std(y[indices1])
105-
heapq.heappush(heap, (-std1, self.n_clusters_, score1))
114+
node1 = ClusterNode(self.n_clusters_)
115+
heapq.heappush(heap, (-std1, node1, score1))
106116
labels[indices1] = self.n_clusters_
117+
# TODO: Increase n_clusters_ by clustering_model.n_clusters_ - 1
107118
self.n_clusters_ += 1
119+
children = [node0, node1]
120+
node.split(clustering_model, children)
108121
else:
109-
clusters.append(label)
122+
leaves.append(node)
110123
scores.append(score)
111124
else:
112-
clusters.append(label)
125+
leaves.append(node)
113126
scores.append(score)
114127
if heap:
115-
clusters = np.concatenate([clusters, [label for _, label, _ in heap]])
128+
# TODO: Check if this can be made more efficient
129+
leaves.extend((node for _, node, _ in heap))
116130
scores = np.concatenate([scores, [score for _, _, score in heap]])
117131
else:
118-
clusters = np.array(clusters)
119132
scores = np.array(scores)
120133

121134
# We sort clusters by decreasing scores
122-
indices = np.argsort(-scores)
123-
clusters = clusters[indices]
124-
self.scores_ = scores[indices]
125-
mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
126-
mapping[clusters] = np.arange(self.n_clusters_, dtype=np.uint32)
127-
self.labels_ = mapping[labels]
135+
sorted_indices = np.argsort(-scores)
136+
self.scores_ = scores[sorted_indices]
137+
leaf_labels = np.array([leaf.label for leaf in leaves])
138+
leaf_labels = leaf_labels[sorted_indices]
139+
label_mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
140+
label_mapping[leaf_labels] = np.arange(self.n_clusters_, dtype=np.uint32)
141+
self.labels_ = label_mapping[labels]
142+
for leaf in leaves:
143+
leaf.label = label_mapping[leaf.label]
128144
return self
145+
146+
def predict(self, X):
    """Predict the cluster labels for the given data.

    Every sample is routed down ``self.cluster_tree_``: at each internal
    node the node's clustering model assigns the sample to one of the
    node's children, until a leaf is reached. The leaf's label is the
    predicted label.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to assign to clusters. Must expose ``.shape`` and support
        integer-array indexing (no validation is performed yet).

    Returns
    -------
    labels : ndarray of shape (n_samples,), dtype uint32
        Label of the leaf each sample ends up in.
    """
    # TODO: Assert that fit has been called
    # TODO: Assert that X has the same number of features as the data used to fit
    # TODO: Assert that clustering_model has predict method
    # TODO: Validate X
    n_samples, _ = X.shape
    labels = np.zeros(n_samples, dtype=np.uint32)
    # Breadth-first traversal; each entry pairs a node with the indices of
    # the samples that were routed to it.
    queue = deque([(self.cluster_tree_, np.arange(n_samples))])
    while queue:
        node, indices = queue.popleft()
        if node.is_leaf:
            labels[indices] = node.label
            continue
        cluster_labels = np.asarray(node.clustering_model.predict(X[indices]))
        # Iterate over the node's children rather than over the labels that
        # happen to occur in this prediction: when all samples fall into a
        # single child, counting the distinct predicted labels would
        # under-count the children and leave samples unassigned.
        for i, child in enumerate(node.children):
            child_indices = indices[cluster_labels == i]
            if child_indices.size > 0:
                queue.append((child, child_indices))
    return labels
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from sklearn.base import ClusterMixin
2+
from typing import Self
3+
4+
class ClusterNode:
    """Node of a cluster tree.

    A node starts as a leaf carrying a cluster label. Calling :meth:`split`
    turns it into an internal node that holds the clustering model used for
    the split and the resulting child nodes.
    """

    def __init__(self, label: int):
        """
        Initialize a node in the cluster tree.

        Parameters
        ----------
        label : int
            The cluster label for this node (required as all nodes start as leaves)
        """
        self.label = label
        self.clustering_model = None
        self.children = []

    @property
    def is_leaf(self) -> bool:
        """Whether this node has no children."""
        return len(self.children) == 0

    def split(self, clustering_model: "ClusterMixin", children: "list[Self]"):
        """
        Split this node by setting its clustering model and adding children.

        This converts the node to an internal node and removes its label.

        Parameters
        ----------
        clustering_model : ClusterMixin
            The clustering model used to split this node
        children : list of ClusterNode
            The child nodes resulting from the split
        """
        self.label = None
        self.clustering_model = clustering_model
        self.children = children

    def get_leaves(self) -> "list[Self]":
        """
        Get all leaf nodes in the subtree rooted at this node.

        Returns
        -------
        list of ClusterNode
            All leaf nodes in the subtree
        """
        if not self.children:
            return [self]

        leaves = []
        for child in self.children:
            leaves.extend(child.get_leaves())
        return leaves

    def _sort_key(self):
        # Internal nodes have no label; order them after every labelled node.
        if self.label is None:
            return (True, 0)
        return (False, self.label)

    def __lt__(self, other):
        """Order nodes by label.

        Nodes are stored inside ``(-std, node, score)`` tuples on a ``heapq``.
        When two entries have the same first element, tuple comparison falls
        through to the nodes, so they need a defined ordering; without it the
        heap operation raises ``TypeError``.
        """
        if not isinstance(other, ClusterNode):
            return NotImplemented
        return self._sort_key() < other._sort_key()

unsupervised_bias_detection/cluster/_kmeans.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ class BiasAwareHierarchicalKMeans(BaseEstimator, ClusterMixin):
88
99
Parameters
1010
----------
11-
hbac_max_iter : int
11+
bahc_max_iter : int
1212
Maximum number of iterations.
13-
hbac_min_cluster_size : int
13+
bahc_min_cluster_size : int
1414
Minimum size of a cluster.
1515
kmeans_params : dict
1616
k-means parameters
@@ -48,6 +48,7 @@ def __init__(
4848
bahc_min_cluster_size,
4949
**kmeans_params,
5050
):
51+
# TODO: Remove this once we have a better way to handle the number of clusters
5152
if "n_clusters" in kmeans_params and kmeans_params["n_clusters"] != 2:
5253
raise ValueError(
5354
f"The parameter `n_clusters` should be 2, got {kmeans_params['n_clusters']}."
@@ -60,16 +61,20 @@ def __init__(
6061

6162
self.bahc_max_iter = bahc_max_iter
6263
self.bahc_min_cluster_size = bahc_min_cluster_size
63-
self._hbac = BiasAwareHierarchicalClustering(
64+
self._bahc = BiasAwareHierarchicalClustering(
6465
KMeans,
6566
bahc_max_iter,
6667
bahc_min_cluster_size,
6768
**kmeans_params,
6869
)
6970

7071
def fit(self, X, y):
71-
self._hbac.fit(X, y)
72-
self.n_clusters_ = self._hbac.n_clusters_
73-
self.labels_ = self._hbac.labels_
74-
self.scores_ = self._hbac.scores_
72+
self._bahc.fit(X, y)
73+
self.n_clusters_ = self._bahc.n_clusters_
74+
self.labels_ = self._bahc.labels_
75+
self.scores_ = self._bahc.scores_
76+
self.cluster_tree_ = self._bahc.cluster_tree_
7577
return self
78+
79+
def predict(self, X):
80+
# Delegates to the wrapped BiasAwareHierarchicalClustering instance stored in
# self._bahc and returns whatever its predict method returns for X.
return self._bahc.predict(X)

unsupervised_bias_detection/cluster/_kmodes.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class BiasAwareHierarchicalKModes(BaseEstimator, ClusterMixin):
4343
"""
4444

4545
def __init__(self, bahc_max_iter, bahc_min_cluster_size, **kmodes_params):
46+
# TODO: Remove this once we have a better way to handle the number of clusters
4647
if "n_clusters" in kmodes_params and kmodes_params["n_clusters"] != 2:
4748
raise ValueError(
4849
f"The parameter `n_clusters` should be 2, got {kmodes_params['n_clusters']}."
@@ -61,4 +62,8 @@ def fit(self, X, y):
6162
self.n_clusters_ = self._hbac.n_clusters_
6263
self.labels_ = self._hbac.labels_
6364
self.scores_ = self._hbac.scores_
65+
self.cluster_tree_ = self._hbac.cluster_tree_
6466
return self
67+
68+
def predict(self, X):
69+
# Delegates to the wrapped estimator stored in self._hbac and returns
# whatever its predict method returns for X.
return self._hbac.predict(X)
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""The :mod:`unsupervised_bias_detection.utils` module implements utility functions."""
2+
3+
from ._get_column_dtypes import get_column_dtypes
4+
5+
__all__ = [
6+
"get_column_dtypes",
7+
]
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
5+
def get_column_dtypes(data) -> dict:
    """
    Return a dictionary mapping column names to abstract data types that are compatible with the processor.

    The mapping is as follows:
    - integer and floating-point dtypes of any width (e.g. int8, uint16, int64, float32, float64) -> "numerical"
    - bool -> "boolean"
    - datetime64[...] -> "datetime"
    - timedelta64[...] -> "timedelta"
    - All others (e.g., object) -> "categorical"

    Parameters
    ----------
    data : pandas.DataFrame or structured numpy.ndarray
        The data whose columns (or named fields) are classified.

    Returns
    -------
    dict
        Maps each column / field name to one of "numerical", "boolean",
        "datetime", "timedelta" or "categorical".

    Raises
    ------
    TypeError
        If data is neither a DataFrame nor a structured numpy array.
    """
    # Dispatch on the dtype's one-character kind code instead of its name, so
    # that every integer/float width (and pandas nullable dtypes, which expose
    # the same kind codes) is recognised rather than only the 32/64-bit ones.
    kind_to_type = {
        "b": "boolean",
        "i": "numerical",
        "u": "numerical",
        "f": "numerical",
        "M": "datetime",
        "m": "timedelta",
    }

    def map_dtype(dtype) -> str:
        # Dtypes without a kind, or with any other kind, are categorical.
        return kind_to_type.get(getattr(dtype, "kind", "O"), "categorical")

    if isinstance(data, pd.DataFrame):
        return {col: map_dtype(dtype) for col, dtype in data.dtypes.items()}
    if isinstance(data, np.ndarray) and data.dtype.names is not None:
        return {name: map_dtype(data.dtype.fields[name][0]) for name in data.dtype.names}
    raise TypeError("Data must be a pandas DataFrame or a structured numpy array.")

0 commit comments

Comments
 (0)