
Commit bb0cf65

add C2ST metric
1 parent 8866be6 commit bb0cf65

File tree: 2 files changed (+115, −1 lines)

bayesflow/computational_utilities.py

Lines changed: 72 additions & 0 deletions
@@ -22,6 +22,8 @@
 import tensorflow as tf
 from scipy import stats
 from sklearn.calibration import calibration_curve
+from sklearn.neural_network import MLPClassifier
+from sklearn.model_selection import cross_val_score, KFold
 
 from bayesflow.default_settings import MMD_BANDWIDTH_LIST
 from bayesflow.exceptions import ShapeError
@@ -517,3 +519,73 @@ def aggregated_rmse(x_true, x_pred):
     return aggregated_error(
         x_true=x_true, x_pred=x_pred, inner_error_fun=root_mean_squared_error, outer_aggregation_fun=np.mean
     )
+
+
+def c2st(source_samples, target_samples, n_folds=5, scoring="accuracy", normalize=True, seed=123,
+         hidden_units_per_dim=10):
+    """C2ST metric [1] using an sklearn MLP classifier.
+    Code adapted from https://github.com/sbi-benchmark/sbibm/blob/main/sbibm/metrics/c2st.py
+
+    [1] Lopez-Paz, D., & Oquab, M. (2016). Revisiting classifier two-sample tests. arXiv:1610.06545.
+
+    Parameters
+    ----------
+    source_samples : np.ndarray or tf.Tensor
+        Source samples (e.g., approximate posterior samples)
+    target_samples : np.ndarray or tf.Tensor
+        Target samples (e.g., samples from a reference posterior)
+    n_folds : int, optional, default: 5
+        Number of folds in the k-fold cross-validation used to evaluate the classifier
+    scoring : str, optional, default: "accuracy"
+        Evaluation score of the sklearn MLP classifier
+    normalize : bool, optional, default: True
+        Whether the data should be z-standardized relative to source_samples
+    seed : int, optional, default: 123
+        RNG seed for the MLP and the k-fold CV
+    hidden_units_per_dim : int, optional, default: 10
+        Number of hidden units in the MLP per input dimension.
+        Example: 5-dimensional source samples and hidden_units_per_dim=10 -> 50 hidden units per layer
+
+    Returns
+    -------
+    c2st_score : float
+        The resulting C2ST score
+    """
+
+    x = np.array(source_samples)
+    y = np.array(target_samples)
+
+    num_dims = x.shape[1]
+    if num_dims != y.shape[1]:
+        raise ShapeError(f"source_samples and target_samples can have a different number of observations (1st dim) "
+                         f"but must have the same dimensionality (2nd dim). "
+                         f"Found: source_samples {source_samples.shape[1]}, target_samples {target_samples.shape[1]}")
+
+    if normalize:
+        x_mean = np.mean(x, axis=0)
+        x_std = np.std(x, axis=0)
+        x = (x - x_mean) / x_std
+        y = (y - x_mean) / x_std
+
+    clf = MLPClassifier(
+        activation="relu",
+        hidden_layer_sizes=(hidden_units_per_dim * num_dims, hidden_units_per_dim * num_dims),
+        max_iter=10000,
+        solver="adam",
+        random_state=seed,
+    )
+
+    data = np.concatenate((x, y))
+    target = np.concatenate(
+        (
+            np.zeros((x.shape[0],)),
+            np.ones((y.shape[0],)),
+        )
+    )
+
+    shuffle = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
+    scores = cross_val_score(clf, data, target, cv=shuffle, scoring=scoring)
+
+    c2st_score = np.asarray(np.mean(scores)).astype(np.float32)
+    return c2st_score
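
For context beyond the diff: the classifier two-sample test trains a classifier to distinguish the two sample sets, so a score near 0.5 (chance level) indicates indistinguishable distributions, while a score near 1.0 indicates clearly separable ones. A minimal usage sketch (not part of the commit), assuming the new function is imported from bayesflow.computational_utilities as the tests below do:

import numpy as np
from bayesflow.computational_utilities import c2st

rng = np.random.default_rng(2023)

# Two sample sets from the same 3D standard normal -> the classifier
# cannot separate them, so the score should hover around 0.5
samples_a = rng.normal(size=(500, 3))
samples_b = rng.normal(size=(500, 3))
print(c2st(samples_a, samples_b))

# Shifted target distribution -> easily separable, score close to 1.0
samples_shifted = rng.normal(loc=3.0, size=(500, 3))
print(c2st(samples_a, samples_shifted))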

tests/test_computational_utilities.py

Lines changed: 43 additions & 1 deletion
@@ -3,8 +3,9 @@
 import pytest
 import numpy as np
 from bayesflow import computational_utilities
-from bayesflow.exceptions import ArgumentError
+from bayesflow.exceptions import ArgumentError, ShapeError
 from bayesflow.trainers import Trainer
+import tensorflow as tf
 
 
 @pytest.mark.parametrize("x_true, x_pred, output",
@@ -93,3 +94,44 @@ def test_aggregated_error(x_true, x_pred, inner_error_fun, outer_aggregation_fun
         outer_aggregation_fun=outer_aggregation_fun
     )
     assert aggregated_error_result == pytest.approx(output)
+
+
+def test_c2st_shape_error():
+    source_samples = np.random.random(size=(5, 2))
+    target_samples = np.random.random(size=(5, 3))
+    with pytest.raises(ShapeError):
+        computational_utilities.c2st(source_samples, target_samples)
+
+
+@pytest.mark.parametrize(
+    "source_samples, target_samples",
+    [
+        (np.random.random((5, 2)), np.random.random((5, 2))),
+        (np.random.random((10, 2)), np.random.random((5, 2))),
+        (tf.constant(np.random.random((5, 2))), tf.constant(np.random.random((5, 2))))
+    ]
+)
+def test_c2st(source_samples, target_samples):
+    c2st_score = computational_utilities.c2st(source_samples, target_samples)
+    assert 0.0 <= c2st_score <= 1.0
+
+
+@pytest.mark.parametrize(
+    "n_folds, scoring, normalize, seed, hidden_units_per_dim",
+    [
+        (3, "accuracy", False, 42, 5),
+        (7, "f1", True, 12, 10)
+    ]
+)
+def test_c2st_params(n_folds, scoring, normalize, seed, hidden_units_per_dim):
+    source_samples = np.random.random((5, 2))
+    target_samples = np.random.random((10, 2))
+    _ = computational_utilities.c2st(
+        source_samples=source_samples,
+        target_samples=target_samples,
+        n_folds=n_folds,
+        scoring=scoring,
+        normalize=normalize,
+        seed=seed,
+        hidden_units_per_dim=hidden_units_per_dim
+    )
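
To try only the new tests locally, one option is to invoke pytest programmatically with a keyword filter; a small sketch, assuming pytest is installed and the command is run from the repository root:

import pytest

# Select only the C2ST tests added in this commit by keyword
pytest.main(["tests/test_computational_utilities.py", "-k", "c2st"])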
