diff --git a/bayesflow/diagnostics/metrics/__init__.py b/bayesflow/diagnostics/metrics/__init__.py
index 1fb5f2b8d..ceeca4cc4 100644
--- a/bayesflow/diagnostics/metrics/__init__.py
+++ b/bayesflow/diagnostics/metrics/__init__.py
@@ -2,3 +2,4 @@
 from .posterior_contraction import posterior_contraction
 from .root_mean_squared_error import root_mean_squared_error
 from .expected_calibration_error import expected_calibration_error
+from .classifier_two_sample_test import classifier_two_sample_test
diff --git a/bayesflow/diagnostics/metrics/classifier_two_sample_test.py b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py
new file mode 100644
index 000000000..d1382ae5d
--- /dev/null
+++ b/bayesflow/diagnostics/metrics/classifier_two_sample_test.py
@@ -0,0 +1,136 @@
+from typing import Sequence, Mapping, Any
+
+import numpy as np
+
+import keras
+
+from bayesflow.utils.exceptions import ShapeError
+from bayesflow.networks import MLP
+
+
+def classifier_two_sample_test(
+    estimates: np.ndarray,
+    targets: np.ndarray,
+    metric: str = "accuracy",
+    patience: int = 5,
+    max_epochs: int = 1000,
+    batch_size: int = 128,
+    return_metric_only: bool = True,
+    validation_split: float = 0.5,
+    standardize: bool = True,
+    mlp_widths: Sequence = (64, 64),
+    **kwargs,
+) -> float | Mapping[str, Any]:
+    """
+    C2ST metric [1] between samples from two distributions computed using a neural classifier.
+    Can be computationally expensive if called in a loop, since it needs to train the classifier
+    for each new set of samples.
+
+    Note: works best for large numbers of samples and when averaged across different posteriors.
+
+    [1] Lopez-Paz, D., & Oquab, M. (2016). Revisiting classifier two-sample tests. arXiv:1610.06545.
+
+    Parameters
+    ----------
+    estimates : np.ndarray
+        Array of shape (num_samples_est, num_variables) containing samples representing estimated quantities
+        (e.g., approximate posterior samples).
+    targets : np.ndarray
+        Array of shape (num_samples_tar, num_variables) containing target samples
+        (e.g., samples from a reference posterior).
+    metric : str, optional
+        Metric to evaluate the classifier performance. Default is "accuracy".
+    patience : int, optional
+        Number of epochs with no improvement after which training will be stopped. Default is 5.
+    max_epochs : int, optional
+        Maximum number of epochs to train the classifier. Default is 1000.
+    batch_size : int, optional
+        Number of samples per batch during training. Default is 128.
+    return_metric_only : bool, optional
+        If True, only the final validation metric is returned. Otherwise, a dictionary with the score, classifier, and
+        full training history is returned. Default is True.
+    validation_split : float, optional
+        Fraction of the training data to be used as validation data. Default is 0.5.
+    standardize : bool, optional
+        If True, both estimates and targets will be standardized using the mean and standard deviation of estimates.
+        Default is True.
+    mlp_widths : Sequence[int], optional
+        Sequence specifying the number of units in each hidden layer of the MLP classifier. Default is (64, 64).
+    **kwargs
+        Additional keyword arguments. Recognized keyword:
+        mlp_kwargs : dict
+            Dictionary of additional parameters to pass to the MLP constructor.
+
+    Returns
+    -------
+    results : float or dict
+        If return_metric_only is True, returns the final validation metric (e.g., accuracy) as a float.
+        Otherwise, returns a dictionary with keys "score", "classifier", and "history", where "score"
+        is the final validation metric, "classifier" is the trained Keras model, and "history" contains the
+        full training history.
+    """
+
+    # Raise an error if the dimensionality of targets does not match that of estimates
+    num_dims = estimates.shape[1]
+    if num_dims != targets.shape[1]:
+        raise ShapeError(
+            "estimates and targets can have different numbers of samples (1st dim) "
+            "but must have the same dimensionality (2nd dim). "
+            f"Found: estimates dim {estimates.shape[1]}, targets dim {targets.shape[1]}."
+        )
+
+    # Standardize both estimates and targets relative to estimates mean and std
+    if standardize:
+        estimates_mean = np.mean(estimates, axis=0)
+        estimates_std = np.std(estimates, axis=0)
+        estimates = (estimates - estimates_mean) / estimates_std
+        targets = (targets - estimates_mean) / estimates_std
+
+    # Create data for classification task
+    data = np.r_[estimates, targets]
+    labels = np.r_[np.zeros((estimates.shape[0],)), np.ones((targets.shape[0],))]
+
+    # Important: shuffle manually, since keras does not shuffle before selecting the validation split
+    shuffle_idx = np.random.permutation(data.shape[0])
+    data = data[shuffle_idx]
+    labels = labels[shuffle_idx]
+
+    # Create and train classifier with early stopping
+    classifier = keras.Sequential(
+        [MLP(widths=mlp_widths, **kwargs.get("mlp_kwargs", {})), keras.layers.Dense(1, activation="sigmoid")]
+    )
+
+    classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=[metric])
+
+    early_stopping = keras.callbacks.EarlyStopping(
+        monitor=f"val_{metric}", patience=patience, restore_best_weights=True
+    )
+
+    # For now, we need to enable grads, since we turn them off by default
+    if keras.backend.backend() == "torch":
+        import torch
+
+        with torch.enable_grad():
+            history = classifier.fit(
+                x=data,
+                y=labels,
+                epochs=max_epochs,
+                batch_size=batch_size,
+                verbose=0,
+                callbacks=[early_stopping],
+                validation_split=validation_split,
+            )
+    else:
+        history = classifier.fit(
+            x=data,
+            y=labels,
+            epochs=max_epochs,
+            batch_size=batch_size,
+            verbose=0,
+            callbacks=[early_stopping],
+            validation_split=validation_split,
+        )
+
+    if return_metric_only:
+        return history.history[f"val_{metric}"][-1]
+    return {"score": history.history[f"val_{metric}"][-1], "classifier": classifier, "history": history.history}
diff --git a/bayesflow/diagnostics/metrics/expected_calibration_error.py b/bayesflow/diagnostics/metrics/expected_calibration_error.py
index bf6fb951e..81f4d9cb6 100644
--- a/bayesflow/diagnostics/metrics/expected_calibration_error.py
+++ b/bayesflow/diagnostics/metrics/expected_calibration_error.py
@@ -3,14 +3,14 @@ from typing import Sequence, Any, Mapping
 
 from ...utils.exceptions import ShapeError
 
-from sklearn.calibration import calibration_curve
+from ...utils.classification import calibration_curve
 
 
 def expected_calibration_error(
     estimates: np.ndarray,
     targets: np.ndarray,
     model_names: Sequence[str] = None,
-    n_bins: int = 10,
+    num_bins: int = 10,
     return_probs: bool = False,
 ) -> Mapping[str, Any]:
     """
@@ -31,11 +31,11 @@ def expected_calibration_error(
         The one-hot-encoded true model indices.
     model_names : Sequence[str], optional (default = None)
         Optional model names to show in the output. By default, models are called "M_" + model index.
-    n_bins : int, optional, default: 10
+    num_bins : int, optional, default: 10
         The number of bins to use for the calibration curves (and marginal histograms).
-        Passed into ``sklearn.calibration.calibration_curve()``.
+        Passed into ``bayesflow.utils.calibration_curve()``.
     return_probs : bool (default = False)
-        Do you want to obtain the output of ``sklearn.calibration.calibration_curve()``?
+        Do you want to obtain the output of ``bayesflow.utils.calibration_curve()``?
 
     Returns
     -------
@@ -48,9 +48,9 @@
         - "model_names" : str
            The (inferred) variable names.
         - "probs_true": (optional) list[np.ndarray]:
-            Outputs of ``sklearn.calibration.calibration_curve()`` per model
+            Outputs of ``bayesflow.utils.calibration_curve()`` per model
         - "probs_pred": (optional) list[np.ndarray]:
-            Outputs of ``sklearn.calibration.calibration_curve()`` per model
+            Outputs of ``bayesflow.utils.calibration_curve()`` per model
     """
 
     # Convert tensors to numpy, if passed
@@ -76,10 +76,10 @@
     for model_index in range(estimates.shape[-1]):
         y_true = (targets == model_index).astype(np.float32)
         y_prob = estimates[..., model_index]
-        prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins)
+        prob_true, prob_pred = calibration_curve(y_true, y_prob, num_bins=num_bins)
 
         # Compute ECE by weighting bin errors by bin size
-        bins = np.linspace(0.0, 1.0, n_bins + 1)
+        bins = np.linspace(0.0, 1.0, num_bins + 1)
         binids = np.searchsorted(bins[1:-1], y_prob)
         bin_total = np.bincount(binids, minlength=len(bins))
         nonzero = bin_total != 0
diff --git a/bayesflow/diagnostics/plots/calibration_histogram.py b/bayesflow/diagnostics/plots/calibration_histogram.py
index c7c2de408..6ff10c881 100644
--- a/bayesflow/diagnostics/plots/calibration_histogram.py
+++ b/bayesflow/diagnostics/plots/calibration_histogram.py
@@ -102,7 +102,7 @@ def calibration_histogram(
             "Confidence intervals might be unreliable!"
         )
 
-    # Set n_bins automatically, if nothing provided
+    # Set num_bins automatically, if nothing provided
     if num_bins is None:
         num_bins = int(ratio / 2)
         # Attempt a fix if a single bin is determined so plot still makes sense
diff --git a/bayesflow/diagnostics/plots/mc_calibration.py b/bayesflow/diagnostics/plots/mc_calibration.py
index 1f1e3678f..0e80241e5 100644
--- a/bayesflow/diagnostics/plots/mc_calibration.py
+++ b/bayesflow/diagnostics/plots/mc_calibration.py
@@ -18,7 +18,7 @@ def mc_calibration(
     pred_models: dict[str, np.ndarray] | np.ndarray,
     true_models: dict[str, np.ndarray] | np.ndarray,
     model_names: Sequence[str] = None,
-    n_bins: int = 10,
+    num_bins: int = 10,
     label_fontsize: int = 16,
     title_fontsize: int = 18,
     metric_fontsize: int = 14,
@@ -41,12 +41,12 @@ def mc_calibration(
         The one-hot-encoded true model indices per data set.
     model_names : list or None, optional, default: None
         The model names for nice plot titles. Inferred if None.
-    n_bins : int, optional, default: 10
+    num_bins : int, optional, default: 10
         The number of bins to use for the calibration curves (and marginal histograms).
     label_fontsize : int, optional, default: 16
         The font size of the y-label and y-label texts
-    legend_fontsize : int, optional, default: 14
-        The font size of the legend text (ECE value)
+    metric_fontsize : int, optional, default: 14
+        The font size of the metric text (e.g., the ECE value)
     title_fontsize : int, optional, default: 18
         The font size of the title text. Only relevant if `stacked=False`
     tick_fontsize : int, optional, default: 12
@@ -83,7 +83,7 @@
         estimates=pred_models,
         targets=true_models,
         model_names=plot_data["variable_names"],
-        n_bins=n_bins,
+        num_bins=num_bins,
         return_probs=True,
     )
 
@@ -92,7 +92,7 @@
         ax.plot(ece["probs_pred"][j], ece["probs_true"][j], "o-", color=color)
 
         # Plot PMP distribution over bins
-        uniform_bins = np.linspace(0.0, 1.0, n_bins + 1)
+        uniform_bins = np.linspace(0.0, 1.0, num_bins + 1)
         norm_weights = np.ones_like(plot_data["estimates"]) / len(plot_data["estimates"])
         ax.hist(plot_data["estimates"][:, j], bins=uniform_bins, weights=norm_weights[:, j], color="grey", alpha=0.3)
 
diff --git a/bayesflow/diagnostics/plots/mc_confusion_matrix.py b/bayesflow/diagnostics/plots/mc_confusion_matrix.py
index 2d27b5073..db5165eae 100644
--- a/bayesflow/diagnostics/plots/mc_confusion_matrix.py
+++ b/bayesflow/diagnostics/plots/mc_confusion_matrix.py
@@ -5,9 +5,8 @@
 from matplotlib.colors import LinearSegmentedColormap
 import numpy as np
 
-from sklearn.metrics import confusion_matrix
-
-from bayesflow.utils.plot_utils import make_figure
+from ...utils.plot_utils import make_figure
+from ...utils.classification import confusion_matrix
 
 
 def mc_confusion_matrix(
@@ -50,10 +49,9 @@
     ytick_rotation: int, optional, default: None
         Rotation of y-axis tick labels (helps with long model names).
     normalize : {'true', 'pred', 'all'}, default=None
-        Passed to sklearn.metrics.confusion_matrix.
-        Normalizes confusion matrix over the true (rows), predicted (columns)
-        conditions or all the population. If None, confusion matrix will not be
-        normalized.
+        Passed to ``bayesflow.utils.confusion_matrix``. Normalizes the confusion matrix over the true (rows),
+        predicted (columns) conditions, or the entire population. If None, the confusion matrix
+        will not be normalized.
     cmap : matplotlib.colors.Colormap or str, optional, default: None
         Colormap to be used for the cells. If a str, it should be the name of a
         registered colormap, e.g., 'viridis'. Default colormap matches the BayesFlow defaults by ranging from white to red.
diff --git a/bayesflow/utils/__init__.py b/bayesflow/utils/__init__.py
index 84f5f47cb..5bdaea274 100644
--- a/bayesflow/utils/__init__.py
+++ b/bayesflow/utils/__init__.py
@@ -73,6 +73,7 @@
     fill_triangular_matrix,
     weighted_sum,
 )
+from .classification import calibration_curve, confusion_matrix
 from .validators import check_lengths_same
 from .workflow_utils import find_inference_network, find_summary_network
 
diff --git a/bayesflow/utils/classification/__init__.py b/bayesflow/utils/classification/__init__.py
new file mode 100644
index 000000000..94e6dbdfd
--- /dev/null
+++ b/bayesflow/utils/classification/__init__.py
@@ -0,0 +1,2 @@
+from .confusion_matrix import confusion_matrix
+from .calibration_curve import calibration_curve
diff --git a/bayesflow/utils/classification/calibration_curve.py b/bayesflow/utils/classification/calibration_curve.py
new file mode 100644
index 000000000..38e49fc08
--- /dev/null
+++ b/bayesflow/utils/classification/calibration_curve.py
@@ -0,0 +1,82 @@
+import numpy as np
+
+
+def calibration_curve(
+    targets: np.ndarray,
+    estimates: np.ndarray,
+    *,
+    pos_label: int | float | bool | str = 1,
+    num_bins: int = 5,
+    strategy: str = "uniform",
+):
+    """Compute true and predicted probabilities for a calibration curve.
+
+    The method assumes the inputs come from a binary classifier, and discretizes the [0, 1] interval into bins.
+
+    Code from: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/calibration.py#L927
+
+    Parameters
+    ----------
+    targets : array-like of shape (n_samples,)
+        True targets.
+    estimates : array-like of shape (n_samples,)
+        Probabilities of the positive class.
+    pos_label : int, float, bool or str, default = 1
+        The label of the positive class.
+    num_bins : int, default=5
+        Number of bins to discretize the [0, 1] interval. A bigger number
+        requires more data. Bins with no samples (i.e. without
+        corresponding values in `estimates`) will not be returned, thus the
+        returned arrays may have fewer than `num_bins` values.
+    strategy : {'uniform', 'quantile'}, default='uniform'
+        Strategy used to define the widths of the bins.
+
+        uniform
+            The bins have identical widths.
+        quantile
+            The bins have the same number of samples and depend on `estimates`.
+
+    Returns
+    -------
+    prob_true : ndarray of shape (num_bins,) or smaller
+        The proportion of samples whose class is the positive class, in each
+        bin (fraction of positives).
+
+    prob_pred : ndarray of shape (num_bins,) or smaller
+        The mean estimated probability in each bin.
+
+    References
+    ----------
+    Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good
+    Probabilities With Supervised Learning, in Proceedings of the 22nd
+    International Conference on Machine Learning (ICML).
+    """
+
+    if estimates.min() < 0 or estimates.max() > 1:
+        raise ValueError("estimates contains values outside the [0, 1] interval.")
+
+    labels = np.unique(targets)
+    if len(labels) > 2:
+        raise ValueError(f"Only binary classification is supported. Provided labels {labels}.")
+    targets = targets == pos_label
+
+    if strategy == "quantile":  # Determine bin edges by distribution of data
+        quantiles = np.linspace(0, 1, num_bins + 1)
+        bins = np.percentile(estimates, quantiles * 100)
+    elif strategy == "uniform":
+        bins = np.linspace(0.0, 1.0, num_bins + 1)
+    else:
+        raise ValueError("Invalid entry to 'strategy' input. Strategy must be either 'quantile' or 'uniform'.")
+
+    binids = np.searchsorted(bins[1:-1], estimates)
+
+    bin_sums = np.bincount(binids, weights=estimates, minlength=len(bins))
+    bin_true = np.bincount(binids, weights=targets, minlength=len(bins))
+    bin_total = np.bincount(binids, minlength=len(bins))
+
+    nonzero = bin_total != 0
+    prob_true = bin_true[nonzero] / bin_total[nonzero]
+    prob_pred = bin_sums[nonzero] / bin_total[nonzero]
+
+    return prob_true, prob_pred
diff --git a/bayesflow/utils/classification/confusion_matrix.py b/bayesflow/utils/classification/confusion_matrix.py
new file mode 100644
index 000000000..4aadda28f
--- /dev/null
+++ b/bayesflow/utils/classification/confusion_matrix.py
@@ -0,0 +1,61 @@
+from typing import Sequence
+
+import numpy as np
+
+
+def confusion_matrix(targets: np.ndarray, estimates: np.ndarray, labels: Sequence = None, normalize: str = None):
+    """
+    Compute a confusion matrix to evaluate the accuracy of a classification or model comparison setting.
+
+    Code inspired by: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/metrics/_classification.py
+
+    Parameters
+    ----------
+    targets : np.ndarray
+        Ground truth (correct) target values.
+    estimates : np.ndarray
+        Estimated targets as returned by a classifier.
+    labels : Sequence, optional
+        List of labels to index the matrix. This may be used to reorder or select a subset of labels.
+        If None, labels that appear at least once in targets or estimates are used in sorted order.
+    normalize : {'true', 'pred', 'all'}, optional
+        Normalizes the confusion matrix over the true (rows), predicted (columns)
+        conditions, or the entire population. If None, no normalization is applied.
+
+    Returns
+    -------
+    cm : np.ndarray of shape (num_labels, num_labels)
+        Confusion matrix. Rows represent true classes, columns represent predicted classes.
+    """
+
+    # Get unique labels
+    if labels is None:
+        labels = np.unique(np.concatenate((targets, estimates)))
+    else:
+        labels = np.asarray(labels)
+
+    label_to_index = {label: i for i, label in enumerate(labels)}
+    num_labels = len(labels)
+
+    # Initialize the confusion matrix
+    cm = np.zeros((num_labels, num_labels), dtype=np.int64)
+
+    # Fill confusion matrix
+    for t, p in zip(targets, estimates):
+        if t in label_to_index and p in label_to_index:
+            cm[label_to_index[t], label_to_index[p]] += 1
+
+    # Normalize if required (rows or columns with a zero sum are left as zeros)
+    if normalize == "true":
+        cm = cm.astype(np.float64)
+        row_sums = cm.sum(axis=1, keepdims=True)
+        cm = np.divide(cm, row_sums, out=np.zeros_like(cm), where=row_sums != 0)
+    elif normalize == "pred":
+        cm = cm.astype(np.float64)
+        col_sums = cm.sum(axis=0, keepdims=True)
+        cm = np.divide(cm, col_sums, out=np.zeros_like(cm), where=col_sums != 0)
+    elif normalize == "all":
+        cm = cm.astype(np.float64)
+        cm /= cm.sum()
+
+    return cm
diff --git a/pyproject.toml b/pyproject.toml
index 5f83611f2..8018b55a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,6 @@ dependencies = [
     "matplotlib",
     "numpy >= 1.24, <2.0",
    "pandas",
-    "scikit-learn",
    "scipy",
    "seaborn",
    "tqdm",
diff --git a/tests/test_diagnostics/conftest.py b/tests/test_diagnostics/conftest.py
index 9bd9bd907..5103ea455 100644
--- a/tests/test_diagnostics/conftest.py
+++ b/tests/test_diagnostics/conftest.py
@@ -8,6 +8,16 @@ def var_names():
     return [r"$\beta_0$", r"$\beta_1$", r"$\sigma$"]
 
 
+@pytest.fixture()
+def random_samples_a():
+    return np.random.normal(loc=0, scale=1, size=(5000, 8))
+
+
+@pytest.fixture()
+def random_samples_b():
+    return np.random.normal(loc=0, scale=3, size=(5000, 8))
+
+
 @pytest.fixture()
 def random_estimates():
     return {
diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
index 792cf558a..4fb0945b3 100644
--- a/tests/test_diagnostics/test_diagnostics_metrics.py
+++ b/tests/test_diagnostics/test_diagnostics_metrics.py
@@ -50,6 +50,14 @@ def test_root_mean_squared_error(random_estimates, random_targets):
     assert out["variable_names"] == ["beta_0", "beta_1", "sigma"]
 
 
+def test_classifier_two_sample_test(random_samples_a, random_samples_b):
+    metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_a)
+    assert 0.55 > metric > 0.45
+
+    metric = bf.diagnostics.metrics.classifier_two_sample_test(estimates=random_samples_a, targets=random_samples_b)
+    assert metric > 0.55
+
+
 def test_expected_calibration_error(pred_models, true_models, model_names):
     out = bf.diagnostics.metrics.expected_calibration_error(pred_models, true_models, model_names=model_names)
     assert list(out.keys()) == ["values", "metric_name", "model_names"]
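
For reference, a minimal usage sketch of the new metric introduced by this diff (not part of the patch itself; the sample arrays and keyword values below are illustrative and mirror the call pattern exercised in the added tests):

import numpy as np
import bayesflow as bf

# Illustrative draws standing in for approximate and reference posterior samples
approx_draws = np.random.normal(loc=0.0, scale=1.0, size=(5000, 8))
reference_draws = np.random.normal(loc=0.0, scale=1.0, size=(5000, 8))

# Scalar validation accuracy; values near 0.5 mean the classifier cannot separate the two sample sets
score = bf.diagnostics.metrics.classifier_two_sample_test(estimates=approx_draws, targets=reference_draws)

# Full output: final score, the trained Keras classifier, and the training history
results = bf.diagnostics.metrics.classifier_two_sample_test(
    estimates=approx_draws,
    targets=reference_draws,
    return_metric_only=False,
    mlp_widths=(32, 32),
)
print(results["score"])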
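
Similarly, a small sketch of the scikit-learn-free helpers that replace the previous sklearn calls (again illustrative only; the random labels and probabilities are made up for the example):

import numpy as np
from bayesflow.utils import calibration_curve, confusion_matrix

rng = np.random.default_rng(seed=0)
y_true = rng.integers(0, 2, size=1000)
# Noisy probabilities that correlate with the true labels, clipped to [0, 1]
y_prob = np.clip(0.6 * y_true + rng.normal(loc=0.2, scale=0.2, size=1000), 0.0, 1.0)

# Fraction of positives and mean predicted probability per bin (empty bins are dropped)
prob_true, prob_pred = calibration_curve(targets=y_true, estimates=y_prob, num_bins=10)

# Row-normalized confusion matrix computed from hard predictions
y_pred = (y_prob > 0.5).astype(int)
cm = confusion_matrix(targets=y_true, estimates=y_pred, normalize="true")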