Commit f1cf295

[API 2] add PDP (#517)
* minimal PDP
* docstring
* plot at end of cells
* remove print, try fix coverage
* add API page
* fix docstring
* typo
* remove left right
* fix plot
* clean example
1 parent db3ccc6 commit f1cf295

4 files changed: +232 -0 lines changed

docs/src/api.rst (10 additions, 0 deletions):

@@ -45,6 +45,16 @@ Feature Importance functions
    ensemble_clustered_inference
    ensemble_clustered_inference_pvalue

+Visualization
+=============
+
+.. autosummary::
+   :toctree: ./generated/api/class/
+   :template: class.rst
+
+   ~visualization.PDP
+
 Samplers
 ========

New file (92 additions, 0 deletions):

@@ -0,0 +1,92 @@
"""
Visualization with Partial Dependence Plots
===========================================

This example demonstrates how to create Partial Dependence Plots (PDPs). This
visualization method lets you examine a model's dependence on a single feature
or on a pair of features. The underlying implementation is built upon
sklearn.inspection.partial_dependence, which calculates the dependence by
taking the average response of an estimator across all possible values of the
target feature(s). We'll use the circles dataset to illustrate the basic
usage.
"""

# %%
# Loading the circles dataset
# ---------------------------
# We start by sampling a synthetic dataset using the `make_circles` function
# from `sklearn.datasets`.

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import make_circles

X, y = make_circles(n_samples=500, noise=0.1, factor=0.7, random_state=0)

# Visualizing the dataset
_, ax = plt.subplots()
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, ax=ax)
ax.set_xlabel("X0")
ax.set_ylabel("X1")
sns.despine(ax=ax)
c1 = plt.Circle((0, 0), 0.85, color="k", ls="--", fill=False, label="class boundary")
ax.add_patch(c1)
_ = ax.legend(loc="upper right")


# %%
# Training a classifier
# ---------------------
# Next, we train a model to solve the binary classification task presented by
# the non-linearly separable circles dataset. For this example, we'll use a
# gradient boosted tree ensemble, specifically the
# HistGradientBoostingClassifier from scikit-learn.

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model = HistGradientBoostingClassifier(random_state=0)

model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)

auc = roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])
print(f"ROC AUC on the test set: {auc:.2f}")


# %%
# Partial Dependence for an Individual Feature
# --------------------------------------------
# Once the model is fitted, we use the Partial Dependence Plot (PDP) to
# visualize its dependence on a single input feature (e.g., the first feature,
# :math:`X_0`). The resulting plot shows the average response of the model (on
# the :math:`y`-axis) for each possible value of the selected feature (on the
# :math:`x`-axis), with the averaging performed over all other features in the
# dataset.
#
# The plot also includes the marginal distribution of the selected feature
# along the :math:`x`-axis. This feature distribution is essential for
# identifying low-density regions in the data, where model predictions and the
# estimated partial dependence can be less reliable or extrapolated.


from hidimstat.visualization import PDP

# sphinx_gallery_thumbnail_number = 2
pdp = PDP(model)
_ = pdp.plot(X_test, features=0)


# %%
# Partial Dependence on a Pair of Features
# ----------------------------------------
# We can similarly visualize the dependence of the model on a pair of features
# (e.g., :math:`X_0` and :math:`X_1`). Here, the partial dependence is encoded
# by contour lines (level lines) across the 2D plot. The marginal distribution
# of each feature is also shown along the axes to help identify regions where
# the estimated dependence might be unreliable due to a low density of
# training data.

axes = pdp.plot(X_test, features=[0, 1], cmap="RdBu_r")
c1 = plt.Circle((0, 0), 0.85, color="k", ls="--", fill=False, zorder=10)
_ = axes[1, 0].add_patch(c1)
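
Not part of the commit, but to make the averaging concrete: the sketch below recomputes the 1D partial dependence by hand, pinning the feature of interest to each grid value and averaging the predicted probability over the dataset. It forces `method="brute"` so that scikit-learn also averages `predict_proba`; with the default method, gradient-boosted trees use the faster recursion algorithm, which works on the decision function and would not match.

import numpy as np
from sklearn.datasets import make_circles
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.inspection import partial_dependence

X, y = make_circles(n_samples=500, noise=0.1, factor=0.7, random_state=0)
model = HistGradientBoostingClassifier(random_state=0).fit(X, y)

# method="brute" makes sklearn average predict_proba over the rows of X,
# which is exactly what the loop below does.
pd_sklearn = partial_dependence(model, X, features=[0], method="brute")
grid = pd_sklearn["grid_values"][0]

manual = np.empty_like(grid)
for i, value in enumerate(grid):
    X_pinned = X.copy()
    X_pinned[:, 0] = value  # pin feature 0 to the current grid value
    # average response over the empirical distribution of the other features
    manual[i] = model.predict_proba(X_pinned)[:, 1].mean()

print(np.allclose(manual, pd_sklearn["average"][0]))  # True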
New file (3 additions, 0 deletions):

@@ -0,0 +1,3 @@
from .partial_dependence_plot import PDP

__all__ = ["PDP"]
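
The implementation that follows indexes the result of `sklearn.inspection.partial_dependence` via the keys `grid_values` and `average`. As a quick orientation, here is a minimal probe of that return structure, assuming scikit-learn >= 1.3 (older releases exposed the grid under `values` instead of `grid_values`); the dataset and classifier are arbitrary:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import partial_dependence

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X, y)

pd_1d = partial_dependence(clf, X, features=[0])
print(pd_1d["grid_values"][0].shape)  # (100,): grid of feature-0 values
print(pd_1d["average"].shape)         # (1, 100): one output, one value per grid point

pd_2d = partial_dependence(clf, X, features=[0, 1])
print(len(pd_2d["grid_values"]))      # 2: one grid per feature
print(pd_2d["average"][0].shape)      # (100, 100): PD surface over the two grids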
New file (127 additions, 0 deletions):

@@ -0,0 +1,127 @@
from copy import copy

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.inspection import partial_dependence


class PDP:
    r"""
    Partial Dependence Plot (PDP) visualization. This class is based on
    `sklearn.inspection.partial_dependence` to compute the partial dependence
    values and provides methods to plot 1D and 2D PDPs. For each realization
    of a feature or pair of features :math:`x_S`, the partial dependence
    :math:`f_S(x_S)` is defined as
    :math:`f_S(x_S) = \mathbb{E}_{X_{-S}}[f(x_S, X_{-S})]`,
    where :math:`X_{-S}` denotes all features except those in :math:`S`.

    Parameters
    ----------
    estimator : object
        A fitted scikit-learn estimator implementing `predict` or
        `predict_proba`.
    feature_names : list of str, optional
        Names of the features. If None, X0, X1, ... will be used.
    """

    def __init__(self, estimator, feature_names=None):
        self.estimator = estimator
        self.feature_names = feature_names

    def plot(self, X, features, cmap="viridis", **kwargs):
        """
        Plot the Partial Dependence Plot for the specified feature (1D) or
        pair of features (2D). The marginal distribution of the feature(s) is
        also displayed.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data used to compute the partial dependence.
        features : int or list of int
            The feature index (for a 1D PDP) or list of two feature indices
            (for a 2D PDP).
        cmap : str, optional
            The colormap to use for the plot (2D PDP only). Default is
            "viridis".
        **kwargs : additional keyword arguments
            Additional keyword arguments passed to `sns.lineplot` for a 1D PDP
            or to `ax.contour` for a 2D PDP.
        """
        if isinstance(features, int):
            feature_ids = [features]
            plotting_func = self._plot_1d
        elif isinstance(features, list):
            if len(features) != 2:
                raise ValueError("Only 1D and 2D PDP plots are supported")
            feature_ids = copy(features)
            plotting_func = self._plot_2d
        else:
            raise TypeError("features must be an int or a list of two ints")

        if self.feature_names is not None:
            feature_names = [self.feature_names[idx] for idx in feature_ids]
        else:
            feature_names = [f"X{idx}" for idx in feature_ids]

        pd = partial_dependence(self.estimator, X, features=features)
        return plotting_func(pd, feature_names, cmap=cmap, **kwargs)

    @staticmethod
    def _plot_1d(pd, feature_names, cmap=None, **kwargs):
        # cmap is accepted for API symmetry with _plot_2d but is unused here.
        _, axes = plt.subplots(2, 1, height_ratios=[0.2, 1])

        # Marginal distribution of the feature (top)
        ax = axes[0]
        sns.kdeplot(pd["grid_values"][0], ax=ax, legend=False, fill=True)
        sns.rugplot(pd["grid_values"][0], ax=ax, height=0.25, legend=False)
        sns.despine(ax=ax, left=True)
        ax.spines["left"].set_visible(False)
        ax.spines["bottom"].set_visible(True)
        ax.xaxis.set_ticks([])
        ax.yaxis.set_visible(False)

        # Partial dependence curve (bottom)
        ax = axes[1]
        sns.lineplot(x=pd["grid_values"][0], y=pd["average"][0], ax=ax, **kwargs)
        ax.set_xlabel(feature_names[0])
        ax.set_ylabel("Partial Dependence")
        sns.despine(ax=ax)
        plt.tight_layout()
        return axes

    @staticmethod
    def _plot_2d(pd, feature_names, cmap="viridis", **kwargs):
        x = pd["grid_values"][0]
        y = pd["grid_values"][1]
        z = pd["average"][0]

        xx, yy = np.meshgrid(x, y, indexing="ij")

        _, axes = plt.subplots(
            2, 2, figsize=(8, 6), height_ratios=[0.2, 1], width_ratios=[1, 0.2]
        )
        # Contour plot of the partial dependence over the feature grid
        ax = axes[1, 0]
        contour = ax.contour(xx, yy, z, cmap=cmap, **kwargs)
        ax.set_xlabel(feature_names[0])
        ax.set_ylabel(feature_names[1])
        ax.clabel(contour, inline=True, fontsize=10)
        sns.despine(ax=ax)

        # Marginal distribution of the first feature (top)
        ax = axes[0, 0]
        sns.kdeplot(x, ax=ax, legend=False, fill=True)
        sns.rugplot(x, ax=ax, height=0.25, legend=False)
        sns.despine(ax=ax)
        ax.spines["left"].set_visible(False)
        ax.spines["bottom"].set_visible(True)
        ax.xaxis.set_ticks([])
        ax.yaxis.set_visible(False)

        # Marginal distribution of the second feature (right)
        ax = axes[1, 1]
        sns.kdeplot(y=y, ax=ax, legend=False, fill=True)
        sns.rugplot(y=y, ax=ax, height=0.25, legend=False)
        sns.despine(ax=ax)
        ax.spines["bottom"].set_visible(False)
        ax.yaxis.set_ticks([])
        ax.xaxis.set_visible(False)

        # Hide the unused corner axes
        axes[0, 1].remove()
        plt.tight_layout()
        return axes
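
To round off, a hypothetical usage sketch of the class above, showing the `feature_names` labels and the keyword pass-through described in the docstring; the feature names are illustrative, not from the commit:

import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.ensemble import HistGradientBoostingClassifier

from hidimstat.visualization import PDP

X, y = make_circles(n_samples=500, noise=0.1, factor=0.7, random_state=0)
model = HistGradientBoostingClassifier(random_state=0).fit(X, y)

# Custom names replace the default "X0"/"X1" axis labels (illustrative names).
pdp = PDP(model, feature_names=["radius_x", "radius_y"])

# 1D: extra keyword arguments are forwarded to sns.lineplot.
pdp.plot(X, features=0, linewidth=2)

# 2D: extra keyword arguments are forwarded to ax.contour.
pdp.plot(X, features=[0, 1], cmap="RdBu_r", levels=10)
plt.show()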
