.. _conditional_feature_importance:


==============================
Conditional Feature Importance
==============================

Conditional Feature Importance (CFI) is a model-agnostic approach for quantifying the
relevance of individual features, or groups of features, in predictive models. It is a
perturbation-based method that compares the predictive performance of a model on
unmodified test data (following the same distribution as the training data) to its
performance when the studied feature is conditionally perturbed. Thus, this approach
does not require retraining the model.

.. figure:: ../generated/gallery/examples/images/sphx_glr_plot_cfi_001.png
   :target: ../generated/gallery/examples/plot_cfi.html
   :align: center


Theoretical index
------------------

Conditional Feature Importance (CFI) is a model-agnostic method for estimating feature
importance through conditional perturbations. Specifically, it constructs a perturbed
version of the feature, :math:`X_j^p`, sampled independently from the conditional
distribution :math:`P(X_j | X_{-j})`, such that its association with the output is removed:
:math:`X_j^p \perp\!\!\!\perp Y \mid X_{-j}`. The predictive model is then evaluated on the
modified feature vector :math:`\tilde X = [X_1, ..., X_j^p, ..., X_p]`, and the
importance of the feature is quantified by the resulting drop in model performance:

.. math::
    \psi_j^{CFI} = \mathbb{E} [\mathcal{L}(y, \mu(\tilde X))] - \mathbb{E} [\mathcal{L}(y, \mu(X))].

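
In practice, these expectations are replaced by empirical averages over a held-out test
set. A natural empirical counterpart (a sketch, with :math:`n` test samples
:math:`(x_i, y_i)` and :math:`\tilde x_i` the conditionally perturbed version of
:math:`x_i`) is

.. math::
    \hat\psi_j^{CFI} = \frac{1}{n} \sum_{i=1}^{n} \Big[ \mathcal{L}\big(y_i, \mu(\tilde x_i)\big) - \mathcal{L}\big(y_i, \mu(x_i)\big) \Big].
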
|
The target quantity estimated by CFI is the Total Sobol Index (TSI), see
:ref:`total_sobol_index`. Indeed,

.. math::
    \frac{1}{2} \psi_j^{CFI}
    = \psi_j^{TSI}
    = \mathbb{E} [\mathcal{L}(y, \mu_{-j}(X_{-j}))] - \mathbb{E} [\mathcal{L}(y, \mu(X))],

where, in regression, :math:`\mu_{-j}(X_{-j}) = \mathbb{E}[Y | X_{-j}]` is the
theoretical model that does not use the :math:`j^{th}` feature.

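
To see where the factor :math:`\tfrac{1}{2}` comes from, here is a short sketch assuming a
squared loss and an additive noise model :math:`Y = \mu(X) + \epsilon` with
:math:`\epsilon` independent of :math:`X`. Because :math:`X_j^p` is drawn from
:math:`P(X_j | X_{-j})` independently of :math:`X_j`, the predictions :math:`\mu(X)` and
:math:`\mu(\tilde X)` are, conditionally on :math:`X_{-j}`, independent with the same
distribution and common conditional mean :math:`\mu_{-j}(X_{-j})`. Hence, since the noise
term cancels in the loss difference,

.. math::
    \psi_j^{CFI}
    = \mathbb{E}\big[(\mu(X) - \mu(\tilde X))^2\big]
    = 2\, \mathbb{E}\big[\operatorname{Var}(\mu(X) \mid X_{-j})\big]
    = 2\, \mathbb{E}\big[(\mu(X) - \mu_{-j}(X_{-j}))^2\big]
    = 2\, \psi_j^{TSI}.
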
Estimation procedure
--------------------

The estimation of CFI relies on the ability to sample the perturbed feature matrix
:math:`\tilde X`, and specifically to sample :math:`X_j^p` independently from the conditional
distribution, :math:`X_j^p \overset{\text{i.i.d.}}{\sim} P(X_j | X_{-j})`, while breaking the
association with the output :math:`Y`. Any conditional sampler can be used. A valid
and efficient approach is conditional permutation (:footcite:t:`Chamma_NeurIPS2023`).
This procedure decomposes the :math:`j^{th}` feature into a part that
is predictable from the other features and a residual term that is
independent of the other features:

.. math::
    X_j = \nu_j(X_{-j}) + \epsilon_j, \quad \text{with} \quad \epsilon_j \perp\!\!\!\perp X_{-j} \text{ and } \mathbb{E}[\epsilon_j] = 0.

Here :math:`\nu_j(X_{-j}) = \mathbb{E}[X_j | X_{-j}]` is the conditional expectation of
:math:`X_j` given the other features. In practice, :math:`\nu_j` is unknown and has to be
estimated from the data using a predictive model.

The perturbed feature :math:`X_j^p` is then generated by keeping the predictable part
:math:`\nu_j(X_{-j})` unchanged and replacing the residual :math:`\epsilon_j` with a
randomly permuted version :math:`\epsilon_j^p`:

.. math::
    X_j^p = \nu_j(X_{-j}) + \epsilon_j^p, \quad \text{with} \quad \epsilon_j^p \sim \text{Perm}(\epsilon_j).

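
As an illustration, here is a minimal sketch of this conditional permutation step for a
single feature, written with plain NumPy and scikit-learn rather than the hidimstat API
(the function name and the choice of a linear model for :math:`\nu_j` are illustrative;
for brevity the conditional model is fitted on the evaluation data itself, whereas in the
examples below it is fitted on the training split, and the permutation is typically
repeated and averaged)::

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error

    def conditional_permutation_importance(model, X_test, y_test, j, rng):
        """Sketch of the CFI estimate for feature ``j`` with a squared loss."""
        X_minus_j = np.delete(X_test, j, axis=1)

        # 1. Model the predictable part nu_j(X_{-j}) = E[X_j | X_{-j}].
        nu_j = LinearRegression().fit(X_minus_j, X_test[:, j])
        predictable = nu_j.predict(X_minus_j)

        # 2. Permute the residuals, which are (approximately) independent of X_{-j}.
        residuals = X_test[:, j] - predictable
        X_perturbed = X_test.copy()
        X_perturbed[:, j] = predictable + rng.permutation(residuals)

        # 3. Importance = increase in loss under the conditional perturbation.
        loss_reference = mean_squared_error(y_test, model.predict(X_test))
        loss_perturbed = mean_squared_error(y_test, model.predict(X_perturbed))
        return loss_perturbed - loss_reference

    # Usage: importance_j = conditional_permutation_importance(
    #     model, X_test, y_test, j=0, rng=np.random.default_rng(0))
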
|
.. note:: **Estimation of** :math:`\nu_j`

    To generate the perturbed feature :math:`X_j^p`, a model for :math:`\nu_j` is required.
    Estimating :math:`\nu_j` amounts to modeling the relationship between features, which is
    arguably an easier task than estimating the relationship between the features and the
    target. This 'model-X' assumption is discussed, for instance, in
    :footcite:t:`Chamma_NeurIPS2023` and :footcite:t:`candes2018panning`.
    For example, in genetics, features such as single nucleotide polymorphisms (SNPs)
    are the basis of complex biological processes that result in an outcome (phenotype),
    such as a disease. Predicting the phenotype from SNPs is challenging, whereas
    modeling the relationships between SNPs is often easier due to known correlation
    structures in the genome (linkage disequilibrium). As a result, simple predictive
    models such as regularized linear models or decision trees can be used to estimate
    :math:`\nu_j`.

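
In the hidimstat examples below, the estimator used for :math:`\nu_j` on continuous
features is the one passed through the ``imputation_model_continuous`` argument. For
instance, swapping in a cross-validated ridge regression is a one-line change (a sketch;
``model`` stands for an already-fitted predictive model, as in the regression example
below)::

    from sklearn.linear_model import RidgeCV
    from hidimstat import CFI

    # `model` is an already-fitted predictive model, as in the regression example below.
    cfi = CFI(estimator=model, imputation_model_continuous=RidgeCV())

Any other scikit-learn-compatible regressor can be substituted in the same way.
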
Inference
---------
Under standard assumptions, such as an additive noise model :math:`Y = \mu(X) + \epsilon`,
Conditional Feature Importance (CFI) allows for conditional independence testing, which
determines whether a feature provides any unique information for the model's predictions
that is not already captured by the other features. Essentially, we test whether the
output is independent of the studied feature given the rest of the input:

.. math::
    \mathcal{H}_0: Y \perp\!\!\!\perp X_j | X_{-j}.

The core of this inference is to test the statistical significance of the loss
differences estimated by CFI. Consequently, a one-sample test on the loss differences
(or a paired test on the losses) needs to be performed.

Two technical challenges arise in this context:

* When cross-validation (for instance, k-fold) is used to estimate CFI, the loss
  differences obtained from different folds are not independent. Consequently,
  performing a simple t-test on the loss differences is not valid. This issue can be
  addressed by a corrected t-test accounting for this dependence, such as the one
  proposed in :footcite:t:`nadeau1999inference`; a sketch of this correction is given
  after this list.
* Vanishing variance: under the null hypothesis, the loss difference converges to zero,
  but its variance also vanishes because the importance is a quadratic functional
  (:footcite:t:`verdinelli2024feature`). This makes the standard one-sample t-test
  invalid. This second issue can be handled by correcting the variance estimate or by
  using other nonparametric tests.

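
As an illustration of the first point, here is a minimal sketch of the corrected
resampled t-test of :footcite:t:`nadeau1999inference` applied to per-split loss
differences (the function name and arguments are illustrative, not the hidimstat
implementation; ``n_train`` and ``n_test`` are the sizes of the training and test parts
of each split)::

    import numpy as np
    from scipy import stats

    def corrected_ttest(loss_diff_per_split, n_train, n_test):
        """One-sided corrected resampled t-test on per-split CFI loss differences."""
        d = np.asarray(loss_diff_per_split, dtype=float)
        k = d.shape[0]
        # The naive variance factor 1/k is replaced by 1/k + n_test/n_train to account
        # for the dependence induced by overlapping training sets across splits.
        corrected_var = (1.0 / k + n_test / n_train) * d.var(ddof=1)
        t_stat = d.mean() / np.sqrt(corrected_var)
        p_value = stats.t.sf(t_stat, df=k - 1)  # H1: the importance is positive
        return t_stat, p_value

The second challenge (the vanishing variance under the null) is not addressed by this
correction and requires the variance adjustments or nonparametric alternatives mentioned
above.
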
Regression example
------------------
The following example illustrates the use of CFI on a regression task::

    >>> from sklearn.datasets import make_regression
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.model_selection import train_test_split
    >>> from hidimstat import CFI

    >>> X, y = make_regression(n_features=2)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
    >>> model = LinearRegression().fit(X_train, y_train)

    >>> cfi = CFI(estimator=model, imputation_model_continuous=LinearRegression())
    >>> cfi = cfi.fit(X_train, y_train)
    >>> features_importance = cfi.importance(X_test, y_test)

Classification example
----------------------
To measure feature importance in a classification task, a classification loss should be
used. In addition, the prediction method of the estimator should output the corresponding
type of prediction (probabilities or classes). The following example illustrates the use
of CFI on a classification task::

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.metrics import log_loss
    >>> from sklearn.model_selection import train_test_split
    >>> from hidimstat import CFI

    >>> X, y = make_classification(n_features=4)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
    >>> model = RandomForestClassifier().fit(X_train, y_train)
    >>> cfi = CFI(
    ...     estimator=model,
    ...     imputation_model_continuous=LinearRegression(),
    ...     loss=log_loss,
    ...     method="predict_proba",
    ... )
    >>> cfi = cfi.fit(X_train, y_train)
    >>> features_importance = cfi.importance(X_test, y_test)

References
----------
.. footbibliography::