From d4caef58faf87edaf7684d69a3cec2cc6e504a3e Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Wed, 27 Aug 2025 19:18:23 +0200
Subject: [PATCH 1/4] New marginal method base on scikitlearn

---
 docs/src/api.rst                              |  11 +
 docs/tools/references.bib                     |  47 +++
 src/hidimstat/__init__.py                     |  13 +
 src/hidimstat/marginal/__init__.py            |  13 +
 .../selection_variable_scikit_learn.py        | 302 ++++++++++++++++
 ...on_variable_scikit_learn_classification.py | 342 ++++++++++++++++++
 ...ection_variable_scikit_learn_regression.py | 229 ++++++++++++
 7 files changed, 957 insertions(+)
 create mode 100644 src/hidimstat/marginal/__init__.py
 create mode 100644 src/hidimstat/marginal/selection_variable_scikit_learn.py
 create mode 100644 test/marginal/test_selection_variable_scikit_learn_classification.py
 create mode 100644 test/marginal/test_selection_variable_scikit_learn_regression.py

diff --git a/docs/src/api.rst b/docs/src/api.rst
index 51074f1dc..77d82cdb1 100644
--- a/docs/src/api.rst
+++ b/docs/src/api.rst
@@ -40,3 +40,14 @@ Classes
    CFI
    PFI
    D0CRT
+
+Marginal Importance
+===================
+.. autosummary::
+   :toctree: ./generated/api/marginal
+   :template: class.rst
+
+   ANOVA
+   AnalysisOfVariance
+   UnivariateLinearRegressionTests
+   MutualInformation
\ No newline at end of file
diff --git a/docs/tools/references.bib b/docs/tools/references.bib
index fe04dc12f..092f5aa68 100644
--- a/docs/tools/references.bib
+++ b/docs/tools/references.bib
@@ -135,6 +135,22 @@ @article{chevalier_statistical_2020
   year    = {2020}
 }
 
+@article{chow1960tests,
+  title     = {Tests of equality between sets of coefficients in two linear regressions},
+  author    = {Chow, Gregory C},
+  journal   = {Econometrica: Journal of the Econometric Society},
+  pages     = {591--605},
+  year      = {1960},
+  publisher = {JSTOR}
+}
+
+@book{cover1999elements,
+  title     = {Elements of information theory},
+  author    = {Cover, Thomas M},
+  year      = {1999},
+  publisher = {John Wiley \& Sons}
+}
+
 @article{eshel2003yule,
   author  = {Eshel, Gidon},
   journal = {Internet resource},
@@ -155,6 +171,15 @@ @article{fan2012variance
   year      = {2012}
 }
 
+@incollection{fisher1970statistical,
+  title     = {Statistical methods for research workers},
+  author    = {Fisher, Ronald Aylmer},
+  booktitle = {Breakthroughs in statistics: Methodology and distribution},
+  pages     = {66--70},
+  year      = {1970},
+  publisher = {Springer}
+}
+
 @article{gaonkar_deriving_2012,
   author  = {Gaonkar, Bilwaj and Davatzikos, Christos},
   journal = {International Conference on Medical Image Computing and Computer-Assisted Intervention},
@@ -180,6 +205,17 @@ @article{hirschhorn2005genome
   year      = {2005}
 }
 
+@article{larson2008analysis,
+  title     = {Analysis of variance},
+  author    = {Larson, Martin G},
+  journal   = {Circulation},
+  volume    = {117},
+  number    = {1},
+  pages     = {115--121},
+  year      = {2008},
+  publisher = {Lippincott Williams \& Wilkins}
+}
+
 @article{lei2018distribution,
   title     = {Distribution-free predictive inference for regression},
   author    = {Lei, Jing and G'Sell, Max and Rinaldo, Alessandro and Tibshirani, Ryan J and Wasserman, Larry},
@@ -269,6 +305,17 @@ @article{Ren_2023
   year    = {2023}
 }
 
+@article{shannon1948mathematical,
+  title     = {A mathematical theory of communication},
+  author    = {Shannon, Claude E},
+  journal   = {The Bell system technical journal},
+  volume    = {27},
+  number    = {3},
+  pages     = {379--423},
+  year      = {1948},
+  publisher = {Nokia Bell Labs}
+}
+
 @article{stroblConditionalVariableImportance2008,
   author  = {Strobl, Carolin and Boulesteix, Anne-Laure and Kneib, Thomas and Augustin, Thomas and Zeileis, Achim},
   doi     = {10.1186/1471-2105-9-307},
diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py
index 81d5a0cce..b57abeb77 100644
--- a/src/hidimstat/__init__.py
+++ b/src/hidimstat/__init__.py
@@ -27,6 +27,15 @@
 
 from .statistical_tools.aggregation import quantile_aggregation
 
+# marginal methods
+from .marginal import (
+    AdapterScikitLearn,  # for documentation
+    AnalysisOfVariance,  # for documentation
+    UnivariateLinearRegressionTests,
+    MutualInformation,
+)
+from .marginal import AnalysisOfVariance as ANOVA
+
 try:
     from ._version import __version__
 except ImportError:
@@ -51,4 +60,8 @@
     "CFI",
     "LOCO",
     "PFI",
+    # marginal
+    "ANOVA",
+    "UnivariateLinearRegressionTests",
+    "MutualInformation",
 ]
diff --git a/src/hidimstat/marginal/__init__.py b/src/hidimstat/marginal/__init__.py
new file mode 100644
index 000000000..afd1e43e4
--- /dev/null
+++ b/src/hidimstat/marginal/__init__.py
@@ -0,0 +1,13 @@
+from .selection_variable_scikit_learn import (
+    AdapterScikitLearn,
+    AnalysisOfVariance,
+    UnivariateLinearRegressionTests,
+    MutualInformation,
+)
+
+__all__ = [
+    "AdapterScikitLearn",
+    "AnalysisOfVariance",
+    "UnivariateLinearRegressionTests",
+    "MutualInformation",
+]
\ No newline at end of file
diff --git a/src/hidimstat/marginal/selection_variable_scikit_learn.py b/src/hidimstat/marginal/selection_variable_scikit_learn.py
new file mode 100644
index 000000000..6be5a645a
--- /dev/null
+++ b/src/hidimstat/marginal/selection_variable_scikit_learn.py
@@ -0,0 +1,302 @@
+from typing import override
+import warnings
+
+from sklearn.feature_selection import (
+    f_classif,
+    f_regression,
+    mutual_info_classif,
+    mutual_info_regression,
+)
+
+from hidimstat.base_variable_importance import BaseVariableImportance
+
+
+class AdapterScikitLearn(BaseVariableImportance):
+    """
+    Adapter base class for scikit-learn feature selection methods.
+    This class provides a unified interface for scikit-learn feature selection
+    methods to be used within the hidimstat framework.
+    Notes
+    -----
+    Subclasses should implement the `importance` method.
+    """
+
+    def fit(self, X=None, y=None):
+        """
+        Fit the feature selection model to the data.
+        This method does nothing because fitting is not required for these
+        scikit-learn feature selection methods.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features), optional
+            (not used) Input data matrix.
+        y : array-like of shape (n_samples,), optional
+            (not used) Target values.
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        if X is not None:
+            warnings.warn("X won't be used")
+        if y is not None:
+            warnings.warn("y won't be used")
+        return self
+
+    def importance(self, X, y):
+        """
+        Return the computed feature importances.
+        This method should be implemented by subclasses to compute feature
+        importances for the given data.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data matrix.
+        y : array-like of shape (n_samples,)
+            Target values.
+        Raises
+        ------
+        NotImplementedError
+            This method should be implemented by subclasses.
+        """
+        raise NotImplementedError()
+
+    def fit_importance(self, X, y, cv=None):
+        """
+        Fit the model and compute feature importances.
+        This method fits the model (if necessary) and computes feature
+        importances for the given data.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data matrix.
+        y : array-like of shape (n_samples,)
+            Target values.
+        cv : None or int, optional
+            (not used) Cross-validation parameter.
+        Returns
+        -------
+        importances_ : ndarray
+            Feature importance scores.
+        """
+        if cv is not None:
+            warnings.warn("cv won't be used")
+        self.fit()
+        return self.importance(X, y)
+
+
+class AnalysisOfVariance(AdapterScikitLearn):
+    """
+    Analysis of Variance (ANOVA) :footcite:t:`fisher1970statistical` feature
+    selection for classification tasks.
+    This class uses scikit-learn's f_classif to compute F-statistics and p-values
+    for each feature. For a short summary of this method, see
+    :footcite:t:`larson2008analysis`.
+    Attributes
+    ----------
+    importances_ : ndarray
+        1 - p-values for each feature (higher is more important).
+    pvalues_ : ndarray
+        1 - p-values for each feature.
+    f_statitstic_ : ndarray
+        F-statistics for each feature.
+    Notes
+    -----
+    See sklearn.feature_selection.f_classif
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    @override
+    def importance(self, X, y):
+        """
+        Compute ANOVA F-statistics and p-values for each feature.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data matrix.
+        y : array-like of shape (n_samples,)
+            Target class labels.
+        Sets
+        ----
+        importances_ : ndarray
+            1 - p-values for each feature.
+        pvalues_ : ndarray
+            1 - p-values for each feature.
+        f_statitstic_ : ndarray
+            F-statistics for each feature.
+        Returns
+        -------
+        importances_ : ndarray
+            1 - p-values for each feature.
+        """
+        f_statistic, p_values = f_classif(X, y)
+        # Test the opposite hypothesis to the ANOVA
+        # Test the similarity in the distribution instead of the difference
+        self.importances_ = 1 - p_values
+        self.pvalues_ = 1 - p_values
+        self.f_statitstic_ = f_statistic
+        return self.importances_
+
+
+class UnivariateLinearRegressionTests(AdapterScikitLearn):
+    """
+    Univariate linear regression F-test for regression tasks.
+    This test is also known as the Chow test :footcite:t:`chow1960tests`.
+    Uses scikit-learn's f_regression to compute F-statistics and p-values for each feature.
+    Parameters
+    ----------
+    center : bool, default=True
+        If True, center the data before computing F-statistics.
+    force_finite : bool, default=True
+        If True, replace NaNs and infs in the output with finite numbers.
+    Attributes
+    ----------
+    importances_ : ndarray
+        1 - p-values for each feature.
+    pvalues_ : ndarray
+        1 - p-values for each feature.
+    f_statitstic_ : ndarray
+        F-statistics for each feature.
+    Notes
+    -----
+    See sklearn.feature_selection.f_regression
+    """
+
+    def __init__(self, center=True, force_finite=True):
+
+        super().__init__()
+        self.center = center
+        self.force_finite = force_finite
+
+    @override
+    def importance(self, X, y):
+        """
+        Compute univariate linear regression F-statistics and p-values for each feature.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data matrix.
+        y : array-like of shape (n_samples,)
+            Target values.
+        Sets
+        ----
+        importances_ : ndarray
+            1 - p-values for each feature.
+        pvalues_ : ndarray
+            1 - p-values for each feature.
+        f_statitstic_ : ndarray
+            F-statistics for each feature.
+        Returns
+        -------
+        importances_ : ndarray
+            1 - p-values for each feature.
+        """
+        f_statistic, p_values = f_regression(
+            X, y, center=self.center, force_finite=self.force_finite
+        )
+        # Test the opposite hypothesis to the anova
+        # Test the similarity in the distribution instead of the difference
+        self.importances_ = 1 - p_values
+        self.pvalues_ = 1 - p_values
+        self.f_statitstic_ = f_statistic
+        return self.importances_
+
+
+class MutualInformation(AdapterScikitLearn):
+    """
+    Mutual information feature selection for regression or classification.
+    This method was introduced by Shannon :footcite:t:`shannon1948mathematical`.
+    For an introduction, see section 2.4 of :footcite:t:`cover1999elements`.
+    Parameters
+    ----------
+    problem_type : {'regression', 'classification'}, default='regression'
+        Type of prediction problem.
+    discrete_features : 'auto' or array-like, default='auto'
+        Indicates which features are discrete.
+    n_neighbors : int, default=3
+        Number of neighbors to use for MI estimation.
+    random_state : int, default=None
+        Random seed for reproducibility.
+    n_jobs : int, default=1
+        Number of parallel jobs.
+    Attributes
+    ----------
+    importances_ : ndarray
+        Mutual information scores for each feature.
+    pvalues_ : None
+        P-values are not computed for mutual information.
+    Notes
+    -----
+    See sklearn.feature_selection.mutual_info_regression
+    See sklearn.feature_selection.mutual_info_classif
+    """
+
+    def __init__(
+        self,
+        problem_type="regression",
+        discrete_features="auto",
+        n_neighbors=3,
+        random_state=None,
+        n_jobs=1,
+    ):
+        super().__init__()
+        assert (
+            problem_type == "regression" or problem_type == "classification"
+        ), "the value of problem type should be 'regression' or 'classification'"
+        self.problem_type = problem_type
+        self.discrete_features = discrete_features
+        self.n_neighbors = n_neighbors
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+
+    @override
+    def importance(self, X, y):
+        """
+        Compute mutual information scores for each feature.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data matrix.
+        y : array-like of shape (n_samples,)
+            Target values.
+        Sets
+        ----
+        importances_ : ndarray
+            Mutual information scores for each feature.
+        pvalues_ : None
+            P-values are not computed for mutual information.
+        Returns
+        -------
+        importances_ : ndarray
+            Mutual information scores for each feature.
+        """
+        if self.problem_type == "regression":
+            mutual_information = mutual_info_regression(
+                X,
+                y,
+                discrete_features=self.discrete_features,
+                n_neighbors=self.n_neighbors,
+                copy=True,
+                random_state=self.random_state,
+                n_jobs=self.n_jobs,
+            )
+        elif self.problem_type == "classification":
+            mutual_information = mutual_info_classif(
+                X,
+                y,
+                discrete_features=self.discrete_features,
+                n_neighbors=self.n_neighbors,
+                copy=True,
+                random_state=self.random_state,
+                n_jobs=self.n_jobs,
+            )
+        else:
+            raise ValueError(
+                "the value of problem type should be 'regression' or 'classification'"
+            )
+        self.importances_ = mutual_information
+        self.pvalues_ = None
+
+        return self.importances_
diff --git a/test/marginal/test_selection_variable_scikit_learn_classification.py b/test/marginal/test_selection_variable_scikit_learn_classification.py
new file mode 100644
index 000000000..930829f3d
--- /dev/null
+++ b/test/marginal/test_selection_variable_scikit_learn_classification.py
@@ -0,0 +1,342 @@
+from copy import deepcopy
+import numpy as np
+import pytest
+
+from hidimstat import ANOVA, MutualInformation, AdapterScikitLearn
+
+
+class MutualInformationClassification(MutualInformation):
+    """Specify the class for classification problem"""
+
+    def __init__(
+        self,
+        discrete_features="auto",
+        n_neighbors=3,
+        random_state=None,
+        n_jobs=1,
+    ):
+        super().__init__(
+            problem_type="classification",
+            discrete_features=discrete_features,
+            n_neighbors=n_neighbors,
+            random_state=random_state,
+            n_jobs=n_jobs,
+        )
+
+
+def classification_float(y, nb_classes=4, min_value=None, max_value=None):
+    """
+    Create a classification problem based on a regression problem
+    Parameters
+    ----------
+    y : array-like of shape (n_samples,)
+        target for the prediction
+    nb_classes : int, default=4
+        number of classes
+    min_value : None or int, default=None
+        maximal value for the first class
+    max_value : None or int, default=None
+        minimal value for the last class
+    Returns
+    -------
+    array-like of shape (n_samples,)
+        classification for target
+    """
+    assert nb_classes >= 2
+    if min_value is None:
+        min_value = np.min(y)
+    if max_value is None:
+        max_value = np.max(y)
+    assert min_value < max_value
+    # change from regression to classification problem
+    y_ = deepcopy(y)
+    previous_value = min_value
+    for classe, value in enumerate(np.linspace(min_value, max_value, nb_classes)):
+        if value == min_value:
+            y_[np.where(y < min_value)] = classe
+        elif value == max_value:
+            y_[np.where(y >= max_value)] = classe
+        else:
+            y_[np.where(np.logical_and(previous_value <= y, y < value))] = classe
+        previous_value = value
+    y_ = np.array(y_, dtype=int)
+    return y_
+
+
+def configure_marginal_classication(
+    ClassMethod, X, y, nb_classes=4, min_value=None, max_value=None
+):
+    """
+    Configure ClassMethod model for feature importance analysis.
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Input data matrix where each column represents a feature
+        and each row a sample.
+    y : array-like of shape (n_samples,)
+        Target variable array.
+    Returns
+    -------
+    importance : array-like
+        Array containing importance scores for each feature.
+        Higher values indicate greater feature importance in predicting
+        the target variable.
+    Notes
+    -----
+    The function performs the following steps:
+    1. Instantiate ClassMethod
+    2. Calculates feature importance
+    """
+    y_ = classification_float(y, nb_classes, min_value, max_value)
+    # instantiate model
+    vi = ClassMethod()
+    # fit the model using the training set
+    vi.fit()
+    # calculate feature importance using the test set
+    importance = vi.importance(X, y_)
+    return np.array(importance)
+
+
+parameter_exact = [
+    ("HiDim", 150, 200, 1, 0.0, 42, 1.0, np.inf, 0.0),
+    ("HiDim with noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.0),
+    ("HiDim with correlated noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.5),
+    ("HiDim with correlated features", 150, 200, 1, 0.8, 42, 1.0, np.inf, 0.0),
+]
+
+
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    zip(*(list(zip(*parameter_exact))[1:])),
+    ids=list(zip(*parameter_exact))[0],
+)
+@pytest.mark.parametrize(
+    "ClassVI",
+    [ANOVA, MutualInformationClassification],
+    ids=["ANOVA", "MutualInformation"],
+)
+def test_linear_data_exact(data_generator, ClassVI):
+    """Tests the method on linear cases with noise and/or correlation"""
+    X, y, important_features, _ = data_generator
+
+    importance = configure_marginal_classication(ClassVI, X, y)
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that important features have the highest importance scores
+    assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]])
+
+
+parameter_bad_detection = [
+    ("HiDim with high correlated features", 150, 200, 1, 1.0, 42, 1.0, 5.0, 0.0),
+    ("HiDim multivaribale", 150, 200, 10, 0.0, 42, 1.0, np.inf, 0.0),
+    ("HiDim multivaribale noise", 150, 200, 10, 0.0, 42, 1.0, 10.0, 0.0),
+]
+
+
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    zip(*(list(zip(*parameter_bad_detection))[1:])),
+    ids=list(zip(*parameter_bad_detection))[0],
+)
+@pytest.mark.parametrize(
+    "ClassVI",
+    [ANOVA, MutualInformationClassification],
+    ids=["ANOVA", "MutualInformation"],
+)
+def test_linear_data_fail(data_generator, ClassVI):
+    """Tests the failure of the method on linear cases with correlation
+    or multiple variables of importance"""
+    X, y, important_features, _ = data_generator
+    size_support = np.sum(important_features != 0)
+
+    importance = configure_marginal_classication(ClassVI, X, y)
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that important features have the highest importance scores
+    assert np.any(
+        [
+            int(i) not in important_features
+            for i in np.argsort(importance)[-size_support:]
+        ]
+    )
+
+
+################################################################################
+# Specific test for ANOVA
+parameter_exact_ANOVA = [
+    ("HiDim with high level noise", 150, 200, 1, 0.2, 42, 1.0, 0.5, 0.0),
+]
+
+
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    zip(*(list(zip(*parameter_exact_ANOVA))[1:])),
+    ids=list(zip(*parameter_exact_ANOVA))[0],
+)
+def test_ANOVA_exact(data_generator):
+    """Tests the method on high noise"""
+    X, y, important_features, not_important_features = data_generator
+
+    importance = configure_marginal_classication(ANOVA, X, y)
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that important features have the highest importance scores
+    assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]])
+    # Check that important features have higher mean importance scores
+    assert (
+        importance[important_features].mean()
+        > importance[not_important_features][
+            np.where(importance[not_important_features] != 0)
+        ].mean()
+    )
+
+
+# Specific test for MutualInformation
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    [parameter_exact[0][1:]],
+    ids=[parameter_exact[0][0]],
+)
+@pytest.mark.parametrize(
+    "discrete_features, n_neighbors",
+    [
+        ("auto", 5),
+        (False, 5),
+    ],
+    ids=[
+        "change number of neighboor",
+        "discrete_features True",
+    ],
+)
+def test_MutualInformation_exact(data_generator, discrete_features, n_neighbors):
+    """Tests parameters of classes"""
+    X, y, important_features, _ = data_generator
+    y_ = classification_float(y, nb_classes=6, min_value=-1, max_value=1)
+
+    importance = (
+        MutualInformationClassification(
+            discrete_features=discrete_features, n_neighbors=n_neighbors
+        )
+        .fit()
+        .importance(X, y_)
+    )
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that important features have the highest importance scores
+    assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]])
+
+
+parameter_fail_MutualInformation = [
+    ("HiDim with high level noise", 150, 200, 1, 0.2, 42, 1.0, 0.5, 0.0),
+]
+
+
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    zip(*(list(zip(*parameter_fail_MutualInformation))[1:])),
+    ids=list(zip(*parameter_fail_MutualInformation))[0],
+)
+def test_MutualInformation_fail(data_generator):
+    """Tests failure of the method on high noise"""
+    X, y, important_features, _ = data_generator
+    size_support = np.sum(important_features != 0)
+
+    importance = configure_marginal_classication(MutualInformationClassification, X, y)
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that important features have the highest importance scores
+    assert np.any(
+        [
+            int(i) not in important_features
+            for i in np.argsort(importance)[-size_support:]
+        ]
+    )
+
+
+##############################################################################
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    [(150, 200, 1, 0.0, 42, 1.0, 0.0, 0.0)],
+    ids=["default data"],
+)
+@pytest.mark.parametrize(
+    "ClassVI",
+    [ANOVA, MutualInformationClassification],
+    ids=["ANOVA", "MutualInformation"],
+)
+class TestClass:
+    """Test the element of the class"""
+
+    def test_init(self, data_generator, ClassVI):
+        """Test that initialization works"""
+        classvi = ClassVI()
+
+    def test_fit(self, data_generator, ClassVI):
+        """Test that fitting does nothing"""
+        classvi = ClassVI()
+        classvi_reference = deepcopy(classvi)
+        classvi.fit()
+        for attribute_name in classvi.__dict__.keys():
+            assert classvi.__getattribute__(
+                attribute_name
+            ) == classvi_reference.__getattribute__(attribute_name)
+
+    def test_categorical(
+        self,
+        n_samples,
+        n_features,
+        support_size,
+        rho,
+        seed,
+        value,
+        signal_noise_ratio,
+        rho_serial,
+        ClassVI,
+    ):
+        """Test the fit_importance function on mixed feature types"""
+        rng = np.random.default_rng(seed)
+        X_cont = rng.random((n_samples, 2))
+        X_cat = rng.integers(low=0, high=3, size=(n_samples, 1))
+        X = np.hstack([X_cont, X_cat])
+        y = rng.integers(0, 10, (n_samples, 1))
+
+        classvi = ClassVI()
+
+        importances = classvi.fit_importance(X, y)
+        assert len(importances) == 3
+        assert np.all(importances >= 0)
+
+
+##############################################################################
+def test_error_abstract_class():
+    """Test the warning and the error of the class AdapterScikitLearn"""
+    adapter = AdapterScikitLearn()
+    with pytest.warns(Warning, match="X won't be used"):
+        adapter.fit(X=np.random.rand(10, 10))
+    with pytest.warns(Warning, match="y won't be used"):
+        adapter.fit(y=np.random.rand(10, 1))
+    with pytest.raises(NotImplementedError):
+        adapter.importance(X=np.random.rand(10, 10), y=np.random.rand(10, 1))
+    with pytest.raises(NotImplementedError):
+        adapter.fit_importance(X=np.random.rand(10, 10), y=np.random.rand(10, 1))
+    with pytest.raises(NotImplementedError):
+        with pytest.warns(Warning, match="cv won't be used"):
+            adapter.fit_importance(
+                X=np.random.rand(10, 10), y=np.random.rand(10, 1), cv="other"
+            )
+
+
+def test_error_Mutual_Information():
+    """Test error in Mutual Information"""
+    with pytest.raises(
+        AssertionError,
+        match="the value of problem type should be 'regression' or 'classification'",
+    ):
+        MutualInformation(problem_type="bad type")
+    vi = MutualInformation()
+    vi.problem_type = "bad type"
+    with pytest.raises(
+        ValueError,
+        match="the value of problem type should be 'regression' or 'classification'",
+    ):
+        vi.importance(X=np.random.rand(10, 10), y=np.random.rand(10, 1))
diff --git a/test/marginal/test_selection_variable_scikit_learn_regression.py b/test/marginal/test_selection_variable_scikit_learn_regression.py
new file mode 100644
index 000000000..9a33fb602
--- /dev/null
+++ b/test/marginal/test_selection_variable_scikit_learn_regression.py
@@ -0,0 +1,229 @@
+from copy import deepcopy
+import numpy as np
+import pytest
+
+from hidimstat import UnivariateLinearRegressionTests, MutualInformation
+
+
+def configure_marginal_regression(ClassMethod, X, y):
+    """
+    Compute feature importance with a default-configured ClassMethod.
+    Parameters
+    ----------
+    ClassMethod : class
+        Importance class; it is called without argument to build the model.
+    X : array-like of shape (n_samples, n_features)
+        Input data matrix (one column per feature, one row per sample).
+    y : array-like of shape (n_samples,)
+        Target variable array.
+    Returns
+    -------
+    importance : numpy.ndarray
+        Result of ``importance(X, y)`` converted to an array.
+    Notes
+    -----
+    The function performs the following steps:
+    1. Instantiate ClassMethod
+    2. Call ``fit()`` without data
+    3. Calculate feature importance on (X, y)
+    """
+    # instantiate model
+    vi = ClassMethod()
+    # fit is called without data: no training set is involved here
+    vi.fit()
+    # calculate feature importance on the provided data
+    importance = vi.importance(X, y)
+    return np.array(importance)
+
+
+parameter_exact = [
+    ("HiDim", 150, 200, 1, 0.0, 42, 1.0, np.inf, 0.0),
+    ("HiDim with noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.0),
+    ("HiDim with correlated noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.5),
+    ("HiDim with correlated features", 150, 200, 1, 0.8, 42, 1.0, np.inf, 0.0),
+]
+
+
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    zip(*(list(zip(*parameter_exact))[1:])),
+    ids=list(zip(*parameter_exact))[0],
+)
+@pytest.mark.parametrize(
+    "ClassVI",
+    [UnivariateLinearRegressionTests, MutualInformation],
+    ids=["UnivariateLinearRegressionTests", "MutualInformation"],
+)
+def test_linear_data_exact(data_generator, ClassVI):
+    """Tests the method on linear cases with noise and/or correlation"""
+    X, y, important_features, _ = data_generator
+
+    importance = configure_marginal_regression(ClassVI, X, y)
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that the top-ranked feature is one of the important features
+    assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]])
+
+
+parameter_bad_detection = [
+    ("HiDim with high correlated features", 150, 200, 1, 1.0, 42, 1.0, 5.0, 0.0),
+    ("HiDim multivaribale", 150, 200, 10, 0.0, 42, 1.0, np.inf, 0.0),
+    ("HiDim multivaribale noise", 150, 200, 10, 0.0, 42, 1.0, 10.0, 0.0),
+]
+
+
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    zip(*(list(zip(*parameter_bad_detection))[1:])),
+    ids=list(zip(*parameter_bad_detection))[0],
+)
+@pytest.mark.parametrize(
+    "ClassVI",
+    [UnivariateLinearRegressionTests, MutualInformation],
+    ids=["UnivariateLinearRegressionTests", "MutualInformation"],
+)
+def test_linear_data_fail(data_generator, ClassVI):
+    """Tests the failure of the method on linear cases with correlation
+    or multiple variables of importance"""
+    X, y, important_features, _ = data_generator
+    size_support = len(important_features)  # important_features holds indices
+
+    importance = configure_marginal_regression(ClassVI, X, y)
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that at least one of the top-ranked features is not important
+    assert np.any(
+        [
+            int(i) not in important_features
+            for i in np.argsort(importance)[-size_support:]
+        ]
+    )
+
+
+##############################################################################
+# Specific test for UnivariateLinearRegressionTests
+parameter_exact_UnivariateLinearRegressionTests = [
+    ("HiDim with high level noise", 150, 200, 1, 0.2, 42, 1.0, 0.5, 0.0),
+]
+
+
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    zip(*(list(zip(*parameter_exact_UnivariateLinearRegressionTests))[1:])),
+    ids=list(zip(*parameter_exact_UnivariateLinearRegressionTests))[0],
+)
+@pytest.mark.parametrize(
+    "center, force_finite",
+    [
+        (True, True),
+        (False, True),
+        (True, False),
+    ],
+    ids=["default", "no center", "no force finite"],
+)
+def test_UnivariateLinearRegressionTests_exact(data_generator, center, force_finite):
+    """Tests UnivariateLinearRegressionTests with center / force_finite options"""
+    X, y, important_features, not_important_features = data_generator
+
+    importance = (
+        UnivariateLinearRegressionTests(center=center, force_finite=force_finite)
+        .fit()
+        .importance(X, y)
+    )
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that the top-ranked feature is one of the important features
+    assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]])
+    # important features must have a higher mean than non-zero non-important scores
+    assert (
+        importance[important_features].mean()
+        > importance[not_important_features][
+            np.where(importance[not_important_features] != 0)
+        ].mean()
+    )
+
+
+# Specific test for MutualInformation
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    [parameter_exact[0][1:]],
+    ids=[parameter_exact[0][0]],
+)
+@pytest.mark.parametrize(
+    "discrete_features, n_neighbors",
+    [
+        ("auto", 5),
+        (False, 3),
+    ],
+    ids=["change number of neighboor", "discrete_features False"],
+)
+def test_MutualInformation_exact(data_generator, discrete_features, n_neighbors):
+    """Tests MutualInformation with explicit discrete_features / n_neighbors"""
+    X, y, important_features, _ = data_generator
+    size_support = len(important_features)  # important_features holds indices
+
+    importance = (
+        MutualInformation(discrete_features=discrete_features, n_neighbors=n_neighbors)
+        .fit()
+        .importance(X, y)
+    )
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that the top-ranked features are all important features
+    assert np.all(
+        [int(i) in important_features for i in np.argsort(importance)[-size_support:]]
+    )
+
+
+##############################################################################
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    [(150, 200, 1, 0.0, 42, 1.0, 0.0, 0.0)],
+    ids=["default data"],
+)
+@pytest.mark.parametrize(
+    "ClassVI",
+    [UnivariateLinearRegressionTests, MutualInformation],
+    ids=["UnivariateLinearRegressionTests", "MutualInformation"],
+)
+class TestClass:
+    """Test the elements of the class"""
+
+    def test_init(self, data_generator, ClassVI):
+        """Test that the initialization works"""
+        classvi = ClassVI()
+
+    def test_fit(self, data_generator, ClassVI):
+        """Test that fitting does not change any attribute"""
+        classvi = ClassVI()
+        classvi_reference = deepcopy(classvi)
+        classvi.fit()
+        for attribute_name in classvi.__dict__.keys():
+            assert classvi.__getattribute__(
+                attribute_name
+            ) == classvi_reference.__getattribute__(attribute_name)
+
+    def test_categorical(
+        self,
+        n_samples,
+        n_features,
+        support_size,
+        rho,
+        seed,
+        value,
+        signal_noise_ratio,
+        rho_serial,
+        ClassVI,
+    ):
+        """Test the fit_importance function on mixed types of features"""
+        rng = np.random.default_rng(seed)
+        X_cont = rng.random((n_samples, 2))
+        X_cat = rng.integers(low=0, high=3, size=(n_samples, 1))
+        X = np.hstack([X_cont, X_cat])
+        y = rng.random((n_samples, 1))
+
+        classvi = ClassVI()
+
+        importances = classvi.fit_importance(X, y)
+        assert len(importances) == 3
+        assert np.all(importances >= 0)
\ No newline at end of file

From d85c37343a0ea26770e87bedf192a26760b95f4e Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Wed, 27 Aug 2025 19:24:06 +0200
Subject: [PATCH 2/4] fix format

---
 src/hidimstat/marginal/__init__.py                              | 2 +-
 .../marginal/test_selection_variable_scikit_learn_regression.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hidimstat/marginal/__init__.py b/src/hidimstat/marginal/__init__.py
index afd1e43e4..47a788fdc 100644
--- a/src/hidimstat/marginal/__init__.py
+++ b/src/hidimstat/marginal/__init__.py
@@ -10,4 +10,4 @@
     "AnalysisOfVariance",
     "UnivariateLinearRegressionTests",
     "MutualInformation",
-]
\ No newline at end of file
+]
diff --git a/test/marginal/test_selection_variable_scikit_learn_regression.py b/test/marginal/test_selection_variable_scikit_learn_regression.py
index 9a33fb602..e1dfe4fde 100644
--- a/test/marginal/test_selection_variable_scikit_learn_regression.py
+++ b/test/marginal/test_selection_variable_scikit_learn_regression.py
@@ -226,4 +226,4 @@ def test_categorical(
 
         importances = classvi.fit_importance(X, y)
         assert len(importances) == 3
-        assert np.all(importances >= 0)
\ No newline at end of file
+        assert np.all(importances >= 0)

From 9b03055baded9cc9d8ac2fb208aab09381c7814a Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Fri, 29 Aug 2025 12:39:40 +0200
Subject: [PATCH 3/4] change the importance to f-statistic

---
 src/hidimstat/marginal/selection_variable_scikit_learn.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/hidimstat/marginal/selection_variable_scikit_learn.py b/src/hidimstat/marginal/selection_variable_scikit_learn.py
index 6be5a645a..c89a95edd 100644
--- a/src/hidimstat/marginal/selection_variable_scikit_learn.py
+++ b/src/hidimstat/marginal/selection_variable_scikit_learn.py
@@ -134,9 +134,8 @@ def importance(self, X, y):
         f_statistic, p_values = f_classif(X, y)
         # Test the opposite hypothese to the anova
         # Test the similarity in the distribution instead of the difference
-        self.importances_ = 1 - p_values
+        self.importances_ = f_statistic
         self.pvalues_ = 1 - p_values
-        self.f_statitstic_ = f_statistic
         return self.importances_
 
 
@@ -198,9 +197,8 @@ def importance(self, X, y):
         )
         # Test the opposite hypothese to the anova
         # Test the similarity in the distribution instead of the difference
-        self.importances_ = 1 - p_values
+        self.importances_ = f_statistic
         self.pvalues_ = 1 - p_values
-        self.f_statitstic_ = f_statistic
         return self.importances_
 
 

From cc2a7412090936db3b0a15f3a5dd5666fb2a9ba3 Mon Sep 17 00:00:00 2001
From: kusch lionel <lionel.a.kusch@inria.fr>
Date: Tue, 2 Sep 2025 11:39:31 +0200
Subject: [PATCH 4/4] fix p-value

---
 src/hidimstat/marginal/selection_variable_scikit_learn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hidimstat/marginal/selection_variable_scikit_learn.py b/src/hidimstat/marginal/selection_variable_scikit_learn.py
index c89a95edd..3ce705ca0 100644
--- a/src/hidimstat/marginal/selection_variable_scikit_learn.py
+++ b/src/hidimstat/marginal/selection_variable_scikit_learn.py
@@ -135,7 +135,7 @@ def importance(self, X, y):
         # Test the opposite hypothese to the anova
         # Test the similarity in the distribution instead of the difference
         self.importances_ = f_statistic
-        self.pvalues_ = 1 - p_values
+        self.pvalues_ = p_values
         return self.importances_
 
 
@@ -198,7 +198,7 @@ def importance(self, X, y):
         # Test the opposite hypothese to the anova
         # Test the similarity in the distribution instead of the difference
         self.importances_ = f_statistic
-        self.pvalues_ = 1 - p_values
+        self.pvalues_ = p_values
         return self.importances_