From d4caef58faf87edaf7684d69a3cec2cc6e504a3e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 27 Aug 2025 19:18:23 +0200 Subject: [PATCH 1/4] New marginal method base on scikitlearn --- docs/src/api.rst | 11 + docs/tools/references.bib | 47 +++ src/hidimstat/__init__.py | 13 + src/hidimstat/marginal/__init__.py | 13 + .../selection_variable_scikit_learn.py | 302 ++++++++++++++++ ...on_variable_scikit_learn_classification.py | 342 ++++++++++++++++++ ...ection_variable_scikit_learn_regression.py | 229 ++++++++++++ 7 files changed, 957 insertions(+) create mode 100644 src/hidimstat/marginal/__init__.py create mode 100644 src/hidimstat/marginal/selection_variable_scikit_learn.py create mode 100644 test/marginal/test_selection_variable_scikit_learn_classification.py create mode 100644 test/marginal/test_selection_variable_scikit_learn_regression.py diff --git a/docs/src/api.rst b/docs/src/api.rst index 51074f1dc..77d82cdb1 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -40,3 +40,14 @@ Classes CFI PFI D0CRT + +Marginal Importance +=================== +.. autosummary:: + :toctree: ./generated/api/marginal + :template: class.rst + + ANOVA + AnalysisOfVariance + UnivariateLinearRegressionTests + MutualInformation \ No newline at end of file diff --git a/docs/tools/references.bib b/docs/tools/references.bib index fe04dc12f..092f5aa68 100644 --- a/docs/tools/references.bib +++ b/docs/tools/references.bib @@ -135,6 +135,22 @@ @article{chevalier_statistical_2020 year = {2020} } +@article{chow1960tests, + title = {Tests of equality between sets of coefficients in two linear regressions}, + author = {Chow, Gregory C}, + journal = {Econometrica: Journal of the Econometric Society}, + pages = {591--605}, + year = {1960}, + publisher = {JSTOR} +} + +@book{cover1999elements, + title = {Elements of information theory}, + author = {Cover, Thomas M}, + year = {1999}, + publisher = {John Wiley \& Sons} +} + @article{eshel2003yule, author = {Eshel, Gidon}, journal = {Internet resource}, @@ -155,6 +171,15 @@ @article{fan2012variance year = {2012} } +@incollection{fisher1970statistical, + title = {Statistical methods for research workers}, + author = {Fisher, Ronald Aylmer}, + booktitle = {Breakthroughs in statistics: Methodology and distribution}, + pages = {66--70}, + year = {1970}, + publisher = {Springer} +} + @article{gaonkar_deriving_2012, author = {Gaonkar, Bilwaj and Davatzikos, Christos}, journal = {International Conference on Medical Image Computing and Computer-Assisted Intervention}, @@ -180,6 +205,17 @@ @article{hirschhorn2005genome year = {2005} } +@article{larson2008analysis, + title = {Analysis of variance}, + author = {Larson, Martin G}, + journal = {Circulation}, + volume = {117}, + number = {1}, + pages = {115--121}, + year = {2008}, + publisher = {Lippincott Williams \& Wilkins} +} + @article{lei2018distribution, title = {Distribution-free predictive inference for regression}, author = {Lei, Jing and G'Sell, Max and Rinaldo, Alessandro and Tibshirani, Ryan J and Wasserman, Larry}, @@ -269,6 +305,17 @@ @article{Ren_2023 year = {2023} } +@article{shannon1948mathematical, + title = {A mathematical theory of communication}, + author = {Shannon, Claude E}, + journal = {The Bell system technical journal}, + volume = {27}, + number = {3}, + pages = {379--423}, + year = {1948}, + publisher = {Nokia Bell Labs} +} + @article{stroblConditionalVariableImportance2008, author = {Strobl, Carolin and Boulesteix, Anne-Laure and Kneib, Thomas and Augustin, Thomas and Zeileis, Achim}, doi = {10.1186/1471-2105-9-307}, diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index 81d5a0cce..b57abeb77 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -27,6 +27,15 @@ from .statistical_tools.aggregation import quantile_aggregation +# marginal methods +from .marginal import ( + AdapterScikitLearn, # for documentation + AnalysisOfVariance, # for documentation + UnivariateLinearRegressionTests, + MutualInformation, +) +from .marginal import AnalysisOfVariance as ANOVA + try: from ._version import __version__ except ImportError: @@ -51,4 +60,8 @@ "CFI", "LOCO", "PFI", + # marginal + "ANOVA", + "UnivariateLinearRegressionTests", + "MutualInformation", ] diff --git a/src/hidimstat/marginal/__init__.py b/src/hidimstat/marginal/__init__.py new file mode 100644 index 000000000..afd1e43e4 --- /dev/null +++ b/src/hidimstat/marginal/__init__.py @@ -0,0 +1,13 @@ +from .selection_variable_scikit_learn import ( + AdapterScikitLearn, + AnalysisOfVariance, + UnivariateLinearRegressionTests, + MutualInformation, +) + +__all__ = [ + "AdapterScikitLearn", + "AnalysisOfVariance", + "UnivariateLinearRegressionTests", + "MutualInformation", +] \ No newline at end of file diff --git a/src/hidimstat/marginal/selection_variable_scikit_learn.py b/src/hidimstat/marginal/selection_variable_scikit_learn.py new file mode 100644 index 000000000..6be5a645a --- /dev/null +++ b/src/hidimstat/marginal/selection_variable_scikit_learn.py @@ -0,0 +1,302 @@ +from typing import override +import warnings + +from sklearn.feature_selection import ( + f_classif, + f_regression, + mutual_info_classif, + mutual_info_regression, +) + +from hidimstat.base_variable_importance import BaseVariableImportance + + +class AdapterScikitLearn(BaseVariableImportance): + """ + Adapter base class for scikit-learn feature selection methods. + This class provides a unified interface for scikit-learn feature selection + methods to be used within the hidimstat framework. + Notes + ----- + Subclasses should implement the `importance` methods. + """ + + def fit(self, X=None, y=None): + """ + Fit the feature selection model to the data. + This method does nothing because fitting is not required for these + scikit-learn feature selection methods. + Parameters + ---------- + X : array-like of shape (n_samples, n_features), optional + (not used) Input data matrix. + y : array-like of shape (n_samples,), optional + (not used) Target values. + Returns + ------- + self : object + Returns self. + """ + if X is not None: + warnings.warn("X won't be used") + if y is not None: + warnings.warn("y won't be used") + return self + + def importance(self, X, y): + """ + Return the computed feature importances. + This method should be implemented by subclasses to compute feature + importances for the given data. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data matrix. + y : array-like of shape (n_samples,) + Target values. + Raises + ------ + NotImplementedError + This method should be implemented by subclasses. + """ + raise NotImplementedError() + + def fit_importance(self, X, y, cv=None): + """ + Fit the model and compute feature importances. + This method fits the model (if necessary) and computes feature + importances for the given data. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data matrix. + y : array-like of shape (n_samples,) + Target values. + cv : None or int, optional + (not used) Cross-validation parameter. + Returns + ------- + importances_ : ndarray + Feature importance scores. + """ + if cv is not None: + warnings.warn("cv won't be used") + self.fit() + return self.importance(X, y) + + +class AnalysisOfVariance(AdapterScikitLearn): + """ + Analysis of Variance (ANOVA) :footcite:t:`fisher1970statistical` feature + selection for classification tasks. + This class uses scikit-learn's f_classif to compute F-statistics and p-values + for each feature. For a short summary of this method, see + :footcite:t:`larson2008analysis`. + Attributes + ---------- + importances_ : ndarray + 1 - p-values for each feature (higher is more important). + pvalues_ : ndarray + 1 - p-values for each feature. + f_statitstic_ : ndarray + F-statistics for each feature. + Notes + ----- + See sklearn.feature_selection.f_classif + """ + + def __init__(self): + super().__init__() + + @override + def importance(self, X, y): + """ + Compute ANOVA F-statistics and p-values for each feature. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data matrix. + y : array-like of shape (n_samples,) + Target class labels. + Sets + ---- + importances_ : ndarray + 1 - p-values for each feature. + pvalues_ : ndarray + 1 - p-values for each feature. + f_statitstic_ : ndarray + F-statistics for each feature. + Returns + ------- + importances_ : ndarray + 1 - p-values for each feature. + """ + f_statistic, p_values = f_classif(X, y) + # Test the opposite hypothese to the anova + # Test the similarity in the distribution instead of the difference + self.importances_ = 1 - p_values + self.pvalues_ = 1 - p_values + self.f_statitstic_ = f_statistic + return self.importances_ + + +class UnivariateLinearRegressionTests(AdapterScikitLearn): + """ + Univariate linear regression F-test for regression tasks. + This test is also known as the Chow test :footcite:t:`chow1960tests`. + Uses scikit-learn's f_regression to compute F-statistics and p-values for each feature. + Parameters + ---------- + center : bool, default=True + If True, center the data before computing F-statistics. + force_finite : bool, default=True + If True, replace NaNs and infs in the output with finite numbers. + Attributes + ---------- + importances_ : ndarray + 1 - p-values for each feature. + pvalues_ : ndarray + 1 - p-values for each feature. + f_statitstic_ : ndarray + F-statistics for each feature. + Notes + ----- + See sklearn.feature_selection.f_regression + """ + + def __init__(self, center=True, force_finite=True): + + super().__init__() + self.center = center + self.force_finite = force_finite + + @override + def importance(self, X, y): + """ + Compute univariate linear regression F-statistics and p-values for each feature. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data matrix. + y : array-like of shape (n_samples,) + Target values. + Sets + ---- + importances_ : ndarray + 1 - p-values for each feature. + pvalues_ : ndarray + 1 - p-values for each feature. + f_statitstic_ : ndarray + F-statistics for each feature. + Returns + ------- + importances_ : ndarray + 1 - p-values for each feature. + """ + f_statistic, p_values = f_regression( + X, y, center=self.center, force_finite=self.force_finite + ) + # Test the opposite hypothese to the anova + # Test the similarity in the distribution instead of the difference + self.importances_ = 1 - p_values + self.pvalues_ = 1 - p_values + self.f_statitstic_ = f_statistic + return self.importances_ + + +class MutualInformation(AdapterScikitLearn): + """ + Mutual information feature selection for regression or classification. + This method was introduced by Shannon :footcite:t:`shannon1948mathematical`. + For an introduction, see section 2.4 of :footcite:t:`cover1999elements`. + Parameters + ---------- + problem_type : {'regression', 'classification'}, default='regression' + Type of prediction problem. + discrete_features : 'auto' or array-like, default='auto' + Indicates which features are discrete. + n_neighbors : int, default=3 + Number of neighbors to use for MI estimation. + random_state : int, default=None + Random seed for reproducibility. + n_jobs : int, default=1 + Number of parallel jobs. + Attributes + ---------- + importances_ : ndarray + Mutual information scores for each feature. + pvalues_ : None + P-values are not computed for mutual information. + Notes + ----- + See sklearn.feature_selection.mutual_info_regression + See sklearn.feature_selection.mutual_info_classification + """ + + def __init__( + self, + problem_type="regression", + discrete_features="auto", + n_neighbors=3, + random_state=None, + n_jobs=1, + ): + super().__init__() + assert ( + problem_type == "regression" or problem_type == "classification" + ), "the value of problem type should be 'regression' or 'classification'" + self.problem_type = problem_type + self.discrete_features = discrete_features + self.n_neighbors = n_neighbors + self.random_state = random_state + self.n_jobs = n_jobs + + @override + def importance(self, X, y): + """ + Compute mutual information scores for each feature. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data matrix. + y : array-like of shape (n_samples,) + Target values. + Sets + ---- + importances_ : ndarray + Mutual information scores for each feature. + pvalues_ : None + P-values are not computed for mutual information. + Returns + ------- + importances_ : ndarray + Mutual information scores for each feature. + """ + if self.problem_type == "regression": + mutual_information = mutual_info_regression( + X, + y, + discrete_features=self.discrete_features, + n_neighbors=self.n_neighbors, + copy=True, + random_state=self.random_state, + n_jobs=self.n_jobs, + ) + elif self.problem_type == "classification": + mutual_information = mutual_info_classif( + X, + y, + discrete_features=self.discrete_features, + n_neighbors=self.n_neighbors, + copy=True, + random_state=self.random_state, + n_jobs=self.n_jobs, + ) + else: + raise ValueError( + "the value of problem type should be 'regression' or 'classification'" + ) + self.importances_ = mutual_information + self.pvalues_ = None + + return self.importances_ diff --git a/test/marginal/test_selection_variable_scikit_learn_classification.py b/test/marginal/test_selection_variable_scikit_learn_classification.py new file mode 100644 index 000000000..930829f3d --- /dev/null +++ b/test/marginal/test_selection_variable_scikit_learn_classification.py @@ -0,0 +1,342 @@ +from copy import deepcopy +import numpy as np +import pytest + +from hidimstat import ANOVA, MutualInformation, AdapterScikitLearn + + +class MutualInformationClassification(MutualInformation): + """Specify the class for classification problem""" + + def __init__( + self, + discrete_features="auto", + n_neighbors=3, + random_state=None, + n_jobs=1, + ): + super().__init__( + problem_type="classification", + discrete_features=discrete_features, + n_neighbors=n_neighbors, + random_state=random_state, + n_jobs=n_jobs, + ) + + +def classification_float(y, nb_classes=4, min_value=None, max_value=None): + """ + Create classification problem bae on regression problem + Parameters + ---------- + y : array-like of shape (n_samples,) + target for the prediction + nb_classes : int, default=4 + number of classes + min_value : None or int, default=None + maximal value for the first class + max_value : None or int, default=None + minimal value for the last class + Returns + ------- + array-like of shape (n_samples,) + classification for target + """ + assert nb_classes >= 2 + if min_value is None: + min_value = np.min(y) + if max_value is None: + max_value = np.max(y) + assert min_value < max_value + # change from regression to classification problem + y_ = deepcopy(y) + previous_value = min_value + for classe, value in enumerate(np.linspace(min_value, max_value, nb_classes)): + if value == min_value: + y_[np.where(y < min_value)] = classe + elif value == max_value: + y_[np.where(y >= max_value)] = classe + else: + y_[np.where(np.logical_and(previous_value <= y, y < value))] = classe + previous_value = value + y_ = np.array(y_, dtype=int) + return y_ + + +def configure_marginal_classication( + ClassMethod, X, y, nb_classes=4, min_value=None, max_value=None +): + """ + Configure ClassMethod model for feature importance analysis. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data matrix where each column represents a feature + and each row a sample. + y : array-like of shape (n_samples,) + Target variable array. + Returns + ------- + importance : array-like + Array containing importance scores for each feature. + Higher values indicate greater feature importance in predicting + the target variable. + Notes + ----- + The function performs the following steps: + 1. Intanciate ClassMethod + 2. Calculates feature importance + """ + y_ = classification_float(y, nb_classes, min_value, max_value) + # instantiate model + vi = ClassMethod() + # fit the model using the training set + vi.fit() + # calculate feature importance using the test set + importance = vi.importance(X, y_) + return np.array(importance) + + +parameter_exact = [ + ("HiDim", 150, 200, 1, 0.0, 42, 1.0, np.inf, 0.0), + ("HiDim with noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.0), + ("HiDim with correlated noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.5), + ("HiDim with correlated features", 150, 200, 1, 0.8, 42, 1.0, np.inf, 0.0), +] + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_exact))[1:])), + ids=list(zip(*parameter_exact))[0], +) +@pytest.mark.parametrize( + "ClassVI", + [ANOVA, MutualInformationClassification], + ids=["ANOVA", "MutualInformation"], +) +def test_linear_data_exact(data_generator, ClassVI): + """Tests the method on linear cases with noise and/or correlation""" + X, y, important_features, _ = data_generator + + importance = configure_marginal_classication(ClassVI, X, y) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]]) + + +parameter_bad_detection = [ + ("HiDim with high correlated features", 150, 200, 1, 1.0, 42, 1.0, 5.0, 0.0), + ("HiDim multivaribale", 150, 200, 10, 0.0, 42, 1.0, np.inf, 0.0), + ("HiDim multivaribale noise", 150, 200, 10, 0.0, 42, 1.0, 10.0, 0.0), +] + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_bad_detection))[1:])), + ids=list(zip(*parameter_bad_detection))[0], +) +@pytest.mark.parametrize( + "ClassVI", + [ANOVA, MutualInformationClassification], + ids=["ANOVA", "MutualInformation"], +) +def test_linear_data_fail(data_generator, ClassVI): + """Tests the faillure of the method on linear cases with correlation + or multiple variable of importance""" + X, y, important_features, _ = data_generator + size_support = np.sum(important_features != 0) + + importance = configure_marginal_classication(ClassVI, X, y) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.any( + [ + int(i) not in important_features + for i in np.argsort(importance)[-size_support:] + ] + ) + + +################################################################################ +# Specific test for ANOVA +parameter_exact_ANOVA = [ + ("HiDim with high level noise", 150, 200, 1, 0.2, 42, 1.0, 0.5, 0.0), +] + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_exact_ANOVA))[1:])), + ids=list(zip(*parameter_exact_ANOVA))[0], +) +def test_ANOVA_exact(data_generator): + """Tests the method on high noise""" + X, y, important_features, not_important_features = data_generator + + importance = configure_marginal_classication(ANOVA, X, y) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]]) + # Check that important features have higher mean importance scores + assert ( + importance[important_features].mean() + > importance[not_important_features][ + np.where(importance[not_important_features] != 0) + ].mean() + ) + + +# Spefic test for MutualInformation +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + [parameter_exact[0][1:]], + ids=[parameter_exact[0][0]], +) +@pytest.mark.parametrize( + "discrete_features, n_neighbors", + [ + ("auto", 5), + (False, 5), + ], + ids=[ + "change number of neighboor", + "discrete_features True", + ], +) +def test_MutualInformation_exact(data_generator, discrete_features, n_neighbors): + """Tests parameters of classes""" + X, y, important_features, _ = data_generator + y_ = classification_float(y, nb_classes=6, min_value=-1, max_value=1) + + importance = ( + MutualInformationClassification( + discrete_features=discrete_features, n_neighbors=n_neighbors + ) + .fit() + .importance(X, y_) + ) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]]) + + +parameter_fail_MutualInformation = [ + ("HiDim with high level noise", 150, 200, 1, 0.2, 42, 1.0, 0.5, 0.0), +] + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_fail_MutualInformation))[1:])), + ids=list(zip(*parameter_fail_MutualInformation))[0], +) +def test_MutualInformation_fail(data_generator): + """Tests faillure of the method on high noise""" + X, y, important_features, _ = data_generator + size_support = np.sum(important_features != 0) + + importance = configure_marginal_classication(MutualInformationClassification, X, y) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.any( + [ + int(i) not in important_features + for i in np.argsort(importance)[-size_support:] + ] + ) + + +############################################################################## +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + [(150, 200, 1, 0.0, 42, 1.0, 0.0, 0.0)], + ids=["default data"], +) +@pytest.mark.parametrize( + "ClassVI", + [ANOVA, MutualInformationClassification], + ids=["ANOVA", "MutualInformation"], +) +class TestClass: + """Test the element of the class""" + + def test_init(self, data_generator, ClassVI): + """Test initialization work""" + classvi = ClassVI() + + def test_fit(self, data_generator, ClassVI): + """Test fitting is doing nothing""" + classvi = ClassVI() + classvi_reference = deepcopy(classvi) + classvi.fit() + for attribute_name in classvi.__dict__.keys(): + assert classvi.__getattribute__( + attribute_name + ) == classvi_reference.__getattribute__(attribute_name) + + def test_categorical( + self, + n_samples, + n_features, + support_size, + rho, + seed, + value, + signal_noise_ratio, + rho_serial, + ClassVI, + ): + """Test the fit_importance function on mix type of feature""" + rng = np.random.default_rng(seed) + X_cont = rng.random((n_samples, 2)) + X_cat = rng.integers(low=0, high=3, size=(n_samples, 1)) + X = np.hstack([X_cont, X_cat]) + y = rng.integers(0, 10, (n_samples, 1)) + + classvi = ClassVI() + + importances = classvi.fit_importance(X, y) + assert len(importances) == 3 + assert np.all(importances >= 0) + + +############################################################################## +def test_error_abstract_class(): + """Test the warning and the error of the class AdapterScikitLearn""" + adapter = AdapterScikitLearn() + with pytest.warns(Warning, match="X won't be used"): + adapter.fit(X=np.random.rand(10, 10)) + with pytest.warns(Warning, match="y won't be used"): + adapter.fit(y=np.random.rand(10, 1)) + with pytest.raises(NotImplementedError): + adapter.importance(X=np.random.rand(10, 10), y=np.random.rand(10, 1)) + with pytest.raises(NotImplementedError): + adapter.fit_importance(X=np.random.rand(10, 10), y=np.random.rand(10, 1)) + with pytest.raises(NotImplementedError): + with pytest.warns(Warning, match="cv won't be used"): + adapter.fit_importance( + X=np.random.rand(10, 10), y=np.random.rand(10, 1), cv="other" + ) + + +def test_error_Mutual_Information(): + """Test error in Mutual Information""" + with pytest.raises( + AssertionError, + match="the value of problem type should be 'regression' or 'classification'", + ): + MutualInformation(problem_type="bad type") + vi = MutualInformation() + vi.problem_type = "bad type" + with pytest.raises( + ValueError, + match="the value of problem type should be 'regression' or 'classification'", + ): + vi.importance(X=np.random.rand(10, 10), y=np.random.rand(10, 1)) diff --git a/test/marginal/test_selection_variable_scikit_learn_regression.py b/test/marginal/test_selection_variable_scikit_learn_regression.py new file mode 100644 index 000000000..9a33fb602 --- /dev/null +++ b/test/marginal/test_selection_variable_scikit_learn_regression.py @@ -0,0 +1,229 @@ +from copy import deepcopy +import numpy as np +import pytest + +from hidimstat import UnivariateLinearRegressionTests, MutualInformation + + +def configure_marginal_regression(ClassMethod, X, y): + """ + Configure ClassMethod model for feature importance analysis. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data matrix where each column represents a feature + and each row a sample. + y : array-like of shape (n_samples,) + Target variable array. + Returns + ------- + importance : array-like + Array containing importance scores for each feature. + Higher values indicate greater feature importance in predicting + the target variable. + Notes + ----- + The function performs the following steps: + 1. Intanciate ClassMethod + 2. Calculates feature importance + """ + # instantiate model + vi = ClassMethod() + # fit the model using the training set + vi.fit() + # calculate feature importance using the test set + importance = vi.importance(X, y) + return np.array(importance) + + +parameter_exact = [ + ("HiDim", 150, 200, 1, 0.0, 42, 1.0, np.inf, 0.0), + ("HiDim with noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.0), + ("HiDim with correlated noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.5), + ("HiDim with correlated features", 150, 200, 1, 0.8, 42, 1.0, np.inf, 0.0), +] + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_exact))[1:])), + ids=list(zip(*parameter_exact))[0], +) +@pytest.mark.parametrize( + "ClassVI", + [UnivariateLinearRegressionTests, MutualInformation], + ids=["UnivariateLinearRegressionTests", "MutualInformation"], +) +def test_linear_data_exact(data_generator, ClassVI): + """Tests the method on linear cases with noise and/or correlation""" + X, y, important_features, _ = data_generator + + importance = configure_marginal_regression(ClassVI, X, y) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]]) + + +parameter_bad_detection = [ + ("HiDim with high correlated features", 150, 200, 1, 1.0, 42, 1.0, 5.0, 0.0), + ("HiDim multivaribale", 150, 200, 10, 0.0, 42, 1.0, np.inf, 0.0), + ("HiDim multivaribale noise", 150, 200, 10, 0.0, 42, 1.0, 10.0, 0.0), +] + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_bad_detection))[1:])), + ids=list(zip(*parameter_bad_detection))[0], +) +@pytest.mark.parametrize( + "ClassVI", + [UnivariateLinearRegressionTests, MutualInformation], + ids=["UnivariateLinearRegressionTests", "MutualInformation"], +) +def test_linear_data_fail(data_generator, ClassVI): + """Tests the faillure of the method on linear cases with correlation + or multiple variable of importance""" + X, y, important_features, _ = data_generator + size_support = np.sum(important_features != 0) + + importance = configure_marginal_regression(ClassVI, X, y) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.any( + [ + int(i) not in important_features + for i in np.argsort(importance)[-size_support:] + ] + ) + + +############################################################################## +# Spefic test for UnivariateLinearRegressionTests +parameter_exact_UnivariateLinearRegressionTests = [ + ("HiDim with high level noise", 150, 200, 1, 0.2, 42, 1.0, 0.5, 0.0), +] + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_exact_UnivariateLinearRegressionTests))[1:])), + ids=list(zip(*parameter_exact_UnivariateLinearRegressionTests))[0], +) +@pytest.mark.parametrize( + "center, force_finite", + [ + (True, True), + (False, True), + (True, False), + ], + ids=["default", "no center", "no force finite"], +) +def test_UnivariateLinearRegressionTests_exact(data_generator, center, force_finite): + """Tests parameters of classes""" + X, y, important_features, not_important_features = data_generator + + importance = ( + UnivariateLinearRegressionTests(center=center, force_finite=force_finite) + .fit() + .importance(X, y) + ) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]]) + # Check that important features have higher mean importance scores + assert ( + importance[important_features].mean() + > importance[not_important_features][ + np.where(importance[not_important_features] != 0) + ].mean() + ) + + +# Spefic test for MutualInformation +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + [parameter_exact[0][1:]], + ids=[parameter_exact[0][0]], +) +@pytest.mark.parametrize( + "discrete_features, n_neighbors", + [ + ("auto", 5), + (False, 3), + ], + ids=["change number of neighboor", "discrete_features False"], +) +def test_MutualInformation_exact(data_generator, discrete_features, n_neighbors): + """Tests parameters of classes""" + X, y, important_features, _ = data_generator + size_support = np.sum(important_features != 0) + + importance = ( + MutualInformation(discrete_features=discrete_features, n_neighbors=n_neighbors) + .fit() + .importance(X, y) + ) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.all( + [int(i) in important_features for i in np.argsort(importance)[-size_support:]] + ) + + +############################################################################## +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + [(150, 200, 1, 0.0, 42, 1.0, 0.0, 0.0)], + ids=["default data"], +) +@pytest.mark.parametrize( + "ClassVI", + [UnivariateLinearRegressionTests, MutualInformation], + ids=["UnivariateLinearRegressionTests", "MutualInformation"], +) +class TestClass: + """Test the element of the class""" + + def test_init(self, data_generator, ClassVI): + """Test initialization work""" + classvi = ClassVI() + + def test_fit(self, data_generator, ClassVI): + """Test fitting is doing nothing""" + classvi = ClassVI() + classvi_reference = deepcopy(classvi) + classvi.fit() + for attribute_name in classvi.__dict__.keys(): + assert classvi.__getattribute__( + attribute_name + ) == classvi_reference.__getattribute__(attribute_name) + + def test_categorical( + self, + n_samples, + n_features, + support_size, + rho, + seed, + value, + signal_noise_ratio, + rho_serial, + ClassVI, + ): + """Test the fit_importance function on mix type of feature""" + rng = np.random.default_rng(seed) + X_cont = rng.random((n_samples, 2)) + X_cat = rng.integers(low=0, high=3, size=(n_samples, 1)) + X = np.hstack([X_cont, X_cat]) + y = rng.random((n_samples, 1)) + + classvi = ClassVI() + + importances = classvi.fit_importance(X, y) + assert len(importances) == 3 + assert np.all(importances >= 0) \ No newline at end of file From d85c37343a0ea26770e87bedf192a26760b95f4e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 27 Aug 2025 19:24:06 +0200 Subject: [PATCH 2/4] fix format --- src/hidimstat/marginal/__init__.py | 2 +- .../marginal/test_selection_variable_scikit_learn_regression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/marginal/__init__.py b/src/hidimstat/marginal/__init__.py index afd1e43e4..47a788fdc 100644 --- a/src/hidimstat/marginal/__init__.py +++ b/src/hidimstat/marginal/__init__.py @@ -10,4 +10,4 @@ "AnalysisOfVariance", "UnivariateLinearRegressionTests", "MutualInformation", -] \ No newline at end of file +] diff --git a/test/marginal/test_selection_variable_scikit_learn_regression.py b/test/marginal/test_selection_variable_scikit_learn_regression.py index 9a33fb602..e1dfe4fde 100644 --- a/test/marginal/test_selection_variable_scikit_learn_regression.py +++ b/test/marginal/test_selection_variable_scikit_learn_regression.py @@ -226,4 +226,4 @@ def test_categorical( importances = classvi.fit_importance(X, y) assert len(importances) == 3 - assert np.all(importances >= 0) \ No newline at end of file + assert np.all(importances >= 0) From 9b03055baded9cc9d8ac2fb208aab09381c7814a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 12:39:40 +0200 Subject: [PATCH 3/4] change the importance to f-statistic --- src/hidimstat/marginal/selection_variable_scikit_learn.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/hidimstat/marginal/selection_variable_scikit_learn.py b/src/hidimstat/marginal/selection_variable_scikit_learn.py index 6be5a645a..c89a95edd 100644 --- a/src/hidimstat/marginal/selection_variable_scikit_learn.py +++ b/src/hidimstat/marginal/selection_variable_scikit_learn.py @@ -134,9 +134,8 @@ def importance(self, X, y): f_statistic, p_values = f_classif(X, y) # Test the opposite hypothese to the anova # Test the similarity in the distribution instead of the difference - self.importances_ = 1 - p_values + self.importances_ = f_statistic self.pvalues_ = 1 - p_values - self.f_statitstic_ = f_statistic return self.importances_ @@ -198,9 +197,8 @@ def importance(self, X, y): ) # Test the opposite hypothese to the anova # Test the similarity in the distribution instead of the difference - self.importances_ = 1 - p_values + self.importances_ = f_statistic self.pvalues_ = 1 - p_values - self.f_statitstic_ = f_statistic return self.importances_ From cc2a7412090936db3b0a15f3a5dd5666fb2a9ba3 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 11:39:31 +0200 Subject: [PATCH 4/4] fix p-value --- src/hidimstat/marginal/selection_variable_scikit_learn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/marginal/selection_variable_scikit_learn.py b/src/hidimstat/marginal/selection_variable_scikit_learn.py index c89a95edd..3ce705ca0 100644 --- a/src/hidimstat/marginal/selection_variable_scikit_learn.py +++ b/src/hidimstat/marginal/selection_variable_scikit_learn.py @@ -135,7 +135,7 @@ def importance(self, X, y): # Test the opposite hypothese to the anova # Test the similarity in the distribution instead of the difference self.importances_ = f_statistic - self.pvalues_ = 1 - p_values + self.pvalues_ = p_values return self.importances_ @@ -198,7 +198,7 @@ def importance(self, X, y): # Test the opposite hypothese to the anova # Test the similarity in the distribution instead of the difference self.importances_ = f_statistic - self.pvalues_ = 1 - p_values + self.pvalues_ = p_values return self.importances_