From df93c7819fa2109ad06f6c556b265f917b54e44e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 16:23:48 +0200 Subject: [PATCH 01/24] New API for CFI, PFI, LOCO --- src/hidimstat/base_perturbation.py | 62 +++++++----- .../conditional_feature_importance.py | 99 ++++++++++--------- src/hidimstat/conditional_sampling.py | 8 +- src/hidimstat/leave_one_covariate_out.py | 76 +++++++------- .../permutation_feature_importance.py | 70 ++++++------- 5 files changed, 168 insertions(+), 147 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index ef3c58343..741c527cd 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -14,9 +14,9 @@ class BasePerturbation(BaseVariableImportance): def __init__( self, estimator, + method: str = "predict", loss: callable = root_mean_squared_error, n_permutations: int = 50, - method: str = "predict", n_jobs: int = 1, ): """ @@ -27,6 +27,10 @@ def __init__( ---------- estimator : sklearn compatible estimator, optional The estimator to use for the prediction. + method : str, default="predict" + The method used for making predictions. This determines the predictions + passed to the loss function. Supported methods are "predict", + "predict_proba", "decision_function", "transform". loss : callable, default=root_mean_squared_error The function to compute the loss when comparing the perturbed model to the original model. @@ -35,10 +39,6 @@ def __init__( Specifies the number of times the variable group (residual for CFI) is permuted. For each permutation, the perturbed model's loss is calculated and averaged over all permutations. - method : str, default="predict" - The method used for making predictions. This determines the predictions - passed to the loss function. Supported methods are "predict", - "predict_proba", "decision_function", "transform". n_jobs : int, default=1 The number of parallel jobs to run. Parallelization is done over the variables or groups of variables. @@ -50,9 +50,16 @@ def __init__( self.loss = loss _check_vim_predict_method(method) self.method = method - self.n_jobs = n_jobs self.n_permutations = n_permutations - self.n_groups = None + self.n_jobs = n_jobs + # variable set in fit + self.groups = None + # varaible set in importance + self.loss_reference_ = None + self.loss_ = None + # internal variables + self._n_groups = None + self._groups_ids = None def fit(self, X, y=None, groups=None): """Base fit method for perturbation-based methods. Identifies the groups. @@ -69,11 +76,11 @@ def fit(self, X, y=None, groups=None): identified based on the columns of X. 
""" if groups is None: - self.n_groups = X.shape[1] - self.groups = {j: [j] for j in range(self.n_groups)} + self._n_groups = X.shape[1] + self.groups = {j: [j] for j in range(self._n_groups)} self._groups_ids = np.array(list(self.groups.values()), dtype=int) elif isinstance(groups, dict): - self.n_groups = len(groups) + self._n_groups = len(groups) self.groups = groups if isinstance(X, pd.DataFrame): self._groups_ids = [] @@ -91,6 +98,7 @@ def fit(self, X, y=None, groups=None): ] else: raise ValueError("groups needs to be a dictionnary") + return self def predict(self, X): """ @@ -139,27 +147,25 @@ def importance(self, X, y): """ self._check_fit(X) - out_dict = dict() - y_pred = getattr(self.estimator, self.method)(X) - loss_reference = self.loss(y, y_pred) - out_dict["loss_reference"] = loss_reference + self.loss_reference_ = self.loss(y, y_pred) y_pred = self.predict(X) - out_dict["loss"] = dict() + self.loss_ = dict() for j, y_pred_j in enumerate(y_pred): list_loss = [] for y_pred_perm in y_pred_j: list_loss.append(self.loss(y, y_pred_perm)) - out_dict["loss"][j] = np.array(list_loss) + self.loss_[j] = np.array(list_loss) - out_dict["importance"] = np.array( + self.importances_ = np.array( [ - np.mean(out_dict["loss"][j]) - loss_reference - for j in range(self.n_groups) + np.mean(self.loss_[j]) - self.loss_reference_ + for j in range(self._n_groups) ] ) - return out_dict + self.pvalues_ = None + return self.importances_ def _check_fit(self, X): """ @@ -183,11 +189,7 @@ def _check_fit(self, X): If the number of features in X does not match the total number of features in the grouped variables. """ - if ( - self.n_groups is None - or not hasattr(self, "groups") - or not hasattr(self, "_groups_ids") - ): + if self._n_groups is None or self.groups is None or self._groups_ids is None: raise ValueError( "The class is not fitted. The fit method must be called" " to set variable groups. If no grouping is needed," @@ -231,6 +233,16 @@ def _check_fit(self, X): f"{number_unique_feature_in_groups}" ) + def _check_importance(self): + """ + Checks if the loss have been computed. + """ + super()._check_importance() + if self.loss_reference_ is None or self.loss_ is None: + raise ValueError( + "The importances need to be called before calling this method" + ) + def _joblib_predict_one_group(self, X, group_id, group_key): """ Compute the predictions after perturbation of the data for a given diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 9b0e7905f..0f433ea4c 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -9,63 +9,65 @@ class CFI(BasePerturbation): + """ + Conditional Feature Importance (CFI) algorithm. + :footcite:t:`Chamma_NeurIPS2023` and for group-level see + :footcite:t:`Chamma_AAAI2024`. + + Parameters + ---------- + estimator : sklearn compatible estimator, optional + The estimator to use for the prediction. + method : str, default="predict" + The method to use for the prediction. This determines the predictions passed + to the loss function. Supported methods are "predict", "predict_proba", + "decision_function", "transform". + loss : callable, default=root_mean_squared_error + The loss function to use when comparing the perturbed model to the full + model. + n_permutations : int, default=50 + The number of permutations to perform. For each variable/group of variables, + the mean of the losses over the `n_permutations` is computed. 
+ imputation_model_continuous : sklearn compatible estimator, optional + The model used to estimate the conditional distribution of a given + continuous variable/group of variables given the others. + imputation_model_categorical : sklearn compatible estimator, optional + The model used to estimate the conditional distribution of a given + categorical variable/group of variables given the others. Binary is + considered as a special case of categorical. + categorical_max_cardinality : int, default=10 + The maximum cardinality of a variable to be considered as categorical + when the variable type is inferred (set to "auto" or not provided). + random_state : int, default=None + The random state to use for sampling. + n_jobs : int, default=1 + The number of jobs to run in parallel. Parallelization is done over the + variables or groups of variables. + + References + ---------- + .. footbibliography:: + """ + def __init__( self, estimator, - loss: callable = root_mean_squared_error, method: str = "predict", - n_jobs: int = 1, + loss: callable = root_mean_squared_error, n_permutations: int = 50, imputation_model_continuous=None, imputation_model_categorical=None, - random_state: int = None, categorical_max_cardinality: int = 10, + random_state: int = None, + n_jobs: int = 1, ): - """ - Conditional Feature Importance (CFI) algorithm. - :footcite:t:`Chamma_NeurIPS2023` and for group-level see - :footcite:t:`Chamma_AAAI2024`. - Parameters - ---------- - estimator : sklearn compatible estimator, optional - The estimator to use for the prediction. - loss : callable, default=root_mean_squared_error - The loss function to use when comparing the perturbed model to the full - model. - method : str, default="predict" - The method to use for the prediction. This determines the predictions passed - to the loss function. Supported methods are "predict", "predict_proba", - "decision_function", "transform". - n_jobs : int, default=1 - The number of jobs to run in parallel. Parallelization is done over the - variables or groups of variables. - n_permutations : int, default=50 - The number of permutations to perform. For each variable/group of variables, - the mean of the losses over the `n_permutations` is computed. - imputation_model_continuous : sklearn compatible estimator, optional - The model used to estimate the conditional distribution of a given - continuous variable/group of variables given the others. - imputation_model_categorical : sklearn compatible estimator, optional - The model used to estimate the conditional distribution of a given - categorical variable/group of variables given the others. Binary is - considered as a special case of categorical. - random_state : int, default=None - The random state to use for sampling. - categorical_max_cardinality : int, default=10 - The maximum cardinality of a variable to be considered as categorical - when the variable type is inferred (set to "auto" or not provided). - - References - ---------- - .. footbibliography:: - """ super().__init__( estimator=estimator, - loss=loss, method=method, - n_jobs=n_jobs, + loss=loss, n_permutations=n_permutations, + n_jobs=n_jobs, ) # check the validity of the inputs @@ -83,7 +85,8 @@ def __init__( self.random_state = random_state def fit(self, X, y=None, groups=None, var_type="auto"): - """Fit the imputation models. + """ + Fit the imputation models. 
Parameters ---------- @@ -107,13 +110,13 @@ def fit(self, X, y=None, groups=None, var_type="auto"): self.random_state = check_random_state(self.random_state) super().fit(X, None, groups=groups) if isinstance(var_type, str): - self.var_type = [var_type for _ in range(self.n_groups)] + var_type = [var_type for _ in range(self._n_groups)] else: - self.var_type = var_type + var_type = var_type self._list_imputation_models = [ ConditionalSampler( - data_type=self.var_type[groupd_id], + data_type=var_type[groupd_id], model_regression=( None if self.imputation_model_continuous is None @@ -127,7 +130,7 @@ def fit(self, X, y=None, groups=None, var_type="auto"): random_state=self.random_state, categorical_max_cardinality=self.categorical_max_cardinality, ) - for groupd_id in range(self.n_groups) + for groupd_id in range(self._n_groups) ] # Parallelize the fitting of the covariate estimators diff --git a/src/hidimstat/conditional_sampling.py b/src/hidimstat/conditional_sampling.py index f8920581a..609c7ea5d 100644 --- a/src/hidimstat/conditional_sampling.py +++ b/src/hidimstat/conditional_sampling.py @@ -45,8 +45,8 @@ def __init__( model_regression=None, model_categorical=None, data_type: str = "auto", - random_state=None, categorical_max_cardinality=10, + random_state=None, ): """ Class use to sample from the conditional distribution $p(X^j | X^{-j})$. @@ -62,11 +62,11 @@ def __init__( The variable type. Supported types include "auto", "continuous", and "categorical". If "auto", the type is inferred from the cardinality of the unique values passed to the `fit` method. - random_state : int, optional - The random state to use for sampling. categorical_max_cardinality : int, default=10 The maximum cardinality of a variable to be considered as categorical when `data_type` is "auto". + random_state : int, optional + The random state to use for sampling. """ # check the validity of the inputs @@ -79,8 +79,8 @@ def __init__( self.data_type = data_type self.model_regression = model_regression self.model_categorical = model_categorical - self.rng = check_random_state(random_state) self.categorical_max_cardinality = categorical_max_cardinality + self.rng = check_random_state(random_state) def fit(self, X: np.ndarray, y: np.ndarray): r""" diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index c9c64c464..8e946a55a 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -8,55 +8,59 @@ class LOCO(BasePerturbation): + """ + Leave-One-Covariate-Out (LOCO) as presented in + :footcite:t:`lei2018distribution` and :footcite:t:`verdinelli2024feature`. + The model is re-fitted for each variable/group of variables. The importance is + then computed as the difference between the loss of the full model and the loss + of the model without the variable/group. + + Parameters + ---------- + estimator : sklearn compatible estimator, optional + The estimator to use for the prediction. + method : str, default="predict" + The method to use for the prediction. This determines the predictions passed + to the loss function. Supported methods are "predict", "predict_proba", + "decision_function", "transform". + loss : callable, default=root_mean_squared_error + The loss function to use when comparing the perturbed model to the full + model. + n_jobs : int, default=1 + The number of jobs to run in parallel. Parallelization is done over the + variables or groups of variables. 
+ + Notes + ----- + :footcite:t:`Williamson_General_2023` also presented a LOCO method with an + additional data splitting strategy. + + References + ---------- + .. footbibliography:: + """ + def __init__( self, estimator, - loss: callable = root_mean_squared_error, method: str = "predict", + loss: callable = root_mean_squared_error, n_jobs: int = 1, ): - """ - Leave-One-Covariate-Out (LOCO) as presented in - :footcite:t:`lei2018distribution` and :footcite:t:`verdinelli2024feature`. - The model is re-fitted for each variable/group of variables. The importance is - then computed as the difference between the loss of the full model and the loss - of the model without the variable/group. - Parameters - ---------- - estimator : sklearn compatible estimator, optional - The estimator to use for the prediction. - loss : callable, default=root_mean_squared_error - The loss function to use when comparing the perturbed model to the full - model. - method : str, default="predict" - The method to use for the prediction. This determines the predictions passed - to the loss function. Supported methods are "predict", "predict_proba", - "decision_function", "transform". - n_jobs : int, default=1 - The number of jobs to run in parallel. Parallelization is done over the - variables or groups of variables. - - Notes - ----- - :footcite:t:`Williamson_General_2023` also presented a LOCO method with an - additional data splitting strategy. - - References - ---------- - .. footbibliography:: - """ super().__init__( estimator=estimator, - loss=loss, method=method, - n_jobs=n_jobs, + loss=loss, n_permutations=1, + n_jobs=n_jobs, ) + # internal variable self._list_estimators = [] def fit(self, X, y, groups=None): - """Fit a model after removing each covariate/group of covariates. + """ + Fit a model after removing each covariate/group of covariates. Parameters ---------- @@ -75,7 +79,7 @@ def fit(self, X, y, groups=None): """ super().fit(X, y, groups) # create a list of covariate estimators for each group if not provided - self._list_estimators = [clone(self.estimator) for _ in range(self.n_groups)] + self._list_estimators = [clone(self.estimator) for _ in range(self._n_groups)] # Parallelize the fitting of the covariate estimators self._list_estimators = Parallel(n_jobs=self.n_jobs)( @@ -93,7 +97,7 @@ def _joblib_fit_one_group(self, estimator, X, y, key_groups): estimator.fit(X_minus_j, y) return estimator - def _joblib_predict_one_group(self, X, group_id, key_groups): + def _joblib_predict_one_group(self, X, group_id, group_key): """Predict the target variable after removing a group of covariates. Used in parallel.""" X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1) diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 29d007656..e6fa08a44 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -6,52 +6,54 @@ class PFI(BasePerturbation): + """ + Permutation Feature Importance algorithm as presented in + :footcite:t:`breimanRandomForests2001`. For each variable/group of variables, + the importance is computed as the difference between the loss of the initial + model and the loss of the model with the variable/group permuted. + The method was also used in :footcite:t:`mi2021permutation` + + Parameters + ---------- + estimator : sklearn compatible estimator, optionals + The estimator to use for the prediction. 
+ method : str, default="predict" + The method to use for the prediction. This determines the predictions passed + to the loss function. Supported methods are "predict", "predict_proba", + "decision_function", "transform". + loss : callable, default=root_mean_squared_error + The loss function to use when comparing the perturbed model to the full + model. + n_permutations : int, default=50 + The number of permutations to perform. For each variable/group of variables, + the mean of the losses over the `n_permutations` is computed. + random_state : int, default=None + The random state to use for sampling. + n_jobs : int, default=1 + The number of jobs to run in parallel. Parallelization is done over the + variables or groups of variables. + + References + ---------- + .. footbibliography:: + """ + def __init__( self, estimator, - loss: callable = root_mean_squared_error, method: str = "predict", - n_jobs: int = 1, + loss: callable = root_mean_squared_error, n_permutations: int = 50, random_state: int = None, + n_jobs: int = 1, ): - """ - Permutation Feature Importance algorithm as presented in - :footcite:t:`breimanRandomForests2001`. For each variable/group of variables, - the importance is computed as the difference between the loss of the initial - model and the loss of the model with the variable/group permuted. - The method was also used in :footcite:t:`mi2021permutation` - - Parameters - ---------- - estimator : sklearn compatible estimator, optionals - The estimator to use for the prediction. - loss : callable, default=root_mean_squared_error - The loss function to use when comparing the perturbed model to the full - model. - method : str, default="predict" - The method to use for the prediction. This determines the predictions passed - to the loss function. Supported methods are "predict", "predict_proba", - "decision_function", "transform". - n_jobs : int, default=1 - The number of jobs to run in parallel. Parallelization is done over the - variables or groups of variables. - n_permutations : int, default=50 - The number of permutations to perform. For each variable/group of variables, - the mean of the losses over the `n_permutations` is computed. - random_state : int, default=None - The random state to use for sampling. - References - ---------- - .. 
footbibliography:: - """ super().__init__( estimator=estimator, - loss=loss, method=method, - n_jobs=n_jobs, + loss=loss, n_permutations=n_permutations, + n_jobs=n_jobs, ) self.random_state = random_state From ccb60ed4b5229e96aea3e337cea3e3bdd0e9f2a3 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 16:29:54 +0200 Subject: [PATCH 02/24] fix test for new API --- test/test_conditional_feature_importance.py | 15 ++++++--------- test/test_leave_one_covariate_out.py | 9 +++------ test/test_permutation_feature_importance.py | 9 +++------ 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index f3bec9735..529cf891d 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -66,8 +66,7 @@ def run_cfi(X, y, n_permutation, seed): var_type="auto", ) # calculate feature importance using the test set - vim = cfi.importance(X_test, y_test) - importance = vim["importance"] + importance = cfi.importance(X_test, y_test) return importance @@ -199,9 +198,8 @@ def test_group(data_generator): ) # Warning expected since column names in pandas are not considered with pytest.warns(UserWarning, match="X does not have valid feature names, but"): - vim = cfi.importance(X_test_df, y_test) + importance = cfi.importance(X_test_df, y_test) - importance = vim["importance"] # Check if importance scores are computed for each feature assert importance.shape == (2,) # Verify that important feature group has higher score @@ -248,8 +246,7 @@ def test_classication(data_generator): groups=None, var_type=["continuous"] * X.shape[1], ) - vim = cfi.importance(X_test, y_test_clf) - importance = vim["importance"] + importance = cfi.importance(X_test, y_test_clf) # Check that importance scores are defined for each feature assert importance.shape == (X.shape[1],) # Check that important features have higher mean importance scores @@ -297,13 +294,13 @@ def test_fit(self, data_generator): # Test fit with auto var_type cfi.fit(X) assert len(cfi._list_imputation_models) == X.shape[1] - assert cfi.n_groups == X.shape[1] + assert cfi._n_groups == X.shape[1] # Test fit with specified groups groups = {"g1": [0, 1], "g2": [2, 3, 4]} cfi.fit(X, groups=groups) assert len(cfi._list_imputation_models) == 2 - assert cfi.n_groups == 2 + assert cfi._n_groups == 2 def test_categorical( self, @@ -334,7 +331,7 @@ def test_categorical( var_type = ["continuous", "continuous", "categorical"] cfi.fit(X, y, var_type=var_type) - importances = cfi.importance(X, y)["importance"] + importances = cfi.importance(X, y) assert len(importances) == 3 assert np.all(importances >= 0) diff --git a/test/test_leave_one_covariate_out.py b/test/test_leave_one_covariate_out.py index d8fd2a763..f6f4b2319 100644 --- a/test/test_leave_one_covariate_out.py +++ b/test/test_leave_one_covariate_out.py @@ -38,9 +38,8 @@ def test_loco(): y_train, groups=None, ) - vim = loco.importance(X_test, y_test) + importance = loco.importance(X_test, y_test) - importance = vim["importance"] assert importance.shape == (X.shape[1],) assert ( importance[important_features].mean() @@ -67,9 +66,8 @@ def test_loco(): ) # warnings because we doesn't considere the name of columns of pandas with pytest.warns(UserWarning, match="X does not have valid feature names, but"): - vim = loco.importance(X_test_df, y_test) + importance = loco.importance(X_test_df, y_test) - importance = vim["importance"] assert importance[0].mean() > 
importance[1].mean() # Classification case @@ -89,9 +87,8 @@ def test_loco(): y_train_clf, groups={"group_0": important_features, "the_group_1": non_important_features}, ) - vim_clf = loco_clf.importance(X_test, y_test_clf) + importance_clf = loco_clf.importance(X_test, y_test_clf) - importance_clf = vim_clf["importance"] assert importance_clf.shape == (2,) assert importance[0].mean() > importance[1].mean() diff --git a/test/test_permutation_feature_importance.py b/test/test_permutation_feature_importance.py index b9639f359..ee0a870c1 100644 --- a/test/test_permutation_feature_importance.py +++ b/test/test_permutation_feature_importance.py @@ -39,9 +39,8 @@ def test_permutation_importance(): y_train, groups=None, ) - vim = pfi.importance(X_test, y_test) + importance = pfi.importance(X_test, y_test) - importance = vim["importance"] assert importance.shape == (X.shape[1],) assert ( importance[important_features].mean() @@ -70,9 +69,8 @@ def test_permutation_importance(): ) # warnings because we doesn't considere the name of columns of pandas with pytest.warns(UserWarning, match="X does not have valid feature names, but"): - vim = pfi.importance(X_test_df, y_test) + importance = pfi.importance(X_test_df, y_test) - importance = vim["importance"] assert importance[0].mean() > importance[1].mean() # Classification case @@ -95,7 +93,6 @@ def test_permutation_importance(): y_train_clf, groups=None, ) - vim_clf = pfi_clf.importance(X_test, y_test_clf) + importance_clf = pfi_clf.importance(X_test, y_test_clf) - importance_clf = vim_clf["importance"] assert importance_clf.shape == (X.shape[1],) From 7c827ad57ee14269692847f1b01eb0d1b1f8615a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 16:51:42 +0200 Subject: [PATCH 03/24] fix example --- examples/plot_conditional_vs_marginal_xor_data.py | 2 +- .../plot_diabetes_variable_importance_example.py | 12 ++++++------ examples/plot_importance_classification_iris.py | 2 +- examples/plot_model_agnostic_importance.py | 6 ++---- examples/plot_pitfalls_permutation_importance.py | 4 ++-- src/hidimstat/leave_one_covariate_out.py | 5 +++-- src/hidimstat/permutation_feature_importance.py | 10 ++++++---- 7 files changed, 21 insertions(+), 20 deletions(-) diff --git a/examples/plot_conditional_vs_marginal_xor_data.py b/examples/plot_conditional_vs_marginal_xor_data.py index e9ea09ec9..55a095fc5 100644 --- a/examples/plot_conditional_vs_marginal_xor_data.py +++ b/examples/plot_conditional_vs_marginal_xor_data.py @@ -117,7 +117,7 @@ random_state=0, ) vim.fit(X_train, y_train) - importances.append(vim.importance(X_test, y_test)["importance"]) + importances.append(vim.importance(X_test, y_test)) importances = np.array(importances).T diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index 0340d9a3d..17e933802 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -184,14 +184,14 @@ def compute_pval(vim): # ------------------- -cfi_vim_arr = np.array([x["importance"] for x in cfi_importance_list]) / 2 +cfi_vim_arr = np.array(cfi_importance_list) / 2 cfi_pval = compute_pval(cfi_vim_arr) vim = [ pd.DataFrame( { "var": np.arange(cfi_vim_arr.shape[1]), - "importance": x["importance"], + "importance": x, "fold": i, "pval": cfi_pval, "method": "CFI", @@ -200,14 +200,14 @@ def compute_pval(vim): for x in cfi_importance_list ] -loco_vim_arr = np.array([x["importance"] for x in loco_importance_list]) +loco_vim_arr = 
np.array(loco_importance_list) loco_pval = compute_pval(loco_vim_arr) vim += [ pd.DataFrame( { "var": np.arange(loco_vim_arr.shape[1]), - "importance": x["importance"], + "importance": x, "fold": i, "pval": loco_pval, "method": "LOCO", @@ -216,14 +216,14 @@ def compute_pval(vim): for x in loco_importance_list ] -pfi_vim_arr = np.array([x["importance"] for x in pfi_importance_list]) +pfi_vim_arr = np.array(pfi_importance_list) pfi_pval = compute_pval(pfi_vim_arr) vim += [ pd.DataFrame( { "var": np.arange(pfi_vim_arr.shape[1]), - "importance": x["importance"], + "importance": x, "fold": i, "pval": pfi_pval, "method": "PFI", diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index eb92d7abf..9a2be2b72 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -93,7 +93,7 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CFI", groups=No ) vim.fit(X[train_index], y[train_index], groups=groups) - importance = vim.importance(X[test_index], y[test_index])["importance"] + importance = vim.importance(X[test_index], y[test_index]) return pd.DataFrame( { diff --git a/examples/plot_model_agnostic_importance.py b/examples/plot_model_agnostic_importance.py index 9ace442d4..03f8ceded 100644 --- a/examples/plot_model_agnostic_importance.py +++ b/examples/plot_model_agnostic_importance.py @@ -108,10 +108,8 @@ vim_linear.fit(X[train], y[train]) vim_non_linear.fit(X[train], y[train]) - importances_linear.append(vim_linear.importance(X[test], y[test])["importance"]) - importances_non_linear.append( - vim_non_linear.importance(X[test], y[test])["importance"] - ) + importances_linear.append(vim_linear.importance(X[test], y[test])) + importances_non_linear.append(vim_non_linear.importance(X[test], y[test])) ################################################################################ diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index af4deb83e..e7a041d8e 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -132,7 +132,7 @@ ) pfi.fit(X_test, y_test) - permutation_importances.append(pfi.importance(X_test, y_test)["importance"]) + permutation_importances.append(pfi.importance(X_test, y_test)) permutation_importances = np.stack(permutation_importances) pval_pfi = ttest_1samp( permutation_importances, 0.0, axis=0, alternative="greater" @@ -200,7 +200,7 @@ ) cfi.fit(X_test, y_test) - conditional_importances.append(cfi.importance(X_test, y_test)["importance"]) + conditional_importances.append(cfi.importance(X_test, y_test)) cfi_pval = ttest_1samp( diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index 8e946a55a..5a865cb71 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -9,8 +9,9 @@ class LOCO(BasePerturbation): """ - Leave-One-Covariate-Out (LOCO) as presented in - :footcite:t:`lei2018distribution` and :footcite:t:`verdinelli2024feature`. + Leave-One-Covariate-Out (LOCO) algorithm + + This method is presented in :footcite:t:`lei2018distribution` and :footcite:t:`verdinelli2024feature`. The model is re-fitted for each variable/group of variables. The importance is then computed as the difference between the loss of the full model and the loss of the model without the variable/group. 
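A minimal usage sketch of the array-returning API that the example updates above rely on; the toy dataset, the train/test split and the variable names are illustrative assumptions, not part of the patch:

    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from hidimstat import LOCO

    # toy data and a pre-fitted estimator (the perturbation classes check that
    # the model is already fitted)
    X, y = make_regression(n_samples=200, n_features=10, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    model = LinearRegression().fit(X_train, y_train)

    loco = LOCO(model, method="predict", n_jobs=1)
    loco.fit(X_train, y_train)                     # refits one sub-model per feature/group
    importances = loco.importance(X_test, y_test)  # ndarray with one score per feature/group
    # the reference loss and the per-group losses are exposed as attributes
    print(importances.shape, loco.loss_reference_)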
diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index e6fa08a44..10f18289f 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -7,10 +7,12 @@ class PFI(BasePerturbation): """ - Permutation Feature Importance algorithm as presented in - :footcite:t:`breimanRandomForests2001`. For each variable/group of variables, - the importance is computed as the difference between the loss of the initial - model and the loss of the model with the variable/group permuted. + Permutation Feature Importance algorithm + + This as presented in :footcite:t:`breimanRandomForests2001`. + For each variable/group of variables, the importance is computed as + the difference between the loss of the initial model and the loss of + the model with the variable/group permuted. The method was also used in :footcite:t:`mi2021permutation` Parameters From 82d61e6120eb55c83a280c3e4944e6ab1ef311f6 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 17:23:02 +0200 Subject: [PATCH 04/24] add test for new check --- test/test_base_perturbation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/test_base_perturbation.py b/test/test_base_perturbation.py index dd3ff6d6c..9198cf72d 100644 --- a/test/test_base_perturbation.py +++ b/test/test_base_perturbation.py @@ -12,3 +12,16 @@ def test_no_implemented_methods(): basic_class = BasePerturbation(estimator=estimator) with pytest.raises(NotImplementedError): basic_class._permutation(X, group_id=None) + + +def test_chek_importance(): + """test that the methods are not implemented in the base class""" + X = np.random.randint(0, 2, size=(100, 2, 1)) + estimator = LinearRegression() + estimator.fit(X[:, 0], X[:, 1]) + basic_class = BasePerturbation(estimator=estimator) + basic_class.importances_ = [] + with pytest.raises( + ValueError, match="The importances need to be called before calling this method" + ): + basic_class.selection() From 28593e49e77b9310861b4dc6fca3e4931d687ea8 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 18:13:55 +0200 Subject: [PATCH 05/24] add pvalue and fit_importance and function --- src/hidimstat/base_perturbation.py | 52 +++++++++++++- .../conditional_feature_importance.py | 70 +++++++++++++++++++ src/hidimstat/leave_one_covariate_out.py | 63 +++++++++++++++++ .../permutation_feature_importance.py | 62 ++++++++++++++++ 4 files changed, 244 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 741c527cd..fa6f7fde2 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -1,9 +1,12 @@ +import warnings + import numpy as np import pandas as pd from joblib import Parallel, delayed -from sklearn.base import check_is_fitted +from scipy.stats import ttest_1samp +from sklearn.base import check_is_fitted, clone from sklearn.metrics import root_mean_squared_error -import warnings +from sklearn.model_selection import KFold from hidimstat._utils.utils import _check_vim_predict_method from hidimstat._utils.exception import InternalError @@ -164,9 +167,52 @@ def importance(self, X, y): for j in range(self._n_groups) ] ) - self.pvalues_ = None + self.pvalues_ = ttest_1samp( + self.importances_, 0.0, axis=0, alternative="greater" + ).pvalue return self.importances_ + def fit_importance( + self, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=0), **fit_kwargs + ): + """ + Compute feature 
importance scores using cross-validation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : array-like of shape (n_samples,) + Target values. + cv : cross-validation generator or iterable, default=KFold(n_splits=5, shuffle=True, random_state=0) + Determines the cross-validation splitting strategy. + **fit_kwargs : dict + Additional arguments passed to the fit method during variable group identification. + + Returns + ------- + importances : float + Mean feature importance scores across CV folds. + + Notes + ----- + For each CV fold: + 1. Clones and fits the estimator on training fold + 2. Identifies variable groups on training fold + 3. Computes feature importances on test fold + 4. Returns average importance across all folds + + The importances for each fold are stored in self.importances_ + """ + importances = [] + for train, test in cv.split(X): + estimator = clone(self.estimator) + estimator.fit(X[train], y[train]) + self.fit(X[train], y[train], **fit_kwargs) + importances.append(self.importance(X[test], y[test])) + self.importances_ = importances + return np.mean(importances) + def _check_fit(self, X): """ Check if the perturbation method has been properly fitted. diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 0f433ea4c..f45e8d402 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -2,10 +2,12 @@ from joblib import Parallel, delayed from sklearn.base import check_is_fitted, clone, BaseEstimator from sklearn.metrics import root_mean_squared_error +from sklearn.model_selection import KFold from sklearn.utils.validation import check_random_state from hidimstat.base_perturbation import BasePerturbation from hidimstat.conditional_sampling import ConditionalSampler +from hidimstat._utils.docstring import _aggregate_docstring class CFI(BasePerturbation): @@ -191,3 +193,71 @@ def _permutation(self, X, group_id): return self._list_imputation_models[group_id].sample( X_minus_j, X_j, n_samples=self.n_permutations ) + + +def cfi( + estimator, + X, + y, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + groups: dict = None, + var_type: str = "auto", + method: str = "predict", + loss: callable = root_mean_squared_error, + n_permutations: int = 50, + imputation_model_continuous=None, + imputation_model_categorical=None, + categorical_max_cardinality: int = 10, + k_best=None, + percentile=None, + threshold=None, + threshold_pvalue=None, + random_state: int = None, + n_jobs: int = 1, +): + methods = CFI( + estimator=estimator, + method=method, + loss=loss, + n_permutations=n_permutations, + imputation_model_continuous=imputation_model_continuous, + imputation_model_categorical=imputation_model_categorical, + categorical_max_cardinality=categorical_max_cardinality, + random_state=random_state, + n_jobs=n_jobs, + ) + methods.fit_importance( + X, + y, + cv=cv, + groups=groups, + var_type=var_type, + ) + selection = methods.selection( + k_best=k_best, + percentile=percentile, + threshold=threshold, + threshold_pvalue=threshold_pvalue, + ) + return selection, methods.importances_, methods.pvalues_ + + +# use the docstring of the class for the function +cfi.__doc__ = _aggregate_docstring( + [ + CFI.__doc__, + CFI.__init__.__doc__, + CFI.fit_importance.__doc__, + CFI.selection.__doc__, + ], + """ + Returns + ------- + selection : ndarray of shape (n_features,) + Boolean array indicating selected features (True = 
selected) + importances : ndarray of shape (n_features,) + Feature importance scores/test statistics. + pvalues : ndarray of shape (n_features,) + + """, +) diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index 5a865cb71..6a9f2dce4 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -2,9 +2,11 @@ import pandas as pd from joblib import Parallel, delayed from sklearn.base import check_is_fitted, clone +from sklearn.model_selection import KFold from sklearn.metrics import root_mean_squared_error from hidimstat.base_perturbation import BasePerturbation +from hidimstat._utils.docstring import _aggregate_docstring class LOCO(BasePerturbation): @@ -89,6 +91,11 @@ def fit(self, X, y, groups=None): ) return self + def importance(self, X, y): + super().importance(X, y) + self.pvalues_ = None + return self.importances_ + def _joblib_fit_one_group(self, estimator, X, y, key_groups): """Fit the estimator after removing a group of covariates. Used in parallel.""" if isinstance(X, pd.DataFrame): @@ -116,3 +123,59 @@ def _check_fit(self, X): raise ValueError("The estimators require to be fit before to use them") for m in self._list_estimators: check_is_fitted(m) + + +def loco( + estimator, + X, + y, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + groups: dict = None, + method: str = "predict", + loss: callable = root_mean_squared_error, + k_best=None, + percentile=None, + threshold=None, + threshold_pvalue=None, + n_jobs: int = 1, +): + methods = LOCO( + estimator=estimator, + method=method, + loss=loss, + n_jobs=n_jobs, + ) + methods.fit_importance( + X, + y, + cv=cv, + groups=groups, + ) + selection = methods.selection( + k_best=k_best, + percentile=percentile, + threshold=threshold, + threshold_pvalue=threshold_pvalue, + ) + return selection, methods.importances_, methods.pvalues_ + + +# use the docstring of the class for the function +loco.__doc__ = _aggregate_docstring( + [ + LOCO.__doc__, + LOCO.__init__.__doc__, + LOCO.fit_importance.__doc__, + LOCO.selection.__doc__, + ], + """ + Returns + ------- + selection : ndarray of shape (n_features,) + Boolean array indicating selected features (True = selected) + importances : ndarray of shape (n_features,) + Feature importance scores/test statistics. 
+ pvalues : ndarray of shape (n_features,) + + """, +) diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 10f18289f..14f02fcc5 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -1,8 +1,10 @@ import numpy as np from sklearn.metrics import root_mean_squared_error +from sklearn.model_selection import KFold from sklearn.utils import check_random_state from hidimstat.base_perturbation import BasePerturbation +from hidimstat._utils.docstring import _aggregate_docstring class PFI(BasePerturbation): @@ -69,3 +71,63 @@ def _permutation(self, X, group_id): ] ) return X_perm_j + + +def pfi( + estimator, + X, + y, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + groups: dict = None, + method: str = "predict", + loss: callable = root_mean_squared_error, + n_permutations: int = 50, + k_best=None, + percentile=None, + threshold=None, + threshold_pvalue=None, + random_state: int = None, + n_jobs: int = 1, +): + methods = PFI( + estimator=estimator, + method=method, + loss=loss, + n_permutations=n_permutations, + random_state=random_state, + n_jobs=n_jobs, + ) + methods.fit_importance( + X, + y, + cv=cv, + groups=groups, + ) + selection = methods.selection( + k_best=k_best, + percentile=percentile, + threshold=threshold, + threshold_pvalue=threshold_pvalue, + ) + return selection, methods.importances_, methods.pvalues_ + + +# use the docstring of the class for the function +pfi.__doc__ = _aggregate_docstring( + [ + PFI.__doc__, + PFI.__init__.__doc__, + PFI.fit_importance.__doc__, + PFI.selection.__doc__, + ], + """ + Returns + ------- + selection : ndarray of shape (n_features,) + Boolean array indicating selected features (True = selected) + importances : ndarray of shape (n_features,) + Feature importance scores/test statistics. 
+ pvalues : ndarray of shape (n_features,) + + """, +) From cabfb6301056fb8468f6802ca493636e35c66451 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 18:46:47 +0200 Subject: [PATCH 06/24] Add new function --- src/hidimstat/__init__.py | 9 ++++-- src/hidimstat/base_perturbation.py | 7 ++-- test/test_conditional_feature_importance.py | 21 +++++++++++- test/test_leave_one_covariate_out.py | 34 ++++++++++++++++++- test/test_permutation_feature_importance.py | 36 ++++++++++++++++++++- 5 files changed, 99 insertions(+), 8 deletions(-) diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index 81d5a0cce..3037bbcc9 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -14,16 +14,16 @@ desparsified_group_lasso_pvalue, ) from .distilled_conditional_randomization_test import d0crt, D0CRT -from .conditional_feature_importance import CFI +from .conditional_feature_importance import cfi, CFI from .knockoffs import ( model_x_knockoff, model_x_knockoff_pvalue, model_x_knockoff_bootstrap_quantile, model_x_knockoff_bootstrap_e_value, ) -from .leave_one_covariate_out import LOCO +from .leave_one_covariate_out import loco, LOCO from .noise_std import reid -from .permutation_feature_importance import PFI +from .permutation_feature_importance import pfi, PFI from .statistical_tools.aggregation import quantile_aggregation @@ -49,6 +49,9 @@ "model_x_knockoff_bootstrap_quantile", "model_x_knockoff_bootstrap_e_value", "CFI", + "cfi", "LOCO", + "loco", "PFI", + "pfi", ] diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index fa6f7fde2..c42892fbe 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -60,6 +60,8 @@ def __init__( # varaible set in importance self.loss_reference_ = None self.loss_ = None + # variable set in fit_importance + self.importances_cv_ = None # internal variables self._n_groups = None self._groups_ids = None @@ -210,8 +212,9 @@ def fit_importance( estimator.fit(X[train], y[train]) self.fit(X[train], y[train], **fit_kwargs) importances.append(self.importance(X[test], y[test])) - self.importances_ = importances - return np.mean(importances) + self.importances_cv_ = importances + self.importances_ = np.mean(importances, axis=0) + return self.importances_ def _check_fit(self, X): """ diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index 529cf891d..ef00be5b7 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -8,7 +8,7 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import root_mean_squared_error -from hidimstat import CFI, BasePerturbation +from hidimstat import cfi, CFI, BasePerturbation from hidimstat._utils.exception import InternalError @@ -565,3 +565,22 @@ def test_groups_warning(self, data_generator): " number of features for which importance is computed: 4", ): cfi.importance(X, y) + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + [(150, 200, 10, 0.2, 42, 1.0, 1.0, 0.0)], + ids=["high level noise"], +) +@pytest.mark.parametrize("n_permutation, cfi_seed", [(20, 0)], ids=["default_cfi"]) +def test_function_cfi(data_generator, n_permutation, cfi_seed): + """Test CFI function""" + X, y, _, _ = data_generator + cfi( + LinearRegression().fit(X, y), + X, + y, + imputation_model_continuous=LinearRegression(), + n_permutations=n_permutation, + random_state=cfi_seed, 
+ ) diff --git a/test/test_leave_one_covariate_out.py b/test/test_leave_one_covariate_out.py index f6f4b2319..a875f98d0 100644 --- a/test/test_leave_one_covariate_out.py +++ b/test/test_leave_one_covariate_out.py @@ -7,7 +7,7 @@ from sklearn.model_selection import train_test_split from hidimstat._utils.scenario import multivariate_simulation -from hidimstat import LOCO, BasePerturbation +from hidimstat import loco, LOCO, BasePerturbation def test_loco(): @@ -135,3 +135,35 @@ def test_raises_value_error(): ) BasePerturbation.fit(loco, X, y) loco.importance(X, y) + + +def test_loco_function(): + """Test the function of LOCO algorithm on a linear scenario.""" + X, y, beta, noise = multivariate_simulation( + n_samples=150, + n_features=200, + support_size=10, + shuffle=False, + seed=42, + ) + important_features = np.where(beta != 0)[0] + non_important_features = np.where(beta == 0)[0] + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + regression_model = LinearRegression() + regression_model.fit(X_train, y_train) + + selection, importance, pvalue = loco( + regression_model, + X, + y, + method="predict", + n_jobs=1, + ) + + assert importance.shape == (X.shape[1],) + assert ( + importance[important_features].mean() + > importance[non_important_features].mean() + ) diff --git a/test/test_permutation_feature_importance.py b/test/test_permutation_feature_importance.py index ee0a870c1..d38dfe1a2 100644 --- a/test/test_permutation_feature_importance.py +++ b/test/test_permutation_feature_importance.py @@ -5,7 +5,7 @@ from sklearn.model_selection import train_test_split import pytest -from hidimstat import PFI +from hidimstat import PFI, pfi from hidimstat._utils.scenario import multivariate_simulation @@ -96,3 +96,37 @@ def test_permutation_importance(): importance_clf = pfi_clf.importance(X_test, y_test_clf) assert importance_clf.shape == (X.shape[1],) + + +def test_permutation_importance_function(): + """Test the function of Permutation Importance algorithm on a linear scenario.""" + X, y, beta, noise = multivariate_simulation( + n_samples=150, + n_features=200, + support_size=10, + shuffle=False, + seed=42, + ) + important_features = np.where(beta != 0)[0] + non_important_features = np.where(beta == 0)[0] + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + regression_model = LinearRegression() + regression_model.fit(X_train, y_train) + + selection, importance, pvalue = pfi( + regression_model, + X, + y, + n_permutations=20, + method="predict", + random_state=0, + n_jobs=1, + ) + + assert importance.shape == (X.shape[1],) + assert ( + importance[important_features].mean() + > importance[non_important_features].mean() + ) From 7d7fd7d9b0b507b165497eab1963ce6750fca1f8 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 18:55:41 +0200 Subject: [PATCH 07/24] fix docstring --- src/hidimstat/base_perturbation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index c42892fbe..bbddcf765 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -204,7 +204,7 @@ def fit_importance( 3. Computes feature importances on test fold 4. 
Returns average importance across all folds - The importances for each fold are stored in self.importances_ + The importances for each fold are stored in self.importances\_ """ importances = [] for train, test in cv.split(X): From b958cc7b013abb849b0e6e9bca6762ac0047f9f1 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 3 Sep 2025 15:19:11 +0200 Subject: [PATCH 08/24] Improve cross validation --- src/hidimstat/_utils/utils.py | 31 ++++++++++++++++++++++++++++++ src/hidimstat/base_perturbation.py | 24 ++++++++++++++++------- test/_utils/test_utils.py | 16 +++++++++++++++ 3 files changed, 64 insertions(+), 7 deletions(-) create mode 100644 test/_utils/test_utils.py diff --git a/src/hidimstat/_utils/utils.py b/src/hidimstat/_utils/utils.py index 66166f444..57cc38f70 100644 --- a/src/hidimstat/_utils/utils.py +++ b/src/hidimstat/_utils/utils.py @@ -25,3 +25,34 @@ def _check_vim_predict_method(method): "The method {} is not a valid method " "for variable importance measure prediction".format(method) ) + + +def get_generated_attributes(cls): + """ + Get all attributes from a class that end with a single underscore + and doesn't start with one underscore. + + Parameters + ---------- + cls : class + The class to inspect for attributes. + + Returns + ------- + list + A list of attribute names that end with a single underscore but not double underscore. + """ + # Get all attributes and methods of the class + all_attributes = dir(cls) + + # Filter out attributes that start with an underscore + filtered_attributes = [attr for attr in all_attributes if not attr.startswith("_")] + + # Filter out attributes that do not end with a single underscore + result = [ + attr + for attr in filtered_attributes + if attr.endswith("_") and not attr.endswith("__") + ] + + return result diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index bbddcf765..3cd3ae2b5 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -11,6 +11,7 @@ from hidimstat._utils.utils import _check_vim_predict_method from hidimstat._utils.exception import InternalError from hidimstat.base_variable_importance import BaseVariableImportance +from hidimstat._utils.utils import get_generated_attributes class BasePerturbation(BaseVariableImportance): @@ -60,8 +61,6 @@ def __init__( # varaible set in importance self.loss_reference_ = None self.loss_ = None - # variable set in fit_importance - self.importances_cv_ = None # internal variables self._n_groups = None self._groups_ids = None @@ -206,14 +205,23 @@ def fit_importance( The importances for each fold are stored in self.importances\_ """ - importances = [] + name_attribute_save = get_generated_attributes(self) + for name in name_attribute_save: + setattr(self, name + "cv_", []) + self.estimators_cv_ = [] + for train, test in cv.split(X): estimator = clone(self.estimator) estimator.fit(X[train], y[train]) self.fit(X[train], y[train], **fit_kwargs) - importances.append(self.importance(X[test], y[test])) - self.importances_cv_ = importances - self.importances_ = np.mean(importances, axis=0) + self.importance(X[test], y[test]) + # save result of each cv + for name in name_attribute_save: + getattr(self, name + "cv_").append(getattr(self, name)) + setattr(self, name, None) + self.estimators_cv_.append(estimator) + self.importances_ = np.mean(self.importances_cv_, axis=0) + self.pvalues_ = np.mean(self.pvalues_cv_, axis=0) return self.importances_ def _check_fit(self, X): @@ -287,7 +295,9 @@ def _check_importance(self): Checks 
if the loss have been computed. """ super()._check_importance() - if self.loss_reference_ is None or self.loss_ is None: + if ( + self.loss_reference_ is None and not hasattr(self, "loss_reference_cv_") + ) or (self.loss_ is None and not hasattr(self, "loss_cv_")): raise ValueError( "The importances need to be called before calling this method" ) diff --git a/test/_utils/test_utils.py b/test/_utils/test_utils.py new file mode 100644 index 000000000..192e08149 --- /dev/null +++ b/test/_utils/test_utils.py @@ -0,0 +1,16 @@ +from hidimstat._utils.utils import get_generated_attributes + + +def test_generated_attributes(): + """Test function for getting generated attribute""" + + class MyClass: + def __init__(self): + self.attr1 = 1 + self.attr2_ = 2 + self._attr3 = 3 + self.attr4__ = 4 + self.attr5_ = 5 + + attributes = get_generated_attributes(MyClass()) + assert attributes == ["attr2_", "attr5_"] From 1f97d60c3259c989983e63960d67a182b7ad6680 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 3 Sep 2025 15:26:24 +0200 Subject: [PATCH 09/24] update docstring --- src/hidimstat/base_perturbation.py | 62 ++++++++++++------- .../conditional_feature_importance.py | 2 +- src/hidimstat/leave_one_covariate_out.py | 2 +- .../permutation_feature_importance.py | 2 +- 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 3cd3ae2b5..c2095ae9c 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -15,6 +15,45 @@ class BasePerturbation(BaseVariableImportance): + """ + Base class for model-agnostic variable importance measures based on + perturbation. + + Parameters + ---------- + estimator : sklearn compatible estimator + The estimator to use for the prediction. + method : str, default="predict" + The method used for making predictions. This determines the predictions + passed to the loss function. Supported methods are "predict", + "predict_proba", "decision_function", "transform". + loss : callable, default=root_mean_squared_error + Loss function to compute difference between original and perturbed predictions. + n_permutations : int, default=50 + Number of permutations to perform for calculating variable importance. + Higher values give more stable results but increase computation time. + n_jobs : int, default=1 + Number of parallel jobs to run. -1 means using all processors. + + Attributes + ---------- + groups : dict + Mapping of feature groups identified during fit. + importances_ : ndarray + Computed importance scores for each feature group. + loss_reference_ : float + Loss of the original model without perturbation. + loss_ : dict + Loss values for each perturbed feature group. + pvalues_ : ndarray + P-values for importance scores. + + Notes + ----- + This is an abstract base class. Concrete implementations must override + the _permutation method. + """ + def __init__( self, estimator, @@ -23,30 +62,7 @@ def __init__( n_permutations: int = 50, n_jobs: int = 1, ): - """ - Base class for model-agnostic variable importance measures based on - perturbation. - Parameters - ---------- - estimator : sklearn compatible estimator, optional - The estimator to use for the prediction. - method : str, default="predict" - The method used for making predictions. This determines the predictions - passed to the loss function. Supported methods are "predict", - "predict_proba", "decision_function", "transform". 
- loss : callable, default=root_mean_squared_error - The function to compute the loss when comparing the perturbed model - to the original model. - n_permutations : int, default=50 - This parameter is relevant only for PFI or CFI. - Specifies the number of times the variable group (residual for CFI) is - permuted. For each permutation, the perturbed model's loss is calculated - and averaged over all permutations. - n_jobs : int, default=1 - The number of parallel jobs to run. Parallelization is done over the - variables or groups of variables. - """ super().__init__() check_is_fitted(estimator) assert n_permutations > 0, "n_permutations must be positive" diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index f45e8d402..42686bc5d 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -258,6 +258,6 @@ def cfi( importances : ndarray of shape (n_features,) Feature importance scores/test statistics. pvalues : ndarray of shape (n_features,) - + P-values for importance scores. """, ) diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index 6a9f2dce4..048be4216 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -176,6 +176,6 @@ def loco( importances : ndarray of shape (n_features,) Feature importance scores/test statistics. pvalues : ndarray of shape (n_features,) - + None because there is no p-value for this method """, ) diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 14f02fcc5..6e25bc38a 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -128,6 +128,6 @@ def pfi( importances : ndarray of shape (n_features,) Feature importance scores/test statistics. pvalues : ndarray of shape (n_features,) - + P-values for importance scores. """, ) From db96bb6e54272d887740cdf03cf9b8e0b688bc2c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 3 Sep 2025 15:31:14 +0200 Subject: [PATCH 10/24] update doctring --- src/hidimstat/base_perturbation.py | 66 ++++++++++++++++++------ src/hidimstat/leave_one_covariate_out.py | 18 +++++++ 2 files changed, 67 insertions(+), 17 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index c2095ae9c..cb6a546d2 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -151,19 +151,38 @@ def importance(self, X, y): Parameters ---------- - X: array-like of shape (n_samples, n_features) - The input samples. - y: array-like of shape (n_samples,) - The target values. + X : array-like of shape (n_samples, n_features) + The input samples to compute importance scores for. + y : array-like of shape (n_samples,) + + importances_ : ndarray of shape (n_groups,) + The importance scores for each group of covariates. + A higher score indicates greater importance of that group. Returns ------- - out_dict: dict - A dictionary containing the following keys: - - 'loss_reference': the loss of the model with the original data. - - 'loss': a dictionary containing the loss of the perturbed model - for each group. - - 'importance': the importance scores for each group. + importances_ : ndarray of shape (n_features,) + Importance scores for each feature. 
+ + Attributes + ---------- + loss_reference_ : float + The loss of the model with the original (non-perturbed) data. + loss_ : dict + Dictionary with indices as keys and arrays of perturbed losses as values. + Contains the loss values for each permutation of each group. + importances_ : ndarray of shape (n_groups,) + The calculated importance scores for each group. + pvalues_ : ndarray of shape (n_groups,) + P-values from one-sided t-test testing if importance scores are + significantly greater than 0. + + Notes + ----- + The importance score for each group is calculated as the mean increase in loss + when that group is perturbed, compared to the reference loss. + A higher importance score indicates that perturbing that group leads to + worse model performance, suggesting those features are more important. """ self._check_fit(X) @@ -208,18 +227,31 @@ def fit_importance( Returns ------- - importances : float - Mean feature importance scores across CV folds. + importances_ : ndarray + Average importance scores for each feature group across CV folds. + + Attributes + ---------- + estimators_cv_ : list + List of fitted estimators for each CV fold. + importances_cv_ : list + List of importance scores for each CV fold. + pvalues_cv_ : list + List of p-values for each CV fold. + loss_cv_ : list + List of loss values for each CV fold. + loss_reference_cv_ : list + List of reference loss values for each CV fold. Notes ----- For each CV fold: - 1. Clones and fits the estimator on training fold - 2. Identifies variable groups on training fold - 3. Computes feature importances on test fold - 4. Returns average importance across all folds + 1. Fits a clone of the base estimator on the training fold + 2. Identifies variable groups on the training fold + 3. Computes feature importances using the test fold + 4. Stores results for each fold in respective cv_ attributes - The importances for each fold are stored in self.importances\_ + Final importances_ and pvalues_ are averaged across all CV folds. """ name_attribute_save = get_generated_attributes(self) for name in name_attribute_save: diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index 048be4216..8b67739cc 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -92,6 +92,24 @@ def fit(self, X, y, groups=None): return self def importance(self, X, y): + """ + Compute the importance scores for each group of covariates. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples to compute importance scores for. + y : array-like of shape (n_samples,) + + importances_ : ndarray of shape (n_groups,) + The importance scores for each group of covariates. + A higher score indicates greater importance of that group. + + Returns + ------- + importances_ : ndarray of shape (n_features,) + Importance scores for each feature. 
+ """ super().importance(X, y) self.pvalues_ = None return self.importances_ From d656f17213246471e040311f93c99c23db5581df Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 3 Sep 2025 15:48:10 +0200 Subject: [PATCH 11/24] fix error --- src/hidimstat/base_perturbation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index cb6a546d2..7fe4eb681 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -269,7 +269,9 @@ def fit_importance( setattr(self, name, None) self.estimators_cv_.append(estimator) self.importances_ = np.mean(self.importances_cv_, axis=0) - self.pvalues_ = np.mean(self.pvalues_cv_, axis=0) + self.pvalues_ = ( + None if self.pvalues_cv_[0] is None else np.mean(self.pvalues_cv_, axis=0) + ) return self.importances_ def _check_fit(self, X): From 0493b6f05446a7a3deaa997f7a18486c5dc7595f Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 3 Sep 2025 16:48:04 +0200 Subject: [PATCH 12/24] fix docstring --- src/hidimstat/base_perturbation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 7fe4eb681..e1d5726e5 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -249,9 +249,9 @@ def fit_importance( 1. Fits a clone of the base estimator on the training fold 2. Identifies variable groups on the training fold 3. Computes feature importances using the test fold - 4. Stores results for each fold in respective cv_ attributes + 4. Stores results for each fold in respective cv\_ attributes - Final importances_ and pvalues_ are averaged across all CV folds. + Final importances\_ and pvalues\_ are averaged across all CV folds. """ name_attribute_save = get_generated_attributes(self) for name in name_attribute_save: From 9c54e1bd26c7cdbbe2f947e58392a3f3a490d037 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 5 Sep 2025 11:52:43 +0200 Subject: [PATCH 13/24] Apply suggestions from code review Co-authored-by: Joseph Paillard --- src/hidimstat/conditional_feature_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 42686bc5d..42a09a813 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -30,10 +30,10 @@ class CFI(BasePerturbation): n_permutations : int, default=50 The number of permutations to perform. For each variable/group of variables, the mean of the losses over the `n_permutations` is computed. - imputation_model_continuous : sklearn compatible estimator, optional + imputation_model_continuous : sklearn compatible estimator, default=RidgeCV() The model used to estimate the conditional distribution of a given continuous variable/group of variables given the others. - imputation_model_categorical : sklearn compatible estimator, optional + imputation_model_categorical : sklearn compatible estimator, default=LogisticRegressionCV() The model used to estimate the conditional distribution of a given categorical variable/group of variables given the others. Binary is considered as a special case of categorical. 
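For reference, the class-based API documented in the hunks above can be exercised as in the sketch below. This snippet is not part of the patch series: the synthetic data, the base estimator and the parameter values are placeholders, and only constructor arguments that appear in these diffs are used.

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

from hidimstat import CFI

X, y = make_regression(n_samples=200, n_features=10, noise=1.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LinearRegression().fit(X_train, y_train)

cfi_vim = CFI(
    model,
    method="predict",
    loss=root_mean_squared_error,
    n_permutations=20,
    # RidgeCV() / LogisticRegressionCV() are the defaults documented above;
    # they can still be overridden explicitly, as done here for the continuous case.
    imputation_model_continuous=RidgeCV(),
    random_state=0,
)
cfi_vim.fit(X_test)  # fits one conditional sampler per feature (or per group)
importances = cfi_vim.importance(X_test, y_test)  # mean loss increase per feature
print(cfi_vim.loss_reference_, importances.shape)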
From 7bf75e4c7600b4ac633282f7d6fa6f3078916e9d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 5 Sep 2025 11:55:04 +0200 Subject: [PATCH 14/24] Update default --- src/hidimstat/conditional_feature_importance.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 42a09a813..4b1e2cdf3 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -3,6 +3,7 @@ from sklearn.base import check_is_fitted, clone, BaseEstimator from sklearn.metrics import root_mean_squared_error from sklearn.model_selection import KFold +from sklearn.linear_model import RidgeCV, LogisticRegressionCV from sklearn.utils.validation import check_random_state from hidimstat.base_perturbation import BasePerturbation @@ -57,8 +58,8 @@ def __init__( method: str = "predict", loss: callable = root_mean_squared_error, n_permutations: int = 50, - imputation_model_continuous=None, - imputation_model_categorical=None, + imputation_model_continuous=RidgeCV(), + imputation_model_categorical=LogisticRegressionCV(), categorical_max_cardinality: int = 10, random_state: int = None, n_jobs: int = 1, From b3cd78a3ea3378e7830ef49478cdd5049347ba18 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 5 Sep 2025 12:47:21 +0200 Subject: [PATCH 15/24] fix tests --- test/test_conditional_feature_importance.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index ef00be5b7..54013e2f0 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -7,6 +7,7 @@ from sklearn.metrics import log_loss from sklearn.model_selection import train_test_split from sklearn.metrics import root_mean_squared_error +from sklearn.linear_model import RidgeCV, LogisticRegressionCV from hidimstat import cfi, CFI, BasePerturbation from hidimstat._utils.exception import InternalError @@ -278,8 +279,8 @@ def test_init(self, data_generator): assert cfi.loss == root_mean_squared_error assert cfi.method == "predict" assert cfi.categorical_max_cardinality == 10 - assert cfi.imputation_model_categorical is None - assert cfi.imputation_model_continuous is None + assert isinstance(cfi.imputation_model_categorical, LogisticRegressionCV) + assert isinstance(cfi.imputation_model_continuous, RidgeCV) def test_fit(self, data_generator): """Test fitting CFI""" From 782549018a88462d4220c2cbbe983028d98f9807 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Mon, 8 Sep 2025 10:59:18 +0200 Subject: [PATCH 16/24] Apply suggestions from code review Co-authored-by: bthirion --- src/hidimstat/base_perturbation.py | 4 ++-- test/test_permutation_feature_importance.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index e1d5726e5..98f3a30cf 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -342,14 +342,14 @@ def _check_fit(self, X): def _check_importance(self): """ - Checks if the loss have been computed. + Checks if the loss has been computed. 
""" super()._check_importance() if ( self.loss_reference_ is None and not hasattr(self, "loss_reference_cv_") ) or (self.loss_ is None and not hasattr(self, "loss_cv_")): raise ValueError( - "The importances need to be called before calling this method" + "The importance method has not yet been called." ) def _joblib_predict_one_group(self, X, group_id, group_key): diff --git a/test/test_permutation_feature_importance.py b/test/test_permutation_feature_importance.py index d38dfe1a2..0d6ea7b24 100644 --- a/test/test_permutation_feature_importance.py +++ b/test/test_permutation_feature_importance.py @@ -107,8 +107,7 @@ def test_permutation_importance_function(): shuffle=False, seed=42, ) - important_features = np.where(beta != 0)[0] - non_important_features = np.where(beta == 0)[0] + important_features = beta != 0 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) @@ -128,5 +127,5 @@ def test_permutation_importance_function(): assert importance.shape == (X.shape[1],) assert ( importance[important_features].mean() - > importance[non_important_features].mean() + > importance[1 - important_features].mean() ) From 084ad245beca4d99d7046da12d1dd771a9a2acb5 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 10:59:33 +0200 Subject: [PATCH 17/24] chnage group by features_groups --- src/hidimstat/base_perturbation.py | 27 ++++++++++++--------- src/hidimstat/leave_one_covariate_out.py | 15 ++++++------ test/test_conditional_feature_importance.py | 2 +- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 98f3a30cf..e88c56c2b 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -73,7 +73,7 @@ def __init__( self.n_permutations = n_permutations self.n_jobs = n_jobs # variable set in fit - self.groups = None + self.features_groups = None # varaible set in importance self.loss_reference_ = None self.loss_ = None @@ -97,24 +97,25 @@ def fit(self, X, y=None, groups=None): """ if groups is None: self._n_groups = X.shape[1] - self.groups = {j: [j] for j in range(self._n_groups)} - self._groups_ids = np.array(list(self.groups.values()), dtype=int) + self.features_groups = {j: [j] for j in range(self._n_groups)} + self._groups_ids = np.array(list(self.features_groups.values()), dtype=int) elif isinstance(groups, dict): self._n_groups = len(groups) - self.groups = groups + self.features_groups = groups if isinstance(X, pd.DataFrame): self._groups_ids = [] - for group_key in self.groups.keys(): + for group_key in self.features_groups.keys(): self._groups_ids.append( [ i for i, col in enumerate(X.columns) - if col in self.groups[group_key] + if col in self.features_groups[group_key] ] ) else: self._groups_ids = [ - np.array(ids, dtype=int) for ids in list(self.groups.values()) + np.array(ids, dtype=int) + for ids in list(self.features_groups.values()) ] else: raise ValueError("groups needs to be a dictionnary") @@ -141,7 +142,7 @@ def predict(self, X): # Parallelize the computation of the importance scores for each group out_list = Parallel(n_jobs=self.n_jobs)( delayed(self._joblib_predict_one_group)(X_, group_id, group_key) - for group_id, group_key in enumerate(self.groups.keys()) + for group_id, group_key in enumerate(self.features_groups.keys()) ) return np.stack(out_list, axis=0) @@ -296,7 +297,11 @@ def _check_fit(self, X): If the number of features in X does not match the total number of features in the grouped variables. 
""" - if self._n_groups is None or self.groups is None or self._groups_ids is None: + if ( + self._n_groups is None + or self.features_groups is None + or self._groups_ids is None + ): raise ValueError( "The class is not fitted. The fit method must be called" " to set variable groups. If no grouping is needed," @@ -313,7 +318,7 @@ def _check_fit(self, X): else: raise ValueError("X should be a pandas dataframe or a numpy array.") number_columns = X.shape[1] - for index_variables in self.groups.values(): + for index_variables in self.features_groups.values(): if type(index_variables[0]) is int or np.issubdtype( type(index_variables[0]), int ): @@ -331,7 +336,7 @@ def _check_fit(self, X): "A problem with indexing has happened during the fit." ) number_unique_feature_in_groups = np.unique( - np.concatenate([values for values in self.groups.values()]) + np.concatenate([values for values in self.features_groups.values()]) ).shape[0] if X.shape[1] != number_unique_feature_in_groups: warnings.warn( diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index 8b67739cc..3bc2ad876 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -87,7 +87,9 @@ def fit(self, X, y, groups=None): # Parallelize the fitting of the covariate estimators self._list_estimators = Parallel(n_jobs=self.n_jobs)( delayed(self._joblib_fit_one_group)(estimator, X, y, key_groups) - for key_groups, estimator in zip(self.groups.keys(), self._list_estimators) + for key_groups, estimator in zip( + self.features_groups.keys(), self._list_estimators + ) ) return self @@ -101,14 +103,11 @@ def importance(self, X, y): The input samples to compute importance scores for. y : array-like of shape (n_samples,) - importances_ : ndarray of shape (n_groups,) - The importance scores for each group of covariates. - A higher score indicates greater importance of that group. - Returns ------- importances_ : ndarray of shape (n_features,) - Importance scores for each feature. + The importance scores for each group of covariates. + A higher score indicates greater importance of that group. """ super().importance(X, y) self.pvalues_ = None @@ -117,9 +116,9 @@ def importance(self, X, y): def _joblib_fit_one_group(self, estimator, X, y, key_groups): """Fit the estimator after removing a group of covariates. 
Used in parallel.""" if isinstance(X, pd.DataFrame): - X_minus_j = X.drop(columns=self.groups[key_groups]) + X_minus_j = X.drop(columns=self.features_groups[key_groups]) else: - X_minus_j = np.delete(X, self.groups[key_groups], axis=1) + X_minus_j = np.delete(X, self.features_groups[key_groups], axis=1) estimator.fit(X_minus_j, y) return estimator diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index 54013e2f0..30a0bda32 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -498,7 +498,7 @@ def test_internal_error(self, data_generator): ], } cfi.fit(X, groups=subgroups, var_type="auto") - cfi.groups["group1"] = [None for i in range(100)] + cfi.features_groups["group1"] = [None for i in range(100)] X = X.to_records(index=False) X = np.array(X, dtype=X.dtype.descr) From 7379ec1d2d991e2198a99820ac611a0f0488513a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 11:01:07 +0200 Subject: [PATCH 18/24] fix format --- src/hidimstat/base_perturbation.py | 4 +--- test/test_permutation_feature_importance.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index e88c56c2b..c19bc607c 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -353,9 +353,7 @@ def _check_importance(self): if ( self.loss_reference_ is None and not hasattr(self, "loss_reference_cv_") ) or (self.loss_ is None and not hasattr(self, "loss_cv_")): - raise ValueError( - "The importance method has not yet been called." - ) + raise ValueError("The importance method has not yet been called.") def _joblib_predict_one_group(self, X, group_id, group_key): """ diff --git a/test/test_permutation_feature_importance.py b/test/test_permutation_feature_importance.py index 0d6ea7b24..1421cab0b 100644 --- a/test/test_permutation_feature_importance.py +++ b/test/test_permutation_feature_importance.py @@ -127,5 +127,5 @@ def test_permutation_importance_function(): assert importance.shape == (X.shape[1],) assert ( importance[important_features].mean() - > importance[1 - important_features].mean() + > importance[1 - important_features].mean() ) From 02ae5ba2ed95f92d10a5b5b10f3ab15a626146f7 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 11:04:10 +0200 Subject: [PATCH 19/24] improve test --- test/test_permutation_feature_importance.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_permutation_feature_importance.py b/test/test_permutation_feature_importance.py index 1421cab0b..42b1e93fe 100644 --- a/test/test_permutation_feature_importance.py +++ b/test/test_permutation_feature_importance.py @@ -126,6 +126,5 @@ def test_permutation_importance_function(): assert importance.shape == (X.shape[1],) assert ( - importance[important_features].mean() - > importance[1 - important_features].mean() + importance[important_features].mean() > importance[~important_features].mean() ) From 1e91c65cc5e5e0ead7ca68f66a0eef93a9b3c66b Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 11:05:02 +0200 Subject: [PATCH 20/24] fix docstring --- src/hidimstat/base_perturbation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index c19bc607c..3c5d0bc06 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -37,7 +37,7 @@ class 
BasePerturbation(BaseVariableImportance): Attributes ---------- - groups : dict + features_groups : dict Mapping of feature groups identified during fit. importances_ : ndarray Computed importance scores for each feature group. From 58a57f8b455806bbab73d3ec074aae761fc24ee4 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 11:09:25 +0200 Subject: [PATCH 21/24] fix test --- test/test_base_perturbation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_base_perturbation.py b/test/test_base_perturbation.py index 9198cf72d..d21cf1958 100644 --- a/test/test_base_perturbation.py +++ b/test/test_base_perturbation.py @@ -22,6 +22,6 @@ def test_chek_importance(): basic_class = BasePerturbation(estimator=estimator) basic_class.importances_ = [] with pytest.raises( - ValueError, match="The importances need to be called before calling this method" + ValueError, match="The importance method has not yet been called." ): basic_class.selection() From c4ea7318eeb33946139a2d177efae9f87a39cf07 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 11 Sep 2025 15:10:51 +0200 Subject: [PATCH 22/24] improve loco --- src/hidimstat/leave_one_covariate_out.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index d5625441b..6deba0c9f 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -59,7 +59,7 @@ def __init__( n_jobs=n_jobs, ) # internal variable - self._list_estimators = [] + self._list_estimators = None def fit(self, X, y, groups=None): """ @@ -136,7 +136,7 @@ def _check_fit(self, X): covariates.""" super()._check_fit(X) check_is_fitted(self.estimator) - if len(self._list_estimators) == 0: + if self._list_estimators is None: raise ValueError("The estimators require to be fit before to use them") for m in self._list_estimators: check_is_fitted(m) From 43d3f997a8bf32ddf599e4cf739797ec7887d643 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 11 Sep 2025 15:45:17 +0200 Subject: [PATCH 23/24] fix computation of pvalues --- src/hidimstat/base_perturbation.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 3c5d0bc06..0b63ef361 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -198,14 +198,12 @@ def importance(self, X, y): list_loss.append(self.loss(y, y_pred_perm)) self.loss_[j] = np.array(list_loss) - self.importances_ = np.array( - [ - np.mean(self.loss_[j]) - self.loss_reference_ - for j in range(self._n_groups) - ] + test_result = np.array( + [self.loss_[j] - self.loss_reference_ for j in range(self._n_groups)] ) + self.importances_ = np.mean(test_result, axis=1) self.pvalues_ = ttest_1samp( - self.importances_, 0.0, axis=0, alternative="greater" + test_result, 0.0, axis=1, alternative="greater" ).pvalue return self.importances_ From 804cab1eb0b7b2e8b67daa6d56d33b3187f4cd1e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 11 Sep 2025 15:52:24 +0200 Subject: [PATCH 24/24] add test for function CFI --- test/test_conditional_feature_importance.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index cbb348b10..7ad123654 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ 
-571,18 +571,28 @@ def test_groups_warning(self, data_generator):
 
 @pytest.mark.parametrize(
     "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
-    [(150, 200, 10, 0.2, 42, 1.0, 1.0, 0.0)],
-    ids=["high level noise"],
+    [(150, 200, 10, 0.0, 42, 1.0, np.inf, 0.0)],
+    ids=["HiDim"],
 )
 @pytest.mark.parametrize("n_permutation, cfi_seed", [(20, 0)], ids=["default_cfi"])
 def test_function_cfi(data_generator, n_permutation, cfi_seed):
     """Test CFI function"""
-    X, y, _, _ = data_generator
-    cfi(
+    X, y, important_features, _ = data_generator
+    selection, importance, pvalue = cfi(
         LinearRegression().fit(X, y),
         X,
         y,
         imputation_model_continuous=LinearRegression(),
         n_permutations=n_permutation,
         random_state=cfi_seed,
+        # TODO add a parameter for selection
     )
+    # check that importance scores are defined for each feature
+    assert importance.shape == (X.shape[1],)
+    # check that important features have the highest importance scores
+    assert np.all([int(i) in important_features for i in np.argsort(importance)[-10:]])
+
+    assert pvalue.shape == (X.shape[1],)
+    assert pvalue[important_features].mean() < pvalue[~important_features].mean()
+    assert selection.shape == (X.shape[1],)
+    assert np.all(selection[important_features])
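The cross-validated entry point documented in PATCH 10, fit_importance, stores per-fold results in the *_cv_ attributes and averages the final scores. The sketch below illustrates that flow; the exact signature of fit_importance is not visible in these diffs, so the call assumes X and y are the only required arguments and every other option keeps its default.

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

from hidimstat import CFI

X, y = make_regression(n_samples=300, n_features=15, noise=1.0, random_state=1)
vim = CFI(LinearRegression().fit(X, y), n_permutations=20, random_state=1)
importances = vim.fit_importance(X, y)

# Per-fold results live in estimators_cv_, importances_cv_, pvalues_cv_, loss_cv_
# and loss_reference_cv_; importances_ and pvalues_ hold the averages over folds
# (pvalues_ stays None for methods such as LOCO that define no p-value, cf. PATCH 11).
print(len(vim.importances_cv_), importances.shape)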
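PATCH 23 changes how the p-values are obtained: rather than running a single t-test over the vector of averaged importances, the per-permutation loss increases of each group are tested against zero with a one-sided one-sample t-test. The stand-alone sketch below mirrors that computation on made-up numbers (it is not part of the patch).

import numpy as np
from scipy.stats import ttest_1samp

rng = np.random.default_rng(0)
loss_reference = 1.0
# Perturbed losses for two groups over 50 permutations: perturbing the first
# group clearly degrades the loss, perturbing the second one does not.
perturbed_losses = loss_reference + rng.normal(
    loc=[[0.5], [0.0]], scale=0.1, size=(2, 50)
)

loss_increase = perturbed_losses - loss_reference  # "test_result" in the patch
importances = loss_increase.mean(axis=1)           # becomes importances_
pvalues = ttest_1samp(loss_increase, 0.0, axis=1, alternative="greater").pvalue
print(importances, pvalues)  # small p-value for the first group only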