diff --git a/docs/src/api.rst b/docs/src/api.rst index dd6d6d2b9..86ef19945 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -37,11 +37,14 @@ Feature Importance functions .. autosummary:: :toctree: ./generated/api/class/ :template: function.rst - + + cfi_analysis clustered_inference clustered_inference_pvalue ensemble_clustered_inference ensemble_clustered_inference_pvalue + loco_analysis + pfi_analysis Visualization ============= @@ -60,8 +63,8 @@ Samplers :toctree: ./generated/api/class/ :template: class.rst - ~statistical_tools.ConditionalSampler - ~statistical_tools.GaussianKnockoffs + ~samplers.ConditionalSampler + ~samplers.GaussianKnockoffs Helper Functions ================ diff --git a/examples/plot_conditional_vs_marginal_xor_data.py b/examples/plot_conditional_vs_marginal_xor_data.py index 859d9df9e..c45dd0c7e 100644 --- a/examples/plot_conditional_vs_marginal_xor_data.py +++ b/examples/plot_conditional_vs_marginal_xor_data.py @@ -131,7 +131,7 @@ random_state=0, ) vim.fit(X_train, y_train) - importances.append(vim.importance(X_test, y_test)["importance"]) + importances.append(vim.importance(X_test, y_test)) importances = np.array(importances).T diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index aa36b4c05..9a03da1a9 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -63,7 +63,7 @@ import numpy as np from sklearn.base import clone from sklearn.linear_model import LogisticRegressionCV, RidgeCV -from sklearn.metrics import r2_score, root_mean_squared_error +from sklearn.metrics import mean_squared_error, r2_score from sklearn.model_selection import KFold n_folds = 5 @@ -78,7 +78,7 @@ score = r2_score( y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index]) ) - mse = root_mean_squared_error( + mse = mean_squared_error( y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index]) ) @@ -166,14 +166,14 @@ import pandas as pd from scipy.stats import ttest_1samp -cfi_vim_arr = np.array([x["importance"] for x in cfi_importance_list]) / 2 +cfi_vim_arr = np.array(cfi_importance_list) / 2 cfi_pval = ttest_1samp(cfi_vim_arr, 0, alternative="greater").pvalue vim = [ pd.DataFrame( { "var": np.arange(cfi_vim_arr.shape[1]), - "importance": x["importance"], + "importance": x, "fold": i, "pval": cfi_pval, "method": "CFI", @@ -182,14 +182,14 @@ for x in cfi_importance_list ] -loco_vim_arr = np.array([x["importance"] for x in loco_importance_list]) +loco_vim_arr = np.array(loco_importance_list) loco_pval = ttest_1samp(loco_vim_arr, 0, alternative="greater").pvalue vim += [ pd.DataFrame( { "var": np.arange(loco_vim_arr.shape[1]), - "importance": x["importance"], + "importance": x, "fold": i, "pval": loco_pval, "method": "LOCO", @@ -198,14 +198,14 @@ for x in loco_importance_list ] -pfi_vim_arr = np.array([x["importance"] for x in pfi_importance_list]) +pfi_vim_arr = np.array(pfi_importance_list) pfi_pval = ttest_1samp(pfi_vim_arr, 0, alternative="greater").pvalue vim += [ pd.DataFrame( { "var": np.arange(pfi_vim_arr.shape[1]), - "importance": x["importance"], + "importance": x, "fold": i, "pval": pfi_pval, "method": "PFI", diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index b1d87ff86..e17e19d1e 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -107,7 +107,7 @@ def run_one_fold( ) vim.fit(X[train_index], y[train_index]) - importance = vim.importance(X[test_index], y[test_index])["importance"] + importance = vim.importance(X[test_index], y[test_index]) return pd.DataFrame( { diff --git a/examples/plot_knockoffs_wisconsin.py b/examples/plot_knockoffs_wisconsin.py index ad1a85c57..115e98e05 100644 --- a/examples/plot_knockoffs_wisconsin.py +++ b/examples/plot_knockoffs_wisconsin.py @@ -147,7 +147,7 @@ from sklearn.covariance import LedoitWolf from hidimstat import ModelXKnockoff -from hidimstat.statistical_tools.gaussian_knockoffs import GaussianKnockoffs +from hidimstat.samplers import GaussianKnockoffs model_x_knockoff = ModelXKnockoff( ko_generator=GaussianKnockoffs( diff --git a/examples/plot_loco.py b/examples/plot_loco.py index 3ac3c7d8c..dbd9e62a5 100644 --- a/examples/plot_loco.py +++ b/examples/plot_loco.py @@ -82,7 +82,7 @@ # importance. This process is repeated for all features to assess their individual # contributions. loco.fit(X_train, y_train) - importances = loco.importance(X_test, y_test)["importance"] + importances = loco.importance(X_test, y_test) df_list.append( pd.DataFrame( { diff --git a/examples/plot_model_agnostic_importance.py b/examples/plot_model_agnostic_importance.py index 64487452f..5be96ad29 100644 --- a/examples/plot_model_agnostic_importance.py +++ b/examples/plot_model_agnostic_importance.py @@ -134,12 +134,8 @@ vim_linear.fit(X[train], y[train]) vim_non_linear.fit(X[train], y[train]) - importances_linear.append( - vim_linear.importance(X[test], y[test])["importance"], - ) - importances_non_linear.append( - vim_non_linear.importance(X[test], y[test])["importance"] - ) + importances_linear.append(vim_linear.importance(X[test], y[test])) + importances_non_linear.append(vim_non_linear.importance(X[test], y[test])) # %% diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index a4e0ed310..00185f905 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -144,7 +144,7 @@ ) pfi.fit(X_test, y_test) - permutation_importances.append(pfi.importance(X_test, y_test)["importance"]) + permutation_importances.append(pfi.importance(X_test, y_test)) permutation_importances = np.stack(permutation_importances) pval_pfi = ttest_1samp( permutation_importances, 0.0, axis=0, alternative="greater" @@ -216,7 +216,7 @@ ) cfi.fit(X_test, y_test) - conditional_importances.append(cfi.importance(X_test, y_test)["importance"]) + conditional_importances.append(cfi.importance(X_test, y_test)) cfi_pval = ttest_1samp( @@ -267,7 +267,7 @@ from matplotlib.lines import Line2D -from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler +from hidimstat.samplers.conditional_sampling import ConditionalSampler X_train, X_test = train_test_split( X, diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index 3cb37dadd..edebc8d18 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -1,4 +1,4 @@ -from .conditional_feature_importance import CFI +from .conditional_feature_importance import CFI, cfi_analysis from .desparsified_lasso import DesparsifiedLasso, desparsified_lasso, reid from .distilled_conditional_randomization_test import D0CRT, d0crt from .ensemble_clustered_inference import ( @@ -8,8 +8,8 @@ ensemble_clustered_inference_pvalue, ) from .knockoffs import ModelXKnockoff -from .leave_one_covariate_out import LOCO -from .permutation_feature_importance import PFI +from .leave_one_covariate_out import LOCO, loco_analysis +from .permutation_feature_importance import PFI, pfi_analysis from .statistical_tools.aggregation import quantile_aggregation try: @@ -30,6 +30,9 @@ "reid", "ModelXKnockoff", "CFI", + "cfi_analysis", "LOCO", + "loco_analysis", "PFI", + "pfi_analysis", ] diff --git a/src/hidimstat/_utils/utils.py b/src/hidimstat/_utils/utils.py index 228a3bdfb..459afb0cb 100644 --- a/src/hidimstat/_utils/utils.py +++ b/src/hidimstat/_utils/utils.py @@ -1,7 +1,11 @@ import numbers +from functools import partial import numpy as np from numpy.random import RandomState +from scipy.stats import ttest_1samp, wilcoxon + +from hidimstat.statistical_tools.nadeau_bengio_ttest import nadeau_bengio_ttest def _check_vim_predict_method(method): @@ -33,6 +37,37 @@ def _check_vim_predict_method(method): ) +def get_fitted_attributes(cls): + """ + Get all attributes from a class that end with a single underscore + and doesn't start with one underscore. + + Parameters + ---------- + cls : class + The class to inspect for attributes. + + Returns + ------- + list + A list of attribute names that end with a single underscore but not double underscore. + """ + # Get all attributes and methods of the class + all_attributes = dir(cls) + + # Filter out attributes that start with an underscore + filtered_attributes = [attr for attr in all_attributes if not attr.startswith("_")] + + # Filter out attributes that do not end with a single underscore + result = [ + attr + for attr in filtered_attributes + if attr.endswith("_") and not attr.endswith("__") + ] + + return result + + def check_random_state(seed): """ Modified version of sklearn's check_random_state using np.random.Generator. @@ -105,3 +140,56 @@ def seed_estimator(estimator, random_state=None): setattr(value, "random_state", RandomState(rng.bit_generator)) return estimator + + +def check_statistical_test(statistical_test, test_frac=None): + """ + Validates and returns a test statistic function. + + Parameters + ---------- + statisticcal_test : str or callable + If str, must be either 'ttest' or 'wilcoxon'. + If callable, must be a function that can be used as a test statistic. + test_frac : float, optional + The fraction of data used for testing in the Nadeau-Bengio t-test. + + Returns + ------- + callable + A function that can be used as a test statistic. + For string inputs, returns a partial function of either ttest_1samp or wilcoxon. + For callable inputs, returns the input function. + + Raises + ------ + ValueError + If test is a string but not one of the supported test names ('ttest' or 'wilcoxon'). + ValueError + If test is neither a string nor a callable. + """ + if isinstance(statistical_test, str): + if statistical_test == "ttest": + return partial(ttest_1samp, popmean=0, alternative="greater", axis=1) + elif statistical_test == "wilcoxon": + return partial(wilcoxon, alternative="greater", axis=1) + elif statistical_test == "nb-ttest": + return partial( + nadeau_bengio_ttest, + popmean=0, + test_frac=test_frac, + alternative="greater", + axis=1, + ) + else: + raise ValueError(f"the test '{statistical_test}' is not supported") + elif callable(statistical_test): + return statistical_test + else: + raise ValueError( + f"Unsupported value for 'statistical_test'." + f"The provided argument was '{statistical_test}'. " + f"Please choose from the following valid options: " + f"string values ('ttest', 'wilcoxon', 'nb-ttest') " + f"or a custom callable function with a `scipy.stats` API-compatible signature." + ) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 11d3d7aac..35b49bb29 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -1,9 +1,13 @@ import numpy as np from joblib import Parallel, delayed from sklearn.base import check_is_fitted -from sklearn.metrics import root_mean_squared_error +from sklearn.metrics import mean_squared_error -from hidimstat._utils.utils import _check_vim_predict_method, check_random_state +from hidimstat._utils.utils import ( + _check_vim_predict_method, + check_random_state, + check_statistical_test, +) from hidimstat.base_variable_importance import ( BaseVariableImportance, GroupVariableImportanceMixin, @@ -11,56 +15,80 @@ class BasePerturbation(BaseVariableImportance, GroupVariableImportanceMixin): + """ + Abstract base class for model-agnostic variable importance measures using + perturbation techniques. + + Parameters + ---------- + estimator : sklearn-compatible estimator + The fitted estimator used for predictions. + method : str, default="predict" + The method used for making predictions. This determines the predictions + passed to the loss function. Supported methods are "predict", + "predict_proba", "decision_function", "transform". + loss : callable, default=mean_squared_error + The function to compute the loss when comparing the perturbed model + to the original model. + n_permutations : int, default=50 + Number of permutations for each feature group. + statistical_test : callable or str, default="nb-ttest" + Statistical test function for computing p-values from importance scores. + features_groups : dict or None, default=None + Mapping of group names to lists of feature indices or names. If None, groups are inferred. + n_jobs : int, default=1 + Number of parallel jobs for computation. + random_state : int or None, default=None + Seed for reproducible permutations. + + Attributes + ---------- + features_groups : dict + Mapping of feature groups identified during fit. + importances_ : ndarray (n_groups,) + Importance scores for each feature group. + loss_reference_ : float + Loss on original (non-perturbed) data. + loss_ : dict + Loss values for each permutation of each group. + pvalues_ : ndarray of shape (n_groups,) + P-values for importance scores. + + Notes + ----- + This class is abstract. Subclasses must implement the `_permutation` method + to define how feature groups are perturbed. + """ + def __init__( self, estimator, - loss: callable = root_mean_squared_error, - n_permutations: int = 50, method: str = "predict", + loss: callable = mean_squared_error, + n_permutations: int = 50, + statistical_test="ttest", features_groups=None, n_jobs: int = 1, random_state=None, ): - """ - Base class for model-agnostic variable importance measures based on - perturbation. - - Parameters - ---------- - estimator : sklearn compatible estimator, optional - The estimator to use for the prediction. - loss : callable, default=root_mean_squared_error - The function to compute the loss when comparing the perturbed model - to the original model. - n_permutations : int, default=50 - This parameter is relevant only for PFI or CFI. - Specifies the number of times the variable group (residual for CFI) is - permuted. For each permutation, the perturbed model's loss is calculated - and averaged over all permutations. - method : str, default="predict" - The method used for making predictions. This determines the predictions - passed to the loss function. Supported methods are "predict", - "predict_proba", "decision_function", "transform". - features_groups: dict or None, default=None - A dictionary where the keys are the group names and the values are the - list of column names corresponding to each features group. If None, - the features_groups are identified based on the columns of X. - n_jobs : int, default=1 - The number of parallel jobs to run. Parallelization is done over the - variables or groups of variables. - random_state : int, default=None - The random state to use for sampling. - """ super().__init__() + GroupVariableImportanceMixin.__init__(self, features_groups=features_groups) check_is_fitted(estimator) assert n_permutations > 0, "n_permutations must be positive" self.estimator = estimator self.loss = loss _check_vim_predict_method(method) self.method = method - self.n_jobs = n_jobs self.n_permutations = n_permutations - GroupVariableImportanceMixin.__init__(self, features_groups=features_groups) + self.statistical_test = statistical_test + self.n_jobs = n_jobs + + # variable set in importance + self.loss_reference_ = None + self.loss_ = None + # internal variables + self._n_groups = None + self._groups_ids = None self.random_state = random_state def fit(self, X, y=None): @@ -95,7 +123,7 @@ def _check_compatibility(self, X): """Check compatibility between input data and fitted model.""" GroupVariableImportanceMixin._check_compatibility(self, X) - def predict(self, X): + def _predict(self, X): """ Compute the predictions after perturbation of the data for each group of variables. @@ -110,8 +138,6 @@ def predict(self, X): out: array-like of shape (n_groups, n_permutations, n_samples) The predictions after perturbation of the data for each group of variables. """ - self._check_fit() - self._check_compatibility(X) X_ = np.asarray(X) rng = check_random_state(self.random_state) @@ -132,45 +158,100 @@ def importance(self, X, y): Parameters ---------- - X: array-like of shape (n_samples, n_features) - The input samples. - y: array-like of shape (n_samples,) - The target values. + X : array-like of shape (n_samples, n_features) + The input samples to compute importance scores for. + y : array-like of shape (n_samples,) + + importances_ : ndarray of shape (n_groups,) + The importance scores for each group of covariates. + A higher score indicates greater importance of that group. Returns ------- - out_dict: dict - A dictionary containing the following keys: - - 'loss_reference': the loss of the model with the original data. - - 'loss': a dictionary containing the loss of the perturbed model - for each group. - - 'importance': the importance scores for each group. - """ - GroupVariableImportanceMixin._check_fit(self) - GroupVariableImportanceMixin._check_compatibility(self, X) + importances_ : ndarray of shape (n_features,) + Importance scores for each feature. + + Attributes + ---------- + loss_reference_ : float + The loss of the model with the original (non-perturbed) data. + loss_ : dict + Dictionary with indices as keys and arrays of perturbed losses as values. + Contains the loss values for each permutation of each group. + importances_ : ndarray of shape (n_groups,) + The calculated importance scores for each group. + pvalues_ : ndarray of shape (n_groups,) + P-values from one-sample t-test testing if importance scores are + significantly greater than 0. - out_dict = dict() + Notes + ----- + The importance score for each group is calculated as the mean increase in loss + when that group is perturbed, compared to the reference loss. + A higher importance score indicates that perturbing that group leads to + worse model performance, suggesting those features are more important. + """ + self._check_fit() + self._check_compatibility(X) + statistical_test = check_statistical_test(self.statistical_test) y_pred = getattr(self.estimator, self.method)(X) - loss_reference = self.loss(y, y_pred) - out_dict["loss_reference"] = loss_reference + self.loss_reference_ = self.loss(y, y_pred) - y_pred = self.predict(X) - out_dict["loss"] = dict() + y_pred = self._predict(X) + self.loss_ = dict() for j, y_pred_j in enumerate(y_pred): list_loss = [] for y_pred_perm in y_pred_j: list_loss.append(self.loss(y, y_pred_perm)) - out_dict["loss"][j] = np.array(list_loss) + self.loss_[j] = np.array(list_loss) - out_dict["importance"] = np.array( + test_result = np.array( [ - np.mean(out_dict["loss"][j]) - loss_reference + self.loss_[j] - self.loss_reference_ for j in range(self.n_features_groups_) ] ) - self.importances_ = out_dict["importance"] - return out_dict + self.importances_ = np.mean(test_result, axis=1) + self.pvalues_ = statistical_test(test_result).pvalue + assert ( + self.pvalues_.shape[0] == y_pred.shape[0] + ), "The statistical test doesn't provide the correct dimension." + return self.importances_ + + def fit_importance(self, X, y): + """ + Fits the model to the data and computes feature importance scores. + Convenience method that combines fit() and importance() into a single call. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + importances_ : ndarray of shape (n_groups,) + The calculated importance scores for each feature group. + Higher values indicate greater importance. + + Notes + ----- + This method first calls fit() to identify feature groups, then calls + importance() to compute the importance scores for each group. + """ + self.fit(X, y) + return self.importance(X, y) + + def _check_importance(self): + """ + Checks if the loss has been computed. + """ + super()._check_importance() + if (self.loss_reference_ is None) or (self.loss_ is None): + raise ValueError("The importance method has not yet been called.") def _joblib_predict_one_features_group( self, X, features_group_id, features_group_key, random_state=None diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index e7b9a7e25..7d44ff149 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -409,7 +409,6 @@ class GroupVariableImportanceMixin: """ def __init__(self, features_groups=None): - super().__init__() self.features_groups = features_groups self.n_features_groups_ = None self._features_groups_ids = None diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 626f7c12c..a2fa2072e 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -1,80 +1,90 @@ +from functools import partial + import numpy as np from joblib import Parallel, delayed +from scipy.stats import wilcoxon from sklearn.base import BaseEstimator, check_is_fitted, clone -from sklearn.metrics import root_mean_squared_error +from sklearn.linear_model import LogisticRegressionCV, RidgeCV +from sklearn.metrics import mean_squared_error +from hidimstat._utils.docstring import _aggregate_docstring from hidimstat.base_perturbation import BasePerturbation -from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler +from hidimstat.samplers.conditional_sampling import ConditionalSampler class CFI(BasePerturbation): + """ + Conditional Feature Importance (CFI) algorithm. + :footcite:t:`Chamma_NeurIPS2023` and for group-level see + :footcite:t:`Chamma_AAAI2024`. + + Parameters + ---------- + estimator : sklearn compatible estimator + The estimator to use for the prediction. + method : str, default="predict" + The method to use for the prediction. This determines the predictions passed + to the loss function. Supported methods are "predict", "predict_proba" or + "decision_function". + loss : callable, default=mean_squared_error + The loss function to use when comparing the perturbed model to the full + model. + n_permutations : int, default=50 + The number of permutations to perform. For each variable/group of variables, + the mean of the losses over the `n_permutations` is computed. + imputation_model_continuous : sklearn compatible estimator, default=RidgeCV() + The model used to estimate the conditional distribution of a given + continuous variable/group of variables given the others. + imputation_model_categorical : sklearn compatible estimator, default=LogisticRegressionCV() + The model used to estimate the conditional distribution of a given + categorical variable/group of variables given the others. Binary is + considered as a special case of categorical. + features_groups: dict or None, default=None + A dictionary where the keys are the group names and the values are the + list of column names corresponding to each features group. If None, + the features_groups are identified based on the columns of X. + feature_types: str or list, default="auto" + The feature type. Supported types include "auto", "continuous", and + "categorical". If "auto", the type is inferred from the cardinality + of the unique values passed to the `fit` method. + categorical_max_cardinality : int, default=10 + The maximum cardinality of a variable to be considered as categorical + when the variable type is inferred (set to "auto" or not provided). + statistical_test : callable or str, default="ttest" + Statistical test function for computing p-values of importance scores. + random_state : int or None, default=None + The random state to use for sampling. + n_jobs : int, default=1 + The number of jobs to run in parallel. Parallelization is done over the + variables or groups of variables. + + References + ---------- + .. footbibliography:: + """ + def __init__( self, estimator, - loss: callable = root_mean_squared_error, method: str = "predict", + loss: callable = mean_squared_error, n_permutations: int = 50, - imputation_model_continuous=None, - imputation_model_categorical=None, + imputation_model_continuous=RidgeCV(), + imputation_model_categorical=LogisticRegressionCV(), features_groups=None, feature_types="auto", categorical_max_cardinality: int = 10, - n_jobs: int = 1, + statistical_test="ttest", random_state: int = None, + n_jobs: int = 1, ): - """ - Conditional Feature Importance (CFI) algorithm. - :footcite:t:`Chamma_NeurIPS2023` and for group-level see - :footcite:t:`Chamma_AAAI2024`. - - Parameters - ---------- - estimator : sklearn compatible estimator, optional - The estimator to use for the prediction. - loss : callable, default=root_mean_squared_error - The loss function to use when comparing the perturbed model to the full - model. - method : str, default="predict" - The method to use for the prediction. This determines the predictions passed - to the loss function. Supported methods are "predict", "predict_proba" or - "decision_function". - n_permutations : int, default=50 - The number of permutations to perform. For each feature/group of features, - the mean of the losses over the `n_permutations` is computed. - imputation_model_continuous : sklearn compatible estimator, optional - The model used to estimate the conditional distribution of a given - continuous features/group of features given the others. - imputation_model_categorical : sklearn compatible estimator, optional - The model used to estimate the conditional distribution of a given - categorical features/group of features given the others. Binary is - considered as a special case of categorical. - categorical_max_cardinality : int, default=10 - The maximum cardinality of a feature to be considered as categorical - when the feature type is inferred (set to "auto" or not provided). - features_groups: dict or None, default=None - A dictionary where the keys are the group names and the values are the - list of column names corresponding to each features group. If None, - the features_groups are identified based on the columns of X. - feature_types: str or list, default="auto" - The feature type. Supported types include "auto", "continuous", and - "categorical". If "auto", the type is inferred from the cardinality - of the unique values passed to the `fit` method. - random_state : int, default=None - The random state to use for sampling. - n_jobs : int, default=1 - The number of jobs to run in parallel. Parallelization is done over the - features or groups of features. - - References - ---------- - .. footbibliography:: - """ super().__init__( estimator=estimator, - loss=loss, method=method, - n_jobs=n_jobs, + loss=loss, n_permutations=n_permutations, + statistical_test=statistical_test, + n_jobs=n_jobs, features_groups=features_groups, random_state=random_state, ) @@ -94,7 +104,8 @@ def __init__( self.imputation_model_continuous = imputation_model_continuous def fit(self, X, y=None): - """Fit the imputation models. + """ + Fit the imputation models. Parameters ---------- @@ -151,6 +162,32 @@ def fit(self, X, y=None): return self + def fit_importance(self, X, y): + """ + Fits the model to the data and computes feature importance scores. + Convenience method that combines fit() and importance() into a single call. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + importances_ : ndarray of shape (n_groups,) + The calculated importance scores for each feature group. + Higher values indicate greater importance. + + Notes + ----- + This method first calls fit() to identify feature groups, then calls + importance() to compute the importance scores for each group. + """ + self.fit(X, y) + return self.importance(X, y) + def _joblib_fit_one_features_group(self, estimator, X, features_groups_ids): """Fit a single imputation model, for a single group of features. This method is parallelized.""" @@ -187,3 +224,68 @@ def _permutation(self, X, features_group_id, random_state=None): return self._list_imputation_models[features_group_id].sample( X_minus_j, X_j, n_samples=self.n_permutations, random_state=random_state ) + + +def cfi_analysis( + estimator, + X, + y, + method: str = "predict", + loss: callable = mean_squared_error, + n_permutations: int = 50, + imputation_model_continuous=RidgeCV(), + imputation_model_categorical=LogisticRegressionCV(), + features_groups=None, + feature_types="auto", + categorical_max_cardinality: int = 10, + test_statistic="ttest", + k_best=None, + percentile=None, + threshold_max=None, + threshold_min=None, + random_state: int = None, + n_jobs: int = 1, +): + methods = CFI( + estimator=estimator, + method=method, + loss=loss, + n_permutations=n_permutations, + imputation_model_continuous=imputation_model_continuous, + imputation_model_categorical=imputation_model_categorical, + features_groups=features_groups, + feature_types=feature_types, + categorical_max_cardinality=categorical_max_cardinality, + statistical_test=test_statistic, + random_state=random_state, + n_jobs=n_jobs, + ) + methods.fit_importance(X, y) + selection = methods.importance_selection( + k_best=k_best, + percentile=percentile, + threshold_max=threshold_max, + threshold_min=threshold_min, + ) + return selection, methods.importances_, methods.pvalues_ + + +# use the docstring of the class for the function +cfi_analysis.__doc__ = _aggregate_docstring( + [ + CFI.__doc__, + CFI.__init__.__doc__, + CFI.fit_importance.__doc__, + CFI.importance_selection.__doc__, + ], + """ + Returns + ------- + selection : ndarray of shape (n_features,) + Boolean array indicating selected features (True = selected) + importances : ndarray of shape (n_features,) + Feature importance scores/test statistics. + pvalues : ndarray of shape (n_features,) + P-values for importance scores. + """, +) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 249813833..4a04b8039 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -11,8 +11,8 @@ from hidimstat._utils.docstring import _aggregate_docstring from hidimstat._utils.utils import check_random_state, seed_estimator from hidimstat.base_variable_importance import BaseVariableImportance +from hidimstat.samplers import GaussianKnockoffs from hidimstat.statistical_tools.aggregation import quantile_aggregation -from hidimstat.statistical_tools.gaussian_knockoffs import GaussianKnockoffs from hidimstat.statistical_tools.multiple_testing import fdr_threshold diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index d84d6e5a3..dc48f3bfd 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -1,68 +1,82 @@ +from functools import partial + import numpy as np import pandas as pd from joblib import Parallel, delayed +from scipy.stats import wilcoxon from sklearn.base import check_is_fitted, clone -from sklearn.metrics import root_mean_squared_error +from sklearn.metrics import mean_squared_error +from hidimstat._utils.docstring import _aggregate_docstring +from hidimstat._utils.utils import check_statistical_test from hidimstat.base_perturbation import BasePerturbation +from hidimstat.base_variable_importance import GroupVariableImportanceMixin class LOCO(BasePerturbation): + """ + Leave-One-Covariate-Out (LOCO) algorithm + + This method is presented in :footcite:t:`lei2018distribution` and :footcite:t:`verdinelli2024feature`. + The model is re-fitted for each feature/group of features. The importance is + then computed as the difference between the loss of the full model and the loss + of the model without the feature/group. + + Parameters + ---------- + estimator : sklearn compatible estimator + The estimator to use for the prediction. + method : str, default="predict" + The method to use for the prediction. This determines the predictions passed + to the loss function. Supported methods are "predict", "predict_proba" or + "decision_function". + loss : callable, default=mean_squared_error + The loss function to use when comparing the perturbed model to the full + model. + statistical_test : callable or str, default="ttest" + Statistical test function for computing p-values of importance scores. + features_groups: dict or None, default=None + A dictionary where the keys are the group names and the values are the + list of column names corresponding to each features group. If None, + the features_groups are identified based on the columns of X. + n_jobs : int, default=1 + The number of jobs to run in parallel. Parallelization is done over the + variables or groups of variables. + + Notes + ----- + :footcite:t:`Williamson_General_2023` also presented a LOCO method with an + additional data splitting strategy. + + References + ---------- + .. footbibliography:: + """ + def __init__( self, estimator, - loss: callable = root_mean_squared_error, method: str = "predict", + loss: callable = mean_squared_error, + statistical_test="ttest", features_groups=None, n_jobs: int = 1, ): - """ - Leave-One-Covariate-Out (LOCO) as presented in - :footcite:t:`lei2018distribution` and :footcite:t:`verdinelli2024feature`. - The model is re-fitted for each feature/group of features. The importance is - then computed as the difference between the loss of the full model and the loss - of the model without the feature/group. - - Parameters - ---------- - estimator : sklearn compatible estimator - The estimator to use for the prediction. - loss : callable, default=root_mean_squared_error - The loss function to use when comparing the perturbed model to the full - model. - method : str, default="predict" - The method to use for the prediction. This determines the predictions passed - to the loss function. Supported methods are "predict", "predict_proba" or - "decision_function". - features_groups: dict or None, default=None - A dictionary where the keys are the group names and the values are the - list of column names corresponding to each features group. If None, - the features_groups are identified based on the columns of X. - n_jobs : int, default=1 - The number of jobs to run in parallel. Parallelization is done over the - features or groups of features. - - Notes - ----- - :footcite:t:`Williamson_General_2023` also presented a LOCO method with an - additional data splitting strategy. - - References - ---------- - .. footbibliography:: - """ super().__init__( estimator=estimator, - loss=loss, method=method, - n_jobs=n_jobs, + loss=loss, n_permutations=1, + statistical_test=statistical_test, features_groups=features_groups, + n_jobs=n_jobs, ) - self._list_estimators = [] + # internal variable + self._list_estimators = None def fit(self, X, y): - """Fit a model after removing each covariate/group of covariates. + """ + Fit a model after removing each covariate/group of covariates. Parameters ---------- @@ -93,6 +107,75 @@ def fit(self, X, y): ) return self + def importance(self, X, y): + """ + Compute the importance scores for each group of covariates. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples to compute importance scores for. + y : array-like of shape (n_samples,) + + importances_ : ndarray of shape (n_groups,) + The importance scores for each group of covariates. + A higher score indicates greater importance of that group. + + Returns + ------- + importances_ : ndarray of shape (n_features,) + Importance scores for each feature. + + Attributes + ---------- + loss_reference_ : float + The loss of the model with the original (non-perturbed) data. + loss_ : dict + Dictionary with indices as keys and arrays of perturbed losses as values. + Contains the loss values for each permutation of each group. + importances_ : ndarray of shape (n_groups,) + The calculated importance scores for each group. + pvalues_ : ndarray of shape (n_groups,) + P-values from one-sided t-test testing if importance scores are + significantly greater than 0. + + Notes + ----- + The importance score for each group is calculated as the mean increase in loss + when that group is perturbed, compared to the reference loss. + A higher importance score indicates that perturbing that group leads to + worse model performance, suggesting those features are more important. + """ + self._check_fit() + self._check_compatibility(X) + statistical_test = check_statistical_test(self.statistical_test) + + y_pred = getattr(self.estimator, self.method)(X) + self.loss_reference_ = self.loss(y, y_pred) + + y_pred = self._predict(X) + test_result = [] + self.loss_ = dict() + for j, y_pred_j in enumerate(y_pred): + self.loss_[j] = np.array([self.loss(y, y_pred_j[0])]) + if np.all(np.equal(y.shape, y_pred_j[0].shape)): + test_result.append(y - y_pred_j[0]) + else: + test_result.append(y - np.unique(y)[np.argmax(y_pred_j[0], axis=-1)]) + + self.importances_ = np.mean( + [ + self.loss_[j] - self.loss_reference_ + for j in range(self.n_features_groups_) + ], + axis=1, + ) + self.pvalues_ = statistical_test(np.array(test_result)).pvalue + assert ( + self.pvalues_.shape[0] == y_pred.shape[0] + ), "The statistical test doesn't provide the correct dimension." + return self.importances_ + def _joblib_fit_one_features_group(self, estimator, X, y, key_features_group): """Fit the estimator after removing a group of covariates. Used in parallel.""" if isinstance(X, pd.DataFrame): @@ -103,7 +186,7 @@ def _joblib_fit_one_features_group(self, estimator, X, y, key_features_group): return estimator def _joblib_predict_one_features_group( - self, X, features_group_id, key_features_group, random_state=None + self, X, features_group_id, features_group_key, random_state=None ): """Predict the target feature after removing a group of covariates. Used in parallel.""" @@ -120,7 +203,60 @@ def _check_fit(self): covariates.""" super()._check_fit() check_is_fitted(self.estimator) - if len(self._list_estimators) == 0: + if self._list_estimators is None: raise ValueError("The estimators require to be fit before to use them") for m in self._list_estimators: check_is_fitted(m) + + +def loco_analysis( + estimator, + X, + y, + method: str = "predict", + loss: callable = mean_squared_error, + features_groups=None, + test_statistic="ttest", + k_best=None, + percentile=None, + threshold_min=None, + threshold_max=None, + n_jobs: int = 1, +): + method = LOCO( + estimator=estimator, + method=method, + loss=loss, + statistical_test=test_statistic, + features_groups=features_groups, + n_jobs=n_jobs, + ) + method.fit_importance(X, y) + selection = method.importance_selection( + k_best=k_best, + percentile=percentile, + threshold_min=threshold_min, + threshold_max=threshold_max, + ) + return selection, method.importances_, method.pvalues_ + + +# use the docstring of the class for the function +loco_analysis.__doc__ = _aggregate_docstring( + [ + LOCO.__doc__, + LOCO.__init__.__doc__, + LOCO.fit_importance.__doc__, + LOCO.importance_selection.__doc__, + ], + """ + Returns + ------- + selection : ndarray of shape (n_features,) + Boolean array indicating selected features (True = selected) + importances : ndarray of shape (n_features,) + Feature importance scores/test statistics. + pvalues : ndarray of shape (n_features,) + None because there is no p-value for this method + """, +) diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 4a0dfc825..bb88927d4 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -1,64 +1,75 @@ +from functools import partial + import numpy as np -from sklearn.metrics import root_mean_squared_error +from scipy.stats import wilcoxon +from sklearn.metrics import mean_squared_error +from hidimstat._utils.docstring import _aggregate_docstring from hidimstat._utils.utils import check_random_state from hidimstat.base_perturbation import BasePerturbation class PFI(BasePerturbation): + """ + Permutation Feature Importance algorithm + + This as presented in :footcite:t:`breimanRandomForests2001`. + For each variable/group of variables, the importance is computed as the + difference between the loss of the initial model and the loss of the model + with the variable/group permuted. + The method was also used in :footcite:t:`mi2021permutation` + + Parameters + ---------- + estimator : sklearn compatible estimator + The estimator to use for the prediction. + method : str, default="predict" + The method to use for the prediction. This determines the predictions passed + to the loss function. Supported methods are "predict", "predict_proba" or + "decision_function". + loss : callable, default=mean_squared_error + The loss function to use when comparing the perturbed model to the full + model. + n_permutations : int, default=50 + The number of permutations to perform. For each variable/group of variables, + the mean of the losses over the `n_permutations` is computed. + statistical_test : callable or str, default="ttest" + Statistical test function for computing p-values of importance scores. + features_groups: dict or None, default=None + A dictionary where the keys are the group names and the values are the + list of column names corresponding to each features group. If None, + the features_groups are identified based on the columns of X. + random_state : int or None, default=None + The random state to use for sampling. + n_jobs : int, default=1 + The number of jobs to run in parallel. Parallelization is done over the + variables or groups of variables. + + References + ---------- + .. footbibliography:: + """ + def __init__( self, estimator, - loss: callable = root_mean_squared_error, method: str = "predict", + loss: callable = mean_squared_error, n_permutations: int = 50, + statistical_test="ttest", features_groups=None, random_state: int = None, n_jobs: int = 1, ): - """ - Permutation Feature Importance algorithm as presented in - :footcite:t:`breimanRandomForests2001`. For each feature/group of features, - the importance is computed as the difference between the loss of the initial - model and the loss of the model with the feature/group permuted. - The method was also used in :footcite:t:`mi2021permutation` - - Parameters - ---------- - estimator : sklearn compatible estimator, optionals - The estimator to use for the prediction. - loss : callable, default=root_mean_squared_error - The loss function to use when comparing the perturbed model to the full - model. - method : str, default="predict" - The method to use for the prediction. This determines the predictions passed - to the loss function. Supported methods are "predict", "predict_proba" or - "decision_function". - n_permutations : int, default=50 - The number of permutations to perform. For each feature/group of features, - the mean of the losses over the `n_permutations` is computed. - features_groups: dict or None, default=None - A dictionary where the keys are the group names and the values are the - list of column names corresponding to each features group. If None, - the features_groups are identified based on the columns of X. - random_state : int, default=None - The random state to use for sampling. - n_jobs : int, default=1 - The number of jobs to run in parallel. Parallelization is done over the - features or groups of features. - - References - ---------- - .. footbibliography:: - """ super().__init__( estimator=estimator, - loss=loss, method=method, - n_jobs=n_jobs, + loss=loss, n_permutations=n_permutations, + statistical_test=statistical_test, features_groups=features_groups, random_state=random_state, + n_jobs=n_jobs, ) def _permutation(self, X, features_group_id, random_state=None): @@ -73,3 +84,60 @@ def _permutation(self, X, features_group_id, random_state=None): ] ) return X_perm_j + + +def pfi_analysis( + estimator, + X, + y, + method: str = "predict", + loss: callable = mean_squared_error, + n_permutations: int = 50, + test_statistic="ttest", + features_groups=None, + k_best=None, + percentile=None, + threshold_min=None, + threshold_max=None, + random_state: int = None, + n_jobs: int = 1, +): + methods = PFI( + estimator=estimator, + method=method, + loss=loss, + n_permutations=n_permutations, + statistical_test=test_statistic, + features_groups=features_groups, + random_state=random_state, + n_jobs=n_jobs, + ) + methods.fit_importance(X, y) + selection = methods.importance_selection( + k_best=k_best, + percentile=percentile, + threshold_min=threshold_min, + threshold_max=threshold_max, + ) + return selection, methods.importances_, methods.pvalues_ + + +# use the docstring of the class for the function +pfi_analysis.__doc__ = _aggregate_docstring( + [ + PFI.__doc__, + PFI.__init__.__doc__, + PFI.fit_importance.__doc__, + PFI.importance_selection.__doc__, + ], + """ + Returns + ------- + selection : ndarray of shape (n_features,) + Boolean array indicating selected features (True = selected) + importances : ndarray of shape (n_features,) + Feature importance scores/test statistics. + pvalues : ndarray of shape (n_features,) + P-values for importance scores. + """, +) diff --git a/src/hidimstat/samplers/__init__.py b/src/hidimstat/samplers/__init__.py new file mode 100644 index 000000000..6cb04420a --- /dev/null +++ b/src/hidimstat/samplers/__init__.py @@ -0,0 +1,7 @@ +from .conditional_sampling import ConditionalSampler +from .gaussian_knockoffs import GaussianKnockoffs + +__all__ = [ + "ConditionalSampler", + "GaussianKnockoffs", +] diff --git a/src/hidimstat/statistical_tools/conditional_sampling.py b/src/hidimstat/samplers/conditional_sampling.py similarity index 100% rename from src/hidimstat/statistical_tools/conditional_sampling.py rename to src/hidimstat/samplers/conditional_sampling.py diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/samplers/gaussian_knockoffs.py similarity index 100% rename from src/hidimstat/statistical_tools/gaussian_knockoffs.py rename to src/hidimstat/samplers/gaussian_knockoffs.py diff --git a/src/hidimstat/statistical_tools/__init__.py b/src/hidimstat/statistical_tools/__init__.py index 4012354d2..b713a79f6 100644 --- a/src/hidimstat/statistical_tools/__init__.py +++ b/src/hidimstat/statistical_tools/__init__.py @@ -1,9 +1,5 @@ -from .conditional_sampling import ConditionalSampler -from .gaussian_knockoffs import GaussianKnockoffs from .nadeau_bengio_ttest import nadeau_bengio_ttest __all__ = [ - "ConditionalSampler", - "GaussianKnockoffs", "nadeau_bengio_ttest", ] diff --git a/test/_utils/test_utils.py b/test/_utils/test_utils.py index 59bcba0ae..4a4abae5c 100644 --- a/test/_utils/test_utils.py +++ b/test/_utils/test_utils.py @@ -1,7 +1,28 @@ import numpy as np import pytest +from scipy.stats import ttest_1samp, wilcoxon -from hidimstat._utils.utils import check_random_state +from hidimstat._utils.utils import ( + check_random_state, + check_statistical_test, + get_fitted_attributes, +) +from hidimstat.statistical_tools import nadeau_bengio_ttest + + +def test_generated_attributes(): + """Test function for getting generated attribute""" + + class MyClass: + def __init__(self): + self.attr1 = 1 + self.attr2_ = 2 + self._attr3 = 3 + self.attr4__ = 4 + self.attr5_ = 5 + + attributes = get_fitted_attributes(MyClass()) + assert attributes == ["attr2_", "attr5_"] def test_none(): @@ -41,3 +62,25 @@ def test_error(): ValueError, match="cannot be used to seed a numpy.random.Generator instance" ): check_random_state(random_state) + + +def test_check_test_statistic(): + "test the function of check" + test_func = check_statistical_test("wilcoxon") + assert test_func.func == wilcoxon + test_func = check_statistical_test("ttest") + assert test_func.func == ttest_1samp + test_func = check_statistical_test("nb-ttest") + assert test_func.func == nadeau_bengio_ttest + test_func = check_statistical_test(print) + assert test_func == print + test_func = check_statistical_test(lambda x: x) + assert test_func.__class__.__name__ == "function" + + +def test_check_test_statistic_warning(): + "test the exception" + with pytest.raises(ValueError, match="the test 'test' is not supported"): + check_statistical_test("test") + with pytest.raises(ValueError, match="Unsupported value for 'statistical_test'."): + check_statistical_test([]) diff --git a/test/statistical_tools/test_conditional_sampling.py b/test/samplers/test_conditional_sampling.py similarity index 99% rename from test/statistical_tools/test_conditional_sampling.py rename to test/samplers/test_conditional_sampling.py index a3de06970..888493334 100644 --- a/test/statistical_tools/test_conditional_sampling.py +++ b/test/samplers/test_conditional_sampling.py @@ -11,7 +11,7 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.preprocessing import StandardScaler -from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler +from hidimstat.samplers.conditional_sampling import ConditionalSampler def test_continuous_case(): diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/samplers/test_gaussian_knockoffs.py similarity index 97% rename from test/statistical_tools/test_gaussian_knockoffs.py rename to test/samplers/test_gaussian_knockoffs.py index 2e671a6c0..6948dbfc5 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/samplers/test_gaussian_knockoffs.py @@ -2,10 +2,7 @@ import pytest from hidimstat._utils.scenario import multivariate_simulation -from hidimstat.statistical_tools.gaussian_knockoffs import ( - GaussianKnockoffs, - _s_equi, -) +from hidimstat.samplers.gaussian_knockoffs import GaussianKnockoffs, _s_equi def test_gaussian_equi(): diff --git a/test/statistical_tools/test_nadeau_bengio_ttest.py b/test/statistical_tools/test_nadeau_bengio_ttest.py index 2771a651c..1f9223934 100644 --- a/test/statistical_tools/test_nadeau_bengio_ttest.py +++ b/test/statistical_tools/test_nadeau_bengio_ttest.py @@ -1,3 +1,5 @@ +from functools import partial + import numpy as np import numpy.ma.testutils as ma_npt import numpy.testing as npt @@ -48,7 +50,7 @@ def test_ttest_1samp_corrected_NB(data_generator): ) vim.fit(X_train, y_train) importances = vim.importance(X_test, y_test) - importance_list.append(importances["importance"]) + importance_list.append(importances) importance_array = np.array(importance_list) pvalue_corr = nadeau_bengio_ttest(importance_array, 0, test_frac=0.2).pvalue diff --git a/test/test_base_perturbation.py b/test/test_base_perturbation.py index ea29c2566..8ed0b8202 100644 --- a/test/test_base_perturbation.py +++ b/test/test_base_perturbation.py @@ -13,3 +13,16 @@ def test_no_implemented_methods(): basic_class = BasePerturbation(estimator=estimator) with pytest.raises(NotImplementedError): basic_class._permutation(X, features_group_id=None) + + +def test_check_importance(): + """test that the methods are not implemented in the base class""" + X = np.random.randint(0, 2, size=(100, 2, 1)) + estimator = LinearRegression() + estimator.fit(X[:, 0], X[:, 1]) + basic_class = BasePerturbation(estimator=estimator) + basic_class.importances_ = [] + with pytest.raises( + ValueError, match="The importance method has not yet been called." + ): + basic_class.importance_selection() diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index 459525131..57d89611e 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -1,15 +1,22 @@ from copy import deepcopy +from functools import partial import matplotlib.pyplot as plt import numpy as np import pandas as pd import pytest +from scipy.stats import ttest_1samp from sklearn.exceptions import NotFittedError -from sklearn.linear_model import LinearRegression, LogisticRegression -from sklearn.metrics import log_loss, root_mean_squared_error +from sklearn.linear_model import ( + LinearRegression, + LogisticRegression, + LogisticRegressionCV, + RidgeCV, +) +from sklearn.metrics import log_loss, mean_squared_error from sklearn.model_selection import train_test_split -from hidimstat import CFI +from hidimstat import CFI, cfi_analysis from hidimstat._utils.exception import InternalError from hidimstat._utils.scenario import multivariate_simulation from hidimstat.base_perturbation import BasePerturbation @@ -67,8 +74,7 @@ def run_cfi(X, y, n_permutation, seed): # fit the model using the training set cfi.fit(X_train) # calculate feature importance using the test set - vim = cfi.importance(X_test, y_test) - importance = vim["importance"] + importance = cfi.importance(X_test, y_test) return importance @@ -198,9 +204,8 @@ def test_group(data_generator): cfi.fit(X_train_df) # Warning expected since column names in pandas are not considered with pytest.warns(UserWarning, match="X does not have valid feature names, but"): - vim = cfi.importance(X_test_df, y_test) + importance = cfi.importance(X_test_df, y_test) - importance = vim["importance"] # Check if importance scores are computed for each feature assert importance.shape == (2,) # Verify that important feature group has higher score @@ -245,8 +250,7 @@ def test_classication(data_generator): n_jobs=1, ) cfi.fit(X_train) - vim = cfi.importance(X_test, y_test_clf) - importance = vim["importance"] + importance = cfi.importance(X_test, y_test_clf) # Check that importance scores are defined for each feature assert importance.shape == (X.shape[1],) # Check that important features have higher mean importance scores @@ -275,11 +279,11 @@ def test_init(self, data_generator): ) assert cfi.n_jobs == 1 assert cfi.n_permutations == 50 - assert cfi.loss == root_mean_squared_error + assert cfi.loss == mean_squared_error assert cfi.method == "predict" assert cfi.categorical_max_cardinality == 10 - assert cfi.imputation_model_categorical is None - assert cfi.imputation_model_continuous is None + assert isinstance(cfi.imputation_model_categorical, LogisticRegressionCV) + assert isinstance(cfi.imputation_model_continuous, RidgeCV) def test_fit(self, data_generator): """Test fitting CFI""" @@ -342,7 +346,7 @@ def test_categorical( ) cfi.fit(X, y) - importances = cfi.importance(X, y)["importance"] + importances = cfi.importance(X, y) assert len(importances) == 3 @@ -374,18 +378,6 @@ def test_unknown_predict_method(self, data_generator): method="unknown method", ) - def test_unfitted_predict(self, data_generator): - """Test predict method with unfitted model""" - X, y, _, _ = data_generator - fitted_model = LinearRegression().fit(X, y) - cfi = CFI( - estimator=fitted_model, - method="predict", - ) - - with pytest.raises(ValueError, match="The class is not fitted."): - cfi.predict(X) - def test_unfitted_importance(self, data_generator): """Test importance method with unfitted model""" X, y, _, _ = data_generator @@ -598,6 +590,41 @@ def test_groups_warning(self, data_generator): ): cfi.importance(X, y) + def test_assert_dimension_pvalue(self, data_generator): + """test that assert is raise if function stat is not good""" + X, y, _, _ = data_generator + fitted_model = LinearRegression().fit(X, y) + cfi = CFI( + estimator=fitted_model, + imputation_model_continuous=LinearRegression(), + statistical_test=partial(ttest_1samp, popmean=0, axis=0), + ) + cfi.fit(X, y) + with pytest.raises( + AssertionError, + match="The statistical test doesn't provide the correct dimension.", + ): + cfi.importance(X, y) + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + [(150, 200, 10, 0.2, 42, 1.0, 1.0, 0.0)], + ids=["high level noise"], +) +@pytest.mark.parametrize("n_permutation, cfi_seed", [(20, 0)], ids=["default_cfi"]) +def test_function_cfi(data_generator, n_permutation, cfi_seed): + """Test CFI function""" + X, y, _, _ = data_generator + cfi_analysis( + LinearRegression().fit(X, y), + X, + y, + imputation_model_continuous=LinearRegression(), + n_permutations=n_permutation, + random_state=cfi_seed, + ) + @pytest.mark.parametrize( "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", @@ -619,6 +646,8 @@ def test_cfi_plot(data_generator): random_state=0, ) cfi.fit(X_train, y_train) + cfi.loss_reference_ = [] + cfi.loss_ = [] # Make the plot independent of data / randomness to test only the plotting function cfi.importances_ = np.arange(X.shape[1]) fig, ax = plt.subplots(figsize=(6, 3)) @@ -646,6 +675,8 @@ def test_cfi_plot_2d_imp(data_generator): random_state=0, ) cfi.fit(X_train, y_train) + cfi.loss_reference_ = [] + cfi.loss_ = [] # Make the plot independent of data / randomness to test only the plotting function cfi.importances_ = np.stack( [ @@ -678,6 +709,8 @@ def test_cfi_plot_coverage(data_generator): random_state=0, ) cfi.fit(X_train, y_train) + cfi.loss_reference_ = [] + cfi.loss_ = [] # Make the plot independent of data / randomness to test only the plotting function cfi.importances_ = np.arange(X.shape[1]) _, ax = plt.subplots(figsize=(6, 3)) @@ -729,9 +762,9 @@ def test_cfi_repeatibility(cfi_test_data): X_train, X_test, y_test, cfi_default_parameters = cfi_test_data cfi = CFI(**cfi_default_parameters) cfi.fit(X_train) - vim = cfi.importance(X_test, y_test)["importance"] + vim = cfi.importance(X_test, y_test) # repeat - vim_repeat = cfi.importance(X_test, y_test)["importance"] + vim_repeat = cfi.importance(X_test, y_test) assert not np.array_equal(vim, vim_repeat) @@ -742,20 +775,20 @@ def test_cfi_randomness_with_none(cfi_test_data): X_train, X_test, y_test, cfi_default_parameters = cfi_test_data cfi = CFI(random_state=None, **cfi_default_parameters) cfi.fit(X_train) - vim = cfi.importance(X_test, y_test)["importance"] + vim = cfi.importance(X_test, y_test) # repeat importance - vim_repeat = cfi.importance(X_test, y_test)["importance"] + vim_repeat = cfi.importance(X_test, y_test) assert not np.array_equal(vim, vim_repeat) # refit cfi.fit(X_train) - vim_refit = cfi.importance(X_test, y_test)["importance"] + vim_refit = cfi.importance(X_test, y_test) assert not np.array_equal(vim, vim_refit) # Reproducibility cfi_2 = CFI(random_state=None, **cfi_default_parameters) cfi_2.fit(X_train) - vim_reproducibility = cfi_2.importance(X_test, y_test)["importance"] + vim_reproducibility = cfi_2.importance(X_test, y_test) assert not np.array_equal(vim, vim_reproducibility) @@ -766,20 +799,20 @@ def test_cfi_reproducibility_with_integer(cfi_test_data): X_train, X_test, y_test, cfi_default_parameters = cfi_test_data cfi = CFI(random_state=42, **cfi_default_parameters) cfi.fit(X_train) - vim = cfi.importance(X_test, y_test)["importance"] + vim = cfi.importance(X_test, y_test) # repeat importance - vim_repeat = cfi.importance(X_test, y_test)["importance"] + vim_repeat = cfi.importance(X_test, y_test) assert np.array_equal(vim, vim_repeat) # refit cfi.fit(X_train) - vim_refit = cfi.importance(X_test, y_test)["importance"] + vim_refit = cfi.importance(X_test, y_test) assert np.array_equal(vim, vim_refit) # Reproducibility cfi_2 = CFI(random_state=42, **cfi_default_parameters) cfi_2.fit(X_train) - vim_reproducibility = cfi_2.importance(X_test, y_test)["importance"] + vim_reproducibility = cfi_2.importance(X_test, y_test) assert np.array_equal(vim, vim_reproducibility) @@ -793,25 +826,25 @@ def test_cfi_reproducibility_with_rng(cfi_test_data): rng = np.random.default_rng(0) cfi = CFI(random_state=rng, **cfi_default_parameters) cfi.fit(X_train) - vim = cfi.importance(X_test, y_test)["importance"] + vim = cfi.importance(X_test, y_test) # repeat importance - vim_repeat = cfi.importance(X_test, y_test)["importance"] + vim_repeat = cfi.importance(X_test, y_test) assert not np.array_equal(vim, vim_repeat) # refit cfi.fit(X_train) - vim_refit = cfi.importance(X_test, y_test)["importance"] + vim_refit = cfi.importance(X_test, y_test) assert not np.array_equal(vim, vim_refit) # refit repeatability rng = np.random.default_rng(0) cfi.random_state = rng cfi.fit(X_train) - vim_refit_2 = cfi.importance(X_test, y_test)["importance"] + vim_refit_2 = cfi.importance(X_test, y_test) assert np.array_equal(vim, vim_refit_2) # Reproducibility cfi_2 = CFI(random_state=np.random.default_rng(0), **cfi_default_parameters) cfi_2.fit(X_train) - vim_reproducibility = cfi_2.importance(X_test, y_test)["importance"] + vim_reproducibility = cfi_2.importance(X_test, y_test) assert np.array_equal(vim, vim_reproducibility) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index a19e2e56b..cf1ccdf14 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -11,7 +11,7 @@ model_x_knockoff, set_alpha_max_lasso_path, ) -from hidimstat.statistical_tools.gaussian_knockoffs import GaussianKnockoffs +from hidimstat.samplers import GaussianKnockoffs from hidimstat.statistical_tools.multiple_testing import fdp_power diff --git a/test/test_leave_one_covariate_out.py b/test/test_leave_one_covariate_out.py index ea3738634..1c8c3b243 100644 --- a/test/test_leave_one_covariate_out.py +++ b/test/test_leave_one_covariate_out.py @@ -1,12 +1,15 @@ +from functools import partial + import numpy as np import pandas as pd import pytest +from scipy.stats import ttest_1samp from sklearn.exceptions import NotFittedError from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.metrics import log_loss from sklearn.model_selection import train_test_split -from hidimstat import LOCO +from hidimstat import LOCO, loco_analysis from hidimstat._utils.scenario import multivariate_simulation from hidimstat.base_perturbation import BasePerturbation @@ -39,9 +42,8 @@ def test_loco(): X_train, y_train, ) - vim = loco.importance(X_test, y_test) + importance = loco.importance(X_test, y_test) - importance = vim["importance"] assert importance.shape == (X.shape[1],) assert ( importance[important_features].mean() @@ -68,9 +70,8 @@ def test_loco(): ) # warnings because we doesn't consider the name of columns of pandas with pytest.warns(UserWarning, match="X does not have valid feature names, but"): - vim = loco.importance(X_test_df, y_test) + importance = loco.importance(X_test_df, y_test) - importance = vim["importance"] assert importance[0].mean() > importance[1].mean() # Classification case @@ -93,11 +94,10 @@ def test_loco(): X_train, y_train_clf, ) - vim_clf = loco_clf.importance(X_test, y_test_clf) + importance_clf = loco_clf.importance(X_test, y_test_clf) - importance_clf = vim_clf["importance"] assert importance_clf.shape == (2,) - assert importance[0].mean() > importance[1].mean() + assert importance_clf[0].mean() > importance_clf[1].mean() def test_raises_value_error(): @@ -123,7 +123,7 @@ def test_raises_value_error(): estimator=fitted_model, method="predict", ) - loco.predict(X) + loco.importance(X, None) with pytest.raises(ValueError, match="The class is not fitted."): fitted_model = LinearRegression().fit(X, y) loco = LOCO( @@ -142,3 +142,46 @@ def test_raises_value_error(): ) BasePerturbation.fit(loco, X, y) loco.importance(X, y) + + with pytest.raises( + AssertionError, + match="The statistical test doesn't provide the correct dimension.", + ): + fitted_model = LinearRegression().fit(X, y) + loco = LOCO( + estimator=fitted_model, + statistical_test=partial(ttest_1samp, popmean=0, axis=0), + ).fit(X, y) + loco.importance(X, y) + + +def test_loco_function(): + """Test the function of LOCO algorithm on a linear scenario.""" + X, y, beta, noise = multivariate_simulation( + n_samples=150, + n_features=100, + support_size=10, + shuffle=False, + seed=42, + ) + important_features = np.where(beta != 0)[0] + non_important_features = np.where(beta == 0)[0] + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + regression_model = LinearRegression() + regression_model.fit(X_train, y_train) + + selection, importance, pvalue = loco_analysis( + regression_model, + X, + y, + method="predict", + n_jobs=1, + ) + + assert importance.shape == (X.shape[1],) + assert ( + importance[important_features].mean() + > importance[non_important_features].mean() + ) diff --git a/test/test_permutation_feature_importance.py b/test/test_permutation_feature_importance.py index 0e7ee0c0d..f806486ce 100644 --- a/test/test_permutation_feature_importance.py +++ b/test/test_permutation_feature_importance.py @@ -5,7 +5,7 @@ from sklearn.metrics import log_loss from sklearn.model_selection import train_test_split -from hidimstat import PFI +from hidimstat import PFI, pfi_analysis from hidimstat._utils.scenario import multivariate_simulation @@ -39,9 +39,8 @@ def test_permutation_importance(): X_train, y_train, ) - vim = pfi.importance(X_test, y_test) + importance = pfi.importance(X_test, y_test) - importance = vim["importance"] assert importance.shape == (X.shape[1],) assert ( importance[important_features].mean() @@ -70,9 +69,8 @@ def test_permutation_importance(): ) # warnings because we doesn't consider the name of columns of pandas with pytest.warns(UserWarning, match="X does not have valid feature names, but"): - vim = pfi.importance(X_test_df, y_test) + importance = pfi.importance(X_test_df, y_test) - importance = vim["importance"] assert importance[0].mean() > importance[1].mean() # Classification case @@ -95,12 +93,43 @@ def test_permutation_importance(): X_train, y_train_clf, ) - vim_clf = pfi_clf.importance(X_test, y_test_clf) + importance_clf = pfi_clf.importance(X_test, y_test_clf) - importance_clf = vim_clf["importance"] assert importance_clf.shape == (X.shape[1],) +def test_permutation_importance_function(): + """Test the function of Permutation Importance algorithm on a linear scenario.""" + X, y, beta, noise = multivariate_simulation( + n_samples=150, + n_features=200, + support_size=10, + shuffle=False, + seed=42, + ) + important_features = beta != 0 + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + regression_model = LinearRegression() + regression_model.fit(X_train, y_train) + + selection, importance, pvalue = pfi_analysis( + regression_model, + X, + y, + n_permutations=20, + method="predict", + random_state=0, + n_jobs=1, + ) + + assert importance.shape == (X.shape[1],) + assert ( + importance[important_features].mean() > importance[~important_features].mean() + ) + + @pytest.fixture(scope="module") def pfi_test_data(): """ @@ -137,8 +166,8 @@ def test_pfi_repeatability(pfi_test_data): X_train, X_test, y_train, y_test, pfi_default_parameters = pfi_test_data pfi = PFI(**pfi_default_parameters, random_state=0) pfi.fit(X_train, y_train) - vim = pfi.importance(X_test, y_test)["importance"] - vim_reproducible = pfi.importance(X_test, y_test)["importance"] + vim = pfi.importance(X_test, y_test) + vim_reproducible = pfi.importance(X_test, y_test) assert np.array_equal(vim, vim_reproducible) @@ -150,17 +179,17 @@ def test_pfi_randomness_with_none(pfi_test_data): X_train, X_test, y_train, y_test, pfi_default_parameters = pfi_test_data pfi_fixed = PFI(**pfi_default_parameters, random_state=0) pfi_fixed.fit(X_train, y_train) - vim_fixed = pfi_fixed.importance(X_test, y_test)["importance"] + vim_fixed = pfi_fixed.importance(X_test, y_test) pfi_new_state = PFI(**pfi_default_parameters, random_state=1) pfi_new_state.fit(X_train, y_train) - vim_new_state = pfi_new_state.importance(X_test, y_test)["importance"] + vim_new_state = pfi_new_state.importance(X_test, y_test) assert not np.array_equal(vim_fixed, vim_new_state) pfi_none_state = PFI(**pfi_default_parameters, random_state=None) pfi_none_state.fit(X_train, y_train) - vim_none_state_1 = pfi_none_state.importance(X_test, y_test)["importance"] - vim_none_state_2 = pfi_none_state.importance(X_test, y_test)["importance"] + vim_none_state_1 = pfi_none_state.importance(X_test, y_test) + vim_none_state_2 = pfi_none_state.importance(X_test, y_test) assert not np.array_equal(vim_none_state_1, vim_none_state_2) @@ -172,11 +201,11 @@ def test_pfi_reproducibility_with_integer(pfi_test_data): X_train, X_test, y_train, y_test, pfi_default_parameters = pfi_test_data pfi_1 = PFI(**pfi_default_parameters, random_state=0) pfi_1.fit(X_train, y_train) - vim_1 = pfi_1.importance(X_test, y_test)["importance"] + vim_1 = pfi_1.importance(X_test, y_test) pfi_2 = PFI(**pfi_default_parameters, random_state=0) pfi_2.fit(X_train, y_train) - vim_2 = pfi_2.importance(X_test, y_test)["importance"] + vim_2 = pfi_2.importance(X_test, y_test) assert np.array_equal(vim_1, vim_2) @@ -190,13 +219,13 @@ def test_pfi_reproducibility_with_rng(pfi_test_data): rng = np.random.default_rng(0) pfi = PFI(**pfi_default_parameters, random_state=rng) pfi.fit(X_train, y_train) - vim = pfi.importance(X_test, y_test)["importance"] - vim_repeat = pfi.importance(X_test, y_test)["importance"] + vim = pfi.importance(X_test, y_test) + vim_repeat = pfi.importance(X_test, y_test) assert not np.array_equal(vim, vim_repeat) # Refit with same rng rng = np.random.default_rng(0) pfi_reproducibility = PFI(**pfi_default_parameters, random_state=rng) pfi_reproducibility.fit(X_train, y_train) - vim_reproducibility = pfi_reproducibility.importance(X_test, y_test)["importance"] + vim_reproducibility = pfi_reproducibility.importance(X_test, y_test) assert np.array_equal(vim, vim_reproducibility)