diff --git a/docs/src/api.rst b/docs/src/api.rst index 51074f1dc..a82fcc668 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -36,7 +36,18 @@ Classes BaseVariableImportance BasePerturbation + VariableImportanceFeatureGroup LOCO CFI PFI D0CRT + +Marginal Importance +=================== + +.. autosummary:: + :toctree: ./generated/api/marginal + :template: class.rst + + LOCI + LeaveOneCovariateIn diff --git a/docs/tools/references.bib b/docs/tools/references.bib index fe04dc12f..a73bc3f4d 100644 --- a/docs/tools/references.bib +++ b/docs/tools/references.bib @@ -144,6 +144,15 @@ @article{eshel2003yule year = {2003} } +@inproceedings{ewald2024guide, + title = {A guide to feature importance methods for scientific inference}, + author = {Ewald, Fiona Katharina and Bothmann, Ludwig and Wright, Marvin N and Bischl, Bernd and Casalicchio, Giuseppe and K{\"o}nig, Gunnar}, + booktitle = {World Conference on Explainable Artificial Intelligence}, + pages = {440--464}, + year = {2024}, + organization = {Springer} +} + @article{fan2012variance, author = {Fan, Jianqing and Guo, Shaojun and Hao, Ning}, journal = {Journal of the Royal Statistical Society Series B: Statistical Methodology}, diff --git a/examples/plot_conditional_vs_marginal_xor_data.py b/examples/plot_conditional_vs_marginal_xor_data.py index e9ea09ec9..1c3b6472c 100644 --- a/examples/plot_conditional_vs_marginal_xor_data.py +++ b/examples/plot_conditional_vs_marginal_xor_data.py @@ -12,11 +12,11 @@ import seaborn as sns from sklearn.base import clone from sklearn.linear_model import RidgeCV -from sklearn.metrics import hinge_loss +from sklearn.metrics import hinge_loss, accuracy_score from sklearn.model_selection import KFold, train_test_split from sklearn.svm import SVC -from hidimstat import CFI +from hidimstat import CFI, LOCI ############################################################################# # To solve the XOR problem, we will use a Support Vector Classier (SVC) with Radial Basis Function (RBF) 
kernel. The decision function of @@ -82,21 +82,9 @@ cv = KFold(n_splits=5, shuffle=True, random_state=0) clf = SVC(kernel="rbf", random_state=0) # Compute marginal importance using univariate models -marginal_scores = [] -for i in range(X.shape[1]): - feat_scores = [] - for train_index, test_index in cv.split(X): - X_train, X_test = X[train_index], X[test_index] - y_train, y_test = Y[train_index], Y[test_index] - - X_train_univariate = X_train[:, i].reshape(-1, 1) - X_test_univariate = X_test[:, i].reshape(-1, 1) - - univariate_model = clone(clf) - univariate_model.fit(X_train_univariate, y_train) - - feat_scores.append(univariate_model.score(X_test_univariate, y_test)) - marginal_scores.append(feat_scores) +loci = LOCI(estimator=clone(clf).fit(X, Y), method="decision_function", loss=hinge_loss) +mean_importances = loci.fit_importance(X, Y, cv=cv) +marginal_importances = np.array(loci.importances_) ########################################################################### @@ -129,7 +117,7 @@ fig, axes = plt.subplots(1, 2, sharey=True, figsize=(6, 2.5)) # Marginal scores boxplot sns.boxplot( - data=np.array(marginal_scores).T, + data=marginal_importances, orient="h", ax=axes[0], fill=False, diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index eb92d7abf..5f72561b5 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -60,7 +60,9 @@ # require a K-fold cross-fitting. Computing the importance for each fold is # embarassingly parallel. For this reason, we encapsulate the main computations in a # function and use joblib to parallelize the computation. 
-def run_one_fold(X, y, model, train_index, test_index, vim_name="CFI", groups=None): +def run_one_fold( + X, y, model, train_index, test_index, vim_name="CFI", features_groups=None +): model_c = clone(model) model_c.fit(X[train_index], y[train_index]) y_pred = model_c.predict(X[test_index]) @@ -92,12 +94,12 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CFI", groups=No loss=loss, ) - vim.fit(X[train_index], y[train_index], groups=groups) + vim.fit(X[train_index], y[train_index], features_groups=features_groups) importance = vim.importance(X[test_index], y[test_index])["importance"] return pd.DataFrame( { - "feature": groups.keys(), + "feature": features_groups.keys(), "importance": importance, "vim": vim_name, "model": model_name, @@ -116,10 +118,16 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CFI", groups=No GridSearchCV(SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}), ] cv = KFold(n_splits=5, shuffle=True, random_state=0) -groups = {ft: [i] for i, ft in enumerate(dataset.feature_names)} +features_groups = {ft: [i] for i, ft in enumerate(dataset.feature_names)} out_list = Parallel(n_jobs=5)( delayed(run_one_fold)( - X, y, model, train_index, test_index, vim_name=vim_name, groups=groups + X, + y, + model, + train_index, + test_index, + vim_name=vim_name, + features_groups=features_groups, ) for train_index, test_index in cv.split(X) for model in models @@ -255,16 +263,22 @@ def plot_results(df_importance, df_pval): # mitigate this issue, we can group correlated features together and measure the # importance of these feature groups. For instance, we can group 'sepal width' with # 'sepal length' and 'petal length' with 'petal width' and the spurious feature. 
-groups = {"sepal features": [0, 1], "petal features": [2, 3, 4]} +features_groups = {"sepal features": [0, 1], "petal features": [2, 3, 4]} out_list = Parallel(n_jobs=5)( delayed(run_one_fold)( - X, y, model, train_index, test_index, vim_name=vim_name, groups=groups + X, + y, + model, + train_index, + test_index, + vim_name=vim_name, + features_groups=features_groups, ) for train_index, test_index in cv.split(X) for model in models for vim_name in ["CFI", "PFI"] ) -df_grouped = pd.concat(out_list) -df_pval = compute_pval(df_grouped, threshold=threshold) -plot_results(df_grouped, df_pval) +df_features_grouped = pd.concat(out_list) +df_pval = compute_pval(df_features_grouped, threshold=threshold) +plot_results(df_features_grouped, df_pval) diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index 81d5a0cce..468407275 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -1,4 +1,7 @@ -from .base_variable_importance import BaseVariableImportance +from .base_variable_importance import ( + BaseVariableImportance, + VariableImportanceFeatureGroup, +) from .base_perturbation import BasePerturbation from .ensemble_clustered_inference import ( clustered_inference, @@ -25,6 +28,10 @@ from .noise_std import reid from .permutation_feature_importance import PFI +# marginal methods +from .marginal import LeaveOneCovariateIn # for having documentation +from .marginal import LeaveOneCovariateIn as LOCI + from .statistical_tools.aggregation import quantile_aggregation try: @@ -51,4 +58,6 @@ "CFI", "LOCO", "PFI", + # marginal methods + "LOCI", ] diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index a84555908..67889f184 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -1,16 +1,16 @@ import numpy as np -import pandas as pd from joblib import Parallel, delayed from sklearn.base import check_is_fitted from sklearn.metrics import root_mean_squared_error -import warnings from 
hidimstat._utils.utils import _check_vim_predict_method -from hidimstat._utils.exception import InternalError -from hidimstat.base_variable_importance import BaseVariableImportance +from hidimstat.base_variable_importance import ( + BaseVariableImportance, + VariableImportanceFeatureGroup, +) -class BasePerturbation(BaseVariableImportance): +class BasePerturbation(BaseVariableImportance, VariableImportanceFeatureGroup): def __init__( self, estimator, @@ -43,6 +43,7 @@ def __init__( The number of parallel jobs to run. Parallelization is done over the variables or groups of variables. """ + super().__init__() check_is_fitted(estimator) assert n_permutations > 0, "n_permutations must be positive" self.estimator = estimator @@ -51,45 +52,6 @@ def __init__( self.method = method self.n_jobs = n_jobs self.n_permutations = n_permutations - self.n_groups = None - - def fit(self, X, y=None, groups=None): - """Base fit method for perturbation-based methods. Identifies the groups. - - Parameters - ---------- - X: array-like of shape (n_samples, n_features) - The input samples. - y: array-like of shape (n_samples,) - Not used, only present for consistency with the sklearn API. - groups: dict, optional - A dictionary where the keys are the group names and the values are the - list of column names corresponding to each group. If None, the groups are - identified based on the columns of X. 
- """ - if groups is None: - self.n_groups = X.shape[1] - self.groups = {j: [j] for j in range(self.n_groups)} - self._groups_ids = np.array(list(self.groups.values()), dtype=int) - elif isinstance(groups, dict): - self.n_groups = len(groups) - self.groups = groups - if isinstance(X, pd.DataFrame): - self._groups_ids = [] - for group_key in self.groups.keys(): - self._groups_ids.append( - [ - i - for i, col in enumerate(X.columns) - if col in self.groups[group_key] - ] - ) - else: - self._groups_ids = [ - np.array(ids, dtype=int) for ids in list(self.groups.values()) - ] - else: - raise ValueError("groups needs to be a dictionnary") def predict(self, X): """ @@ -111,8 +73,12 @@ def predict(self, X): # Parallelize the computation of the importance scores for each group out_list = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_predict_one_group)(X_, group_id, group_key) - for group_id, group_key in enumerate(self.groups.keys()) + delayed(self._joblib_predict_one_features_group)( + X_, features_group_id, features_group_key + ) + for features_group_id, features_group_key in enumerate( + self.features_groups.keys() + ) ) return np.stack(out_list, axis=0) @@ -155,82 +121,14 @@ def importance(self, X, y): out_dict["importance"] = np.array( [ np.mean(out_dict["loss"][j]) - loss_reference - for j in range(self.n_groups) + for j in range(self.n_features_groups) ] ) return out_dict - def _check_fit(self, X): - """ - Check if the perturbation method has been properly fitted. - - This method verifies that the perturbation method has been fitted by checking - if required attributes are set and if the number of features matches - the grouped variables. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Input data to validate against the fitted model. - - Raises - ------ - ValueError - If the method has not been fitted (i.e., if n_groups, groups, - or _groups_ids attributes are missing). 
- AssertionError - If the number of features in X does not match the total number - of features in the grouped variables. - """ - if ( - self.n_groups is None - or not hasattr(self, "groups") - or not hasattr(self, "_groups_ids") - ): - raise ValueError( - "The class is not fitted. The fit method must be called" - " to set variable groups. If no grouping is needed," - " call fit with groups=None" - ) - if isinstance(X, pd.DataFrame): - names = list(X.columns) - elif isinstance(X, np.ndarray) and X.dtype.names is not None: - names = X.dtype.names - # transform Structured Array in pandas array for a better manipulation - X = pd.DataFrame(X) - elif isinstance(X, np.ndarray): - names = None - else: - raise ValueError("X should be a pandas dataframe or a numpy array.") - number_columns = X.shape[1] - for index_variables in self.groups.values(): - if type(index_variables[0]) is int or np.issubdtype( - type(index_variables[0]), int - ): - assert np.all( - np.array(index_variables, dtype=int) < number_columns - ), "X does not correspond to the fitting data." - elif type(index_variables[0]) is str or np.issubdtype( - type(index_variables[0]), str - ): - assert np.all( - [name in names for name in index_variables] - ), f"The array is missing at least one of the following columns {index_variables}." - else: - raise InternalError( - "A problem with indexing has happened during the fit." 
- ) - number_unique_feature_in_groups = np.unique( - np.concatenate([values for values in self.groups.values()]) - ).shape[0] - if X.shape[1] != number_unique_feature_in_groups: - warnings.warn( - f"The number of features in X: {X.shape[1]} differs from the" - " number of features for which importance is computed: " - f"{number_unique_feature_in_groups}" - ) - - def _joblib_predict_one_group(self, X, group_id, group_key): + def _joblib_predict_one_features_group( + self, X, features_group_id, features_group_key + ): """ Compute the predictions after perturbation of the data for a given group of variables. This function is parallelized. @@ -244,13 +142,15 @@ def _joblib_predict_one_group(self, X, group_id, group_key): group_key: str, int The key of the group of variables. (parameter use for debugging) """ - group_ids = self._groups_ids[group_id] - non_group_ids = np.delete(np.arange(X.shape[1]), group_ids) + features_group_ids = self._features_groups_ids[features_group_id] + non_features_group_ids = np.delete(np.arange(X.shape[1]), features_group_ids) # Create an array X_perm_j of shape (n_permutations, n_samples, n_features) # where the j-th group of covariates is permuted X_perm = np.empty((self.n_permutations, X.shape[0], X.shape[1])) - X_perm[:, :, non_group_ids] = np.delete(X, group_ids, axis=1) - X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id) + X_perm[:, :, non_features_group_ids] = np.delete(X, features_group_ids, axis=1) + X_perm[:, :, features_group_ids] = self._permutation( + X, features_group_id=features_group_id + ) # Reshape X_perm to allow for batch prediction X_perm_batch = X_perm.reshape(-1, X.shape[1]) y_pred_perm = getattr(self.estimator, self.method)(X_perm_batch) @@ -264,6 +164,6 @@ def _joblib_predict_one_group(self, X, group_id, group_key): ) return y_pred_perm - def _permutation(self, X, group_id): + def _permutation(self, X, features_group_id): """Method for creating the permuted data for the j-th group of covariates.""" 
class VariableImportanceFeatureGroup:
    """
    Mixin that stores and validates groups of features.

    The class has no base class of its own; it is meant to be combined with
    ``BaseVariableImportance`` through multiple inheritance so that an
    importance method can score groups of features instead of single columns.

    Attributes
    ----------
    n_features_groups : int, default=None
        The number of feature groups. ``None`` until ``fit`` is called.
    features_groups : dict, default=None
        Mapping from a group key to the list of feature indices or column
        names that belong to the group.
    _features_groups_ids : sequence of length n_features_groups, default=None
        Positional column indices of every group, in the order of
        ``features_groups``.

    Methods
    -------
    fit(X, y=None, features_groups=None)
        Identify and store the feature groups.
    _check_fit(X)
        Check that ``fit`` was called and that ``X`` is compatible with the
        stored groups.
    """

    def __init__(self):
        super().__init__()
        self.n_features_groups = None
        self.features_groups = None
        self._features_groups_ids = None

    def fit(self, X, y=None, features_groups=None):
        """
        Identify the groups of features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            Not used, only present for consistency with the sklearn API.
        features_groups : dict, optional
            A dictionary where the keys are the group names and the values are
            the lists of columns (indices, or column names when ``X`` is a
            pandas DataFrame) of each group. If None, every column of ``X``
            forms its own group.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If ``features_groups`` is neither None nor a dictionary.
        """
        if features_groups is None:
            self.n_features_groups = X.shape[1]
            self.features_groups = {j: [j] for j in range(self.n_features_groups)}
            self._features_groups_ids = np.array(
                list(self.features_groups.values()), dtype=int
            )
        elif isinstance(features_groups, dict):
            self.n_features_groups = len(features_groups)
            self.features_groups = features_groups
            if isinstance(X, pd.DataFrame):
                # Translate the members of each group into column positions.
                self._features_groups_ids = [
                    [i for i, col in enumerate(X.columns) if col in members]
                    for members in features_groups.values()
                ]
            else:
                self._features_groups_ids = [
                    np.array(ids, dtype=int) for ids in features_groups.values()
                ]
        else:
            raise ValueError("features_groups needs to be a dictionnary")
        return self

    def _check_fit(self, X):
        """
        Check that the groups were fitted and that they are compatible with X.

        Parameters
        ----------
        X : pandas.DataFrame or numpy.ndarray of shape (n_samples, n_features)
            Input data to validate against the fitted groups.

        Raises
        ------
        ValueError
            If ``fit`` has not been called (``n_features_groups``,
            ``features_groups`` or ``_features_groups_ids`` is unset), or if
            ``X`` is neither a pandas DataFrame nor a numpy array.
        AssertionError
            If a group refers to a column index or a column name that does not
            exist in ``X``.

        Warns
        -----
        UserWarning
            If the number of columns of ``X`` differs from the number of
            distinct features listed in the groups.
        """
        # Function-scope import so the warning below never depends on what the
        # enclosing module happens to import.
        import warnings

        # getattr with a default also covers instances whose __init__ was
        # bypassed, so the documented ValueError is raised in every case.
        if (
            getattr(self, "n_features_groups", None) is None
            or getattr(self, "features_groups", None) is None
            or getattr(self, "_features_groups_ids", None) is None
        ):
            raise ValueError(
                "The class is not fitted. The fit method must be called"
                " to set variable features_groups. If no grouping is needed,"
                " call fit with features_groups=None"
            )
        if isinstance(X, pd.DataFrame):
            names = list(X.columns)
        elif isinstance(X, np.ndarray) and X.dtype.names is not None:
            names = X.dtype.names
            # transform Structured Array in pandas array for a better manipulation
            X = pd.DataFrame(X)
        elif isinstance(X, np.ndarray):
            names = None
        else:
            raise ValueError("X should be a pandas dataframe or a numpy array.")
        number_columns = X.shape[1]
        for index_variables in self.features_groups.values():
            first = index_variables[0]
            # np.integer covers every numpy integer width on every platform;
            # bool is excluded because it is not a valid column index here.
            if isinstance(first, (int, np.integer)) and not isinstance(first, bool):
                # Explicit raise: a bare ``assert`` disappears under ``python -O``.
                if not np.all(np.array(index_variables, dtype=int) < number_columns):
                    raise AssertionError("X does not correspond to the fitting data.")
            elif isinstance(first, str):
                # ``names`` is None for a plain ndarray, which cannot hold
                # named columns at all.
                if names is None or not all(
                    name in names for name in index_variables
                ):
                    raise AssertionError(
                        f"The array is missing at least one of the following columns {index_variables}."
                    )
            else:
                raise InternalError(
                    "A problem with indexing has happened during the fit."
                )
        number_unique_feature_in_groups = np.unique(
            np.concatenate(list(self.features_groups.values()))
        ).shape[0]
        if X.shape[1] != number_unique_feature_in_groups:
            warnings.warn(
                f"The number of features in X: {X.shape[1]} differs from the"
                " number of features for which importance is computed: "
                f"{number_unique_feature_in_groups}"
            )
n_permutations : int, default=50 - The number of permutations to perform. For each variable/group of variables, + The number of permutations to perform. For each feature/group of features, the mean of the losses over the `n_permutations` is computed. imputation_model_continuous : sklearn compatible estimator, optional The model used to estimate the conditional distribution of a given - continuous variable/group of variables given the others. + continuous features/group of features given the others. imputation_model_categorical : sklearn compatible estimator, optional The model used to estimate the conditional distribution of a given - categorical variable/group of variables given the others. Binary is + categorical features/group of features given the others. Binary is considered as a special case of categorical. random_state : int, default=None The random state to use for sampling. categorical_max_cardinality : int, default=10 - The maximum cardinality of a variable to be considered as categorical - when the variable type is inferred (set to "auto" or not provided). + The maximum cardinality of a feature to be considered as categorical + when the feature type is inferred (set to "auto" or not provided). References ---------- @@ -82,7 +82,7 @@ def __init__( self.imputation_model_continuous = imputation_model_continuous self.random_state = random_state - def fit(self, X, y=None, groups=None, var_type="auto"): + def fit(self, X, y=None, features_groups=None, features_type="auto"): """Fit the imputation models. Parameters @@ -91,12 +91,12 @@ def fit(self, X, y=None, groups=None, var_type="auto"): The input samples. y: array-like of shape (n_samples,) Not used, only present for consistency with the sklearn API. - groups: dict, optional + features_groups: dict, optional A dictionary where the keys are the group names and the values are the - list of column names corresponding to each group. If None, the groups are - identified based on the columns of X. 
- var_type: str or list, default="auto" - The variable type. Supported types include "auto", "continuous", and + list of column names corresponding to each features group. If None, + the features_groups are identified based on the columns of X. + features_type: str or list, default="auto" + The feature type. Supported types include "auto", "continuous", and "categorical". If "auto", the type is inferred from the cardinality of the unique values passed to the `fit` method. Returns @@ -105,15 +105,15 @@ def fit(self, X, y=None, groups=None, var_type="auto"): Returns the instance itself. """ self.random_state = check_random_state(self.random_state) - super().fit(X, None, groups=groups) - if isinstance(var_type, str): - self.var_type = [var_type for _ in range(self.n_groups)] + super().fit(X, None, features_groups=features_groups) + if isinstance(features_type, str): + self.features_type = [features_type for _ in range(self.n_features_groups)] else: - self.var_type = var_type + self.features_type = features_type self._list_imputation_models = [ ConditionalSampler( - data_type=self.var_type[groupd_id], + data_type=self.features_type[features_groupd_id], model_regression=( None if self.imputation_model_continuous is None @@ -127,25 +127,27 @@ def fit(self, X, y=None, groups=None, var_type="auto"): random_state=self.random_state, categorical_max_cardinality=self.categorical_max_cardinality, ) - for groupd_id in range(self.n_groups) + for features_groupd_id in range(self.n_features_groups) ] # Parallelize the fitting of the covariate estimators X_ = np.asarray(X) self._list_imputation_models = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_fit_one_group)(estimator, X_, groups_ids) - for groups_ids, estimator in zip( - self._groups_ids, self._list_imputation_models + delayed(self._joblib_fit_one_features_group)( + imputation_model, X_, features_groups_ids + ) + for features_groups_ids, imputation_model in zip( + self._features_groups_ids, self._list_imputation_models 
) ) return self - def _joblib_fit_one_group(self, estimator, X, groups_ids): - """Fit a single imputation model, for a single group of variables. This method + def _joblib_fit_one_features_group(self, estimator, X, features_groups_ids): + """Fit a single imputation model, for a single group of features. This method is parallelized.""" - X_j = X[:, groups_ids].copy() - X_minus_j = np.delete(X, groups_ids, axis=1) + X_j = X[:, features_groups_ids].copy() + X_minus_j = np.delete(X, features_groups_ids, axis=1) estimator.fit(X_minus_j, X_j) return estimator @@ -165,12 +167,12 @@ def _check_fit(self, X): Raises ------ ValueError - If the method has not been fitted (i.e., if n_groups, groups, - or _groups_ids attributes are missing) or if imputation models + If the method has not been fitted (i.e., if n_features_groups, features_groups, + or _features_groups_ids attributes are missing) or if imputation models are not fitted. AssertionError If the number of features in X does not match the total number - of features in the grouped variables. + of features in the grouped features. 
""" super()._check_fit(X) if len(self._list_imputation_models) == 0: @@ -180,11 +182,11 @@ def _check_fit(self, X): for m in self._list_imputation_models: check_is_fitted(m.model) - def _permutation(self, X, group_id): + def _permutation(self, X, features_group_id): """Sample from the conditional distribution using a permutation of the residuals.""" - X_j = X[:, self._groups_ids[group_id]].copy() - X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1) - return self._list_imputation_models[group_id].sample( + X_j = X[:, self._features_groups_ids[features_group_id]].copy() + X_minus_j = np.delete(X, self._features_groups_ids[features_group_id], axis=1) + return self._list_imputation_models[features_group_id].sample( X_minus_j, X_j, n_samples=self.n_permutations ) diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index c9c64c464..554e6bc92 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -18,9 +18,9 @@ def __init__( """ Leave-One-Covariate-Out (LOCO) as presented in :footcite:t:`lei2018distribution` and :footcite:t:`verdinelli2024feature`. - The model is re-fitted for each variable/group of variables. The importance is + The model is re-fitted for each feature/group of features. The importance is then computed as the difference between the loss of the full model and the loss - of the model without the variable/group. + of the model without the feature/group. Parameters ---------- @@ -35,7 +35,7 @@ def __init__( "decision_function", "transform". n_jobs : int, default=1 The number of jobs to run in parallel. Parallelization is done over the - variables or groups of variables. + features or groups of features. Notes ----- @@ -55,7 +55,7 @@ def __init__( ) self._list_estimators = [] - def fit(self, X, y, groups=None): + def fit(self, X, y, features_groups=None): """Fit a model after removing each covariate/group of covariates. 
Parameters @@ -64,7 +64,7 @@ def fit(self, X, y, groups=None): The training input samples. y : array-like of shape (n_samples,) The target values. - groups : dict, default=None + features_groups : dict, default=None A dictionary where the keys are the group names and the values are the indices of the covariates in each group. @@ -73,32 +73,42 @@ def fit(self, X, y, groups=None): self : object Returns the instance itself. """ - super().fit(X, y, groups) + super().fit(X, y, features_groups) # create a list of covariate estimators for each group if not provided - self._list_estimators = [clone(self.estimator) for _ in range(self.n_groups)] + self._list_estimators = [ + clone(self.estimator) for _ in range(self.n_features_groups) + ] # Parallelize the fitting of the covariate estimators self._list_estimators = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_fit_one_group)(estimator, X, y, key_groups) - for key_groups, estimator in zip(self.groups.keys(), self._list_estimators) + delayed(self._joblib_fit_one_features_group)( + estimator, X, y, key_features_groups + ) + for key_features_groups, estimator in zip( + self.features_groups.keys(), self._list_estimators + ) ) return self - def _joblib_fit_one_group(self, estimator, X, y, key_groups): + def _joblib_fit_one_features_group(self, estimator, X, y, key_features_groups): """Fit the estimator after removing a group of covariates. Used in parallel.""" if isinstance(X, pd.DataFrame): - X_minus_j = X.drop(columns=self.groups[key_groups]) + X_minus_j = X.drop(columns=self.features_groups[key_features_groups]) else: - X_minus_j = np.delete(X, self.groups[key_groups], axis=1) + X_minus_j = np.delete(X, self.features_groups[key_features_groups], axis=1) estimator.fit(X_minus_j, y) return estimator - def _joblib_predict_one_group(self, X, group_id, key_groups): - """Predict the target variable after removing a group of covariates. 
+ def _joblib_predict_one_features_group( + self, X, features_group_id, key_features_groups + ): + """Predict the target feature after removing a group of covariates. Used in parallel.""" - X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1) + X_minus_j = np.delete(X, self._features_groups_ids[features_group_id], axis=1) - y_pred_loco = getattr(self._list_estimators[group_id], self.method)(X_minus_j) + y_pred_loco = getattr(self._list_estimators[features_group_id], self.method)( + X_minus_j + ) return [y_pred_loco] diff --git a/src/hidimstat/marginal/__init__.py b/src/hidimstat/marginal/__init__.py new file mode 100644 index 000000000..42d9a366b --- /dev/null +++ b/src/hidimstat/marginal/__init__.py @@ -0,0 +1,3 @@ +from .leave_one_covariate_in import LeaveOneCovariateIn + +__all__ = ["LeaveOneCovariateIn"] diff --git a/src/hidimstat/marginal/leave_one_covariate_in.py b/src/hidimstat/marginal/leave_one_covariate_in.py new file mode 100644 index 000000000..1d56e4403 --- /dev/null +++ b/src/hidimstat/marginal/leave_one_covariate_in.py @@ -0,0 +1,243 @@ +import numpy as np +from joblib import Parallel, delayed +from sklearn.base import check_is_fitted, clone +from sklearn.metrics import root_mean_squared_error + +from hidimstat._utils.utils import _check_vim_predict_method +from hidimstat.base_variable_importance import ( + BaseVariableImportance, + VariableImportanceFeatureGroup, +) + + +class LeaveOneCovariateIn(BaseVariableImportance, VariableImportanceFeatureGroup): + def __init__( + self, + estimator, + loss: callable = root_mean_squared_error, + method: str = "predict", + n_jobs: int = 1, + ): + """ + Leave One Covariate In. + For more details, see the section 7.2 of :footcite:t:`ewald2024guide`. + + Parameters + ---------- + estimator : sklearn compatible estimator, optional + The estimator to use for the prediction. 
def fit(self, X, y, features_groups=None):
    """
    Fit one model per group of features.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The input samples.
    y : array-like of shape (n_samples,)
        The target values.
    features_groups : dict, optional
        A dictionary where the keys are group identifiers and the values are lists
        of feature indices or names for each group. If None, each feature is
        treated as its own group.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    # The parent fit stores the groups and their column indices.
    super().fit(X, y, features_groups)
    X_ = np.asarray(X)
    y_ = np.asarray(y)

    # Parallelize the fitting of the models, one job per group of features.
    self._list_univariate_model = Parallel(n_jobs=self.n_jobs)(
        delayed(self._joblib_fit_one_features_group)(X_, y_, features_groups_ids)
        for features_groups_ids in self._features_groups_ids
    )
    return self
+ + Returns + ------- + out : array-like of shape (n_features_groups, n_samples) + The predictions for each group of features. + """ + self._check_fit(X) + X_ = np.asarray(X) + + # Parallelize the computation of the importance scores for each group + out_list = Parallel(n_jobs=self.n_jobs)( + delayed(self._joblib_predict_one_features_group)( + X_, features_group_id, features_groups_ids + ) + for features_group_id, features_groups_ids in enumerate( + self._features_groups_ids + ) + ) + return np.array(out_list) + + def importance(self, X, y): + """ + Compute the marginal importance scores for each group of features. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + y : array-like of shape (n_samples,) + The target values. + + Returns + ------- + out_dict : dict + A dictionary containing: + - 'loss_reference' : float + Loss of the original model predictions + - 'loss' : dict + Losses for each group's univariate predictions + - 'importance' : ndarray of shape (n_features_groups,) + Marginal importance scores for each feature group + """ + self._check_fit(X) + + y_pred = self.predict(X) + + # reference to a dummy model + if len(y_pred[0].shape) == 1 or y_pred[0].shape[1] == 1: + # Regression: take the average value as reference + y_ref = np.mean(y) * np.ones_like(y_pred[0]) + self.loss_reference_ = self.loss(y, y_ref) + else: + # Classification: take the most frequent value + values, counts = np.unique(y, return_counts=True) + y_ref = np.zeros_like(y_pred[0]) + y_ref[:, np.argmax(counts)] = 1.0 + self.loss_reference_ = self.loss(y, y_ref) + + self.importances_ = [] + for y_pred_j in y_pred: + self.importances_.append(self.loss_reference_ - self.loss(y, y_pred_j)) + self.pvalues_ = None # estimated pvlaue for method + return self.importances_ + + def fit_importance(self, X, y, cv, features_groups=None): + """ + Fits the model to the data and computes feature importance. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input data. + y : array-like of shape (n_samples,) + The target values. + cv : + Cross-validation parameter. + features_groups : dict, optional + A dictionary where the keys are group identifiers and the values are lists + of feature indices or names for each group. If None, each feature is + treated as its own group. + + Returns + ------- + importance : array-like + The computed feature importance scores. + """ + list_attribute_saved = ["importances_", "pvalues_", "_list_univariate_model"] + save_value_attributes = [] + for train_index, test_index in cv.split(X): + X_train, X_test = X[train_index], X[test_index] + y_train, y_test = y[train_index], y[test_index] + self.fit(X_train, y_train, features_groups=features_groups) + self.importance(X_test, y_test) + save_value_attributes.append( + [getattr(self, attribute) for attribute in list_attribute_saved] + ) + # create an array of attributes: + for attribute in list_attribute_saved: + setattr(self, attribute, []) + for value_attribute in save_value_attributes: + for attribute, value in zip(list_attribute_saved, value_attribute): + getattr(self, attribute).append(value) + + return np.mean(self.importances_, axis=0) + + def _joblib_fit_one_features_group(self, X, y, features_group_ids): + """ + Helper function to fit a univariate model for a single group. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + y : array-like of shape (n_samples,) + The target values. + features_group_ids : array-like + The indices of features belonging to this group. + + Returns + ------- + object + The fitted univariate model for this group. 
+ """ + univariate_model = clone(self.estimator) + return univariate_model.fit( + X[:, features_group_ids].reshape(-1, len(features_group_ids)), y + ) + + def _joblib_predict_one_features_group( + self, X, index_features_group, features_group_ids + ): + """ + Helper function to predict for a single group. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + index_features_group : int + The index of the group in _list_univariate_model. + features_group_ids : array-like + The indices of features belonging to this group. + + Returns + ------- + float + The prediction score for this group. + """ + y_pred_loci = getattr( + self._list_univariate_model[index_features_group], self.method + )(X[:, features_group_ids].reshape(-1, len(features_group_ids))) + return y_pred_loci diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 29d007656..58ae363eb 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -17,9 +17,9 @@ def __init__( ): """ Permutation Feature Importance algorithm as presented in - :footcite:t:`breimanRandomForests2001`. For each variable/group of variables, + :footcite:t:`breimanRandomForests2001`. For each feature/group of features, the importance is computed as the difference between the loss of the initial - model and the loss of the model with the variable/group permuted. + model and the loss of the model with the feature/group permuted. The method was also used in :footcite:t:`mi2021permutation` Parameters @@ -35,9 +35,9 @@ def __init__( "decision_function", "transform". n_jobs : int, default=1 The number of jobs to run in parallel. Parallelization is done over the - variables or groups of variables. + features or groups of features. n_permutations : int, default=50 - The number of permutations to perform. For each variable/group of variables, + The number of permutations to perform. 
from copy import deepcopy

import numpy as np
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import log_loss, root_mean_squared_error
from sklearn.model_selection import KFold, train_test_split

from hidimstat import LOCI

# Argument names shared by every parametrization in this module.
DATA_GENERATOR_ARGNAMES = (
    "n_samples, n_features, support_size, rho, seed, value, "
    "signal_noise_ratio, rho_serial"
)


def _split_ids_and_values(table):
    """Split rows shaped ``(id, *values)`` into (ids, list of value tuples)."""
    ids = [row[0] for row in table]
    values = [row[1:] for row in table]
    return ids, values


def configure_linear_categorial_loci(X, y):
    """
    Run LOCI with a linear regression on a train/test split of the data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data, one column per feature and one row per sample.
    y : array-like of shape (n_samples,)
        Target variable.

    Returns
    -------
    importance : ndarray
        The value returned by ``LOCI.importance`` on the test split,
        converted to an array (one score per feature).

    Notes
    -----
    Steps:
    1. Split the data into training and test sets
    2. Fit a linear regression on the training set
    3. Build LOCI from this fitted regression
    4. Fit LOCI on the training set and score it on the test set
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # LOCI requires an estimator that is already fitted
    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    loci = LOCI(estimator=regression_model, method="predict", n_jobs=1)
    # every feature is its own group
    loci.fit(X_train, y_train, features_groups=None)
    # importance is evaluated on held-out data
    return np.array(loci.importance(X_test, y_test))


parameter_exact = [
    ("HiDim", 150, 200, 1, 0.0, 42, 1.0, np.inf, 0.0),
    ("HiDim with noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.0),
    ("HiDim with correlated noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.5),
    ("HiDim with correlated features", 150, 200, 1, 0.8, 42, 1.0, np.inf, 0.0),
    ("HiDim with high level noise", 150, 200, 10, 0.2, 42, 1.0, 0.5, 0.0),
]
_EXACT_IDS, _EXACT_VALUES = _split_ids_and_values(parameter_exact)


@pytest.mark.parametrize(DATA_GENERATOR_ARGNAMES, _EXACT_VALUES, ids=_EXACT_IDS)
def test_loci_linear_data_exact(data_generator):
    """The top-ranked feature is an important one on linear data."""
    X, y, important_features, _ = data_generator

    importance = configure_linear_categorial_loci(X, y)
    # one score per feature
    assert importance.shape == (X.shape[1],)
    # the feature with the highest score belongs to the support
    top_features = np.argsort(importance)[-1:]
    assert all(int(index) in important_features for index in top_features)


parameter_bad_detection = [
    ("HiDim with high correlated features", 150, 200, 1, 1.0, 42, 1.0, 5.0, 0.0),
]
_BAD_IDS, _BAD_VALUES = _split_ids_and_values(parameter_bad_detection)


@pytest.mark.parametrize(DATA_GENERATOR_ARGNAMES, _BAD_VALUES, ids=_BAD_IDS)
def test_loci_linear_data_fail(data_generator):
    """The top-ranked feature is not an important one in this setting."""
    X, y, important_features, _ = data_generator

    importance = configure_linear_categorial_loci(X, y)
    # one score per feature
    assert importance.shape == (X.shape[1],)
    # the feature with the highest score is outside the support
    top_features = np.argsort(importance)[-1:]
    assert any(int(index) not in important_features for index in top_features)


@pytest.mark.parametrize(DATA_GENERATOR_ARGNAMES, _EXACT_VALUES, ids=_EXACT_IDS)
def test_loci_classication(data_generator):
    """Test LOCI for a classification problem"""
    X, y, important_features, not_important_features = data_generator
    # Bin the continuous target into four classes
    y_clf = deepcopy(y)
    y_clf[y > 4] = 0
    y_clf[(y <= 4) & (y > 0)] = 1
    y_clf[(y <= 0) & (y > -4)] = 2
    y_clf[y <= -4] = 3
    y_clf = np.array(y_clf, dtype=int)

    # LOCI requires an estimator that is already fitted
    logistic_model = LogisticRegression()
    logistic_model.fit(X, y_clf)

    loci = LOCI(
        estimator=logistic_model,
        n_jobs=1,
        method="predict_proba",
        loss=log_loss,
    )
    cross_validation = KFold(n_splits=5, shuffle=True, random_state=0)
    importance = loci.fit_importance(X, y_clf, cv=cross_validation)

    # one averaged score per feature
    assert importance.shape == (X.shape[1],)
    # important features score higher on average than the other ones
    mean_important = importance[important_features].mean()
    mean_not_important = importance[not_important_features].mean()
    assert mean_important > mean_not_important


##############################################################################
## Test specific options of loci
@pytest.mark.parametrize(
    DATA_GENERATOR_ARGNAMES,
    [(150, 200, 10, 0.0, 42, 1.0, np.inf, 0.0)],
    ids=["high dimension"],
)
def test_loci_group(data_generator):
    """Test LOCI with groups using pandas objects"""
    X, y, important_features, not_important_features = data_generator

    # Name the columns and build two groups from these names
    column_names = [f"col_{i}" for i in range(X.shape[1])]
    groups = {
        "group_0": [f"col_{i}" for i in important_features],
        "the_group_1": [f"col_{i}" for i in not_important_features],
    }
    X_df = pd.DataFrame(X, columns=column_names)
    X_train_df, X_test_df, y_train, y_test = train_test_split(X_df, y, random_state=0)

    # LOCI requires an estimator that is already fitted
    regression_model = LinearRegression()
    regression_model.fit(X_train_df, y_train)

    loci = LOCI(estimator=regression_model, method="predict", n_jobs=1)
    loci.fit(X_train_df, y_train, features_groups=groups)
    importance = np.array(loci.importance(X_test_df, y_test))

    # one score per group
    assert importance.shape == (2,)
    # the group of important features scores higher than the other group
    assert importance[0] > importance[1]


##############################################################################
@pytest.mark.parametrize(
    DATA_GENERATOR_ARGNAMES,
    [(150, 200, 1, 0.0, 42, 1.0, 0.0, 0.0)],
    ids=["default data"],
)
class TestLOCIClass:
    """Test the element of the class"""

    def test_loci_init(self, data_generator):
        """Test LOCI initialization"""
        X, y, _, _ = data_generator
        fitted_model = LinearRegression().fit(X, y)
        loci = LOCI(estimator=fitted_model, method="predict")
        # defaults of the constructor
        assert loci.n_jobs == 1
        assert loci.loss == root_mean_squared_error
        assert loci.method == "predict"

    def test_loci_fit(self, data_generator):
        """Test fitting LOCI"""
        X, y, _, _ = data_generator
        fitted_model = LinearRegression().fit(X, y)
        loci = LOCI(estimator=fitted_model)

        # without groups: one univariate model per feature
        loci.fit(X, y)
        assert len(loci._list_univariate_model) == X.shape[1]
        assert loci.n_features_groups == X.shape[1]

        # with groups: one model per group
        groups = {"g1": [0, 1], "g2": [2, 3, 4]}
        loci.fit(X, y, features_groups=groups)
        assert len(loci._list_univariate_model) == 2
        assert loci.n_features_groups == 2

    def test_loci_categorical(
        self,
        n_samples,
        n_features,
        support_size,
        rho,
        seed,
        value,
        signal_noise_ratio,
        rho_serial,
    ):
        """Test LOCI with categorical variables"""
        rng = np.random.default_rng(seed)
        # two continuous columns, one integer-coded column, random target
        X_cont = rng.random((n_samples, 2))
        X_cat = rng.integers(low=0, high=3, size=(n_samples, 1))
        X = np.hstack([X_cont, X_cat])
        y = rng.random((n_samples, 1))
        fitted_model = LinearRegression().fit(X, y)

        loci = LOCI(estimator=fitted_model)

        importances = loci.fit_importance(X, y, cv=KFold())
        assert len(importances) == 3
        assert np.all(importances < 0)  # no informative, worse than dummy model


##############################################################################
@pytest.mark.parametrize(
    DATA_GENERATOR_ARGNAMES,
    [(150, 200, 10, 0.0, 42, 1.0, 0.0, 0.0)],
    ids=["default data"],
)
class TestLOCIExceptions:
    """Test class for LOCI exceptions"""

    def test_unfitted_estimator(self, data_generator):
        """Test when using an unfitted estimator"""
        with pytest.raises(NotFittedError):
            LOCI(estimator=LinearRegression(), method="predict")

    def test_unknown_predict_method(self, data_generator):
        """Test when an unknown prediction method is provided"""
        X, y, _, _ = data_generator
        fitted_model = LinearRegression().fit(X, y)

        with pytest.raises(ValueError):
            LOCI(estimator=fitted_model, method="unknown method")

    def test_unfitted_predict(self, data_generator):
        """Test predict method with unfitted model"""
        X, y, _, _ = data_generator
        fitted_model = LinearRegression().fit(X, y)
        loci = LOCI(estimator=fitted_model, method="predict")

        with pytest.raises(ValueError, match="The class is not fitted."):
            loci.predict(X)

    def test_unfitted_importance(self, data_generator):
        """Test importance method with unfitted model"""
        X, y, _, _ = data_generator
        fitted_model = LinearRegression().fit(X, y)
        loci = LOCI(estimator=fitted_model, method="predict")

        with pytest.raises(ValueError, match="The class is not fitted."):
            loci.importance(X, y)

    def test_invalid_groups_format(self, data_generator):
        """Test when groups are provided in invalid format"""
        X, y, _, _ = data_generator
        fitted_model = LinearRegression().fit(X, y)
        loci = LOCI(estimator=fitted_model, method="predict")

        invalid_groups = ["group1", "group2"]  # Should be dictionary
        with pytest.raises(ValueError, match="groups needs to be a dictionnary"):
            loci.fit(X, y, features_groups=invalid_groups)

    def test_groups_warning(self, data_generator):
        """Test if a subgroup raise a warning"""
        X, y, _, _ = data_generator
        fitted_model = LinearRegression().fit(X, y)
        loci = LOCI(estimator=fitted_model, method="predict")
        # the groups cover only 4 of the 200 features
        subgroups = {"group1": [0, 1], "group2": [2, 3]}
        loci.fit(X, y, features_groups=subgroups)

        expected_message = (
            "The number of features in X: 200 differs from the number of "
            "features for which importance is computed: 4"
        )
        with pytest.warns(UserWarning, match=expected_message):
            loci.importance(X, y)
a/test/test_base_perturbation.py +++ b/test/test_base_perturbation.py @@ -11,4 +11,4 @@ def test_no_implemented_methods(): estimator.fit(X[:, 0], X[:, 1]) basic_class = BasePerturbation(estimator=estimator) with pytest.raises(NotImplementedError): - basic_class._permutation(X, group_id=None) + basic_class._permutation(X, features_group_id=None) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index f3bec9735..497266936 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -62,8 +62,8 @@ def run_cfi(X, y, n_permutation, seed): # fit the model using the training set cfi.fit( X_train, - groups=None, - var_type="auto", + features_groups=None, + features_type="auto", ) # calculate feature importance using the test set vim = cfi.importance(X_test, y_test) @@ -194,8 +194,8 @@ def test_group(data_generator): ) cfi.fit( X_train_df, - groups=groups, - var_type="continuous", + features_groups=groups, + features_type="continuous", ) # Warning expected since column names in pandas are not considered with pytest.warns(UserWarning, match="X does not have valid feature names, but"): @@ -245,8 +245,8 @@ def test_classication(data_generator): ) cfi.fit( X_train, - groups=None, - var_type=["continuous"] * X.shape[1], + features_groups=None, + features_type=["continuous"] * X.shape[1], ) vim = cfi.importance(X_test, y_test_clf) importance = vim["importance"] @@ -297,13 +297,13 @@ def test_fit(self, data_generator): # Test fit with auto var_type cfi.fit(X) assert len(cfi._list_imputation_models) == X.shape[1] - assert cfi.n_groups == X.shape[1] + assert cfi.n_features_groups == X.shape[1] # Test fit with specified groups groups = {"g1": [0, 1], "g2": [2, 3, 4]} - cfi.fit(X, groups=groups) + cfi.fit(X, features_groups=groups) assert len(cfi._list_imputation_models) == 2 - assert cfi.n_groups == 2 + assert cfi.n_features_groups == 2 def test_categorical( self, @@ -331,8 +331,8 @@ 
def test_categorical( random_state=seed + 1, ) - var_type = ["continuous", "continuous", "categorical"] - cfi.fit(X, y, var_type=var_type) + features_type = ["continuous", "continuous", "categorical"] + cfi.fit(X, y, features_type=features_type) importances = cfi.importance(X, y)["importance"] assert len(importances) == 3 @@ -415,7 +415,7 @@ def test_invalid_type(self, data_generator): # Test error when passing invalid var_type with pytest.raises(ValueError, match="type of data 'invalid' unknow."): - cfi.fit(X, var_type="invalid") + cfi.fit(X, features_type="invalid") def test_invalid_n_permutations(self, data_generator): """Test when invalid number of permutations is provided""" @@ -434,7 +434,7 @@ def test_not_good_type_X(self, data_generator): imputation_model_continuous=LinearRegression(), method="predict", ) - cfi.fit(X, groups=None, var_type="auto") + cfi.fit(X, features_groups=None, features_type="auto") with pytest.raises( ValueError, match="X should be a pandas dataframe or a numpy array." @@ -450,7 +450,7 @@ def test_mismatched_features(self, data_generator): imputation_model_continuous=LinearRegression(), method="predict", ) - cfi.fit(X, groups=None, var_type="auto") + cfi.fit(X, features_groups=None, features_type="auto") with pytest.raises( AssertionError, match="X does not correspond to the fitting data." 
@@ -473,7 +473,7 @@ def test_mismatched_features_string(self, data_generator): "col_" + str(i) for i in range(int(X.shape[1] / 2), X.shape[1] - 3) ], } - cfi.fit(X, groups=subgroups, var_type="auto") + cfi.fit(X, features_groups=subgroups, features_type="auto") with pytest.raises( AssertionError, @@ -499,8 +499,8 @@ def test_internal_error(self, data_generator): "col_" + str(i) for i in range(int(X.shape[1] / 2), X.shape[1] - 3) ], } - cfi.fit(X, groups=subgroups, var_type="auto") - cfi.groups["group1"] = [None for i in range(100)] + cfi.fit(X, features_groups=subgroups, features_type="auto") + cfi.features_groups["group1"] = [None for i in range(100)] X = X.to_records(index=False) X = np.array(X, dtype=X.dtype.descr) @@ -517,7 +517,9 @@ def test_invalid_var_type(self, data_generator): cfi = CFI(estimator=fitted_model, method="predict") with pytest.raises(ValueError, match="type of data 'invalid_type' unknow."): - cfi.fit(X, groups=None, var_type=["invalid_type"] * X.shape[1]) + cfi.fit( + X, features_groups=None, features_type=["invalid_type"] * X.shape[1] + ) def test_incompatible_imputer(self, data_generator): """Test when incompatible imputer is provided""" @@ -548,7 +550,7 @@ def test_invalid_groups_format(self, data_generator): invalid_groups = ["group1", "group2"] # Should be dictionary with pytest.raises(ValueError, match="groups needs to be a dictionnary"): - cfi.fit(X, groups=invalid_groups, var_type="auto") + cfi.fit(X, features_groups=invalid_groups, features_type="auto") def test_groups_warning(self, data_generator): """Test if a subgroup raise a warning""" @@ -560,7 +562,7 @@ def test_groups_warning(self, data_generator): method="predict", ) subgroups = {"group1": [0, 1], "group2": [2, 3]} - cfi.fit(X, y, groups=subgroups, var_type="auto") + cfi.fit(X, y, features_groups=subgroups, features_type="auto") with pytest.warns( UserWarning, diff --git a/test/test_leave_one_covariate_out.py b/test/test_leave_one_covariate_out.py index d8fd2a763..d4a6bf3ee 
100644 --- a/test/test_leave_one_covariate_out.py +++ b/test/test_leave_one_covariate_out.py @@ -36,7 +36,7 @@ def test_loco(): loco.fit( X_train, y_train, - groups=None, + features_groups=None, ) vim = loco.importance(X_test, y_test) @@ -63,7 +63,7 @@ def test_loco(): loco.fit( X_train_df, y_train, - groups=groups, + features_groups=groups, ) # warnings because we doesn't considere the name of columns of pandas with pytest.warns(UserWarning, match="X does not have valid feature names, but"): @@ -87,7 +87,10 @@ def test_loco(): loco_clf.fit( X_train, y_train_clf, - groups={"group_0": important_features, "the_group_1": non_important_features}, + features_groups={ + "group_0": important_features, + "the_group_1": non_important_features, + }, ) vim_clf = loco_clf.importance(X_test, y_test_clf) diff --git a/test/test_permutation_feature_importance.py b/test/test_permutation_feature_importance.py index b9639f359..eb200f63b 100644 --- a/test/test_permutation_feature_importance.py +++ b/test/test_permutation_feature_importance.py @@ -37,7 +37,7 @@ def test_permutation_importance(): pfi.fit( X_train, y_train, - groups=None, + features_groups=None, ) vim = pfi.importance(X_test, y_test) @@ -66,7 +66,7 @@ def test_permutation_importance(): pfi.fit( X_train_df, y_train, - groups=groups, + features_groups=groups, ) # warnings because we doesn't considere the name of columns of pandas with pytest.warns(UserWarning, match="X does not have valid feature names, but"): @@ -93,7 +93,7 @@ def test_permutation_importance(): pfi_clf.fit( X_train, y_train_clf, - groups=None, + features_groups=None, ) vim_clf = pfi_clf.importance(X_test, y_test_clf)