diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index 3e8adf338..e7b9a7e25 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -6,6 +6,99 @@ from sklearn.base import BaseEstimator

 from hidimstat._utils.exception import InternalError
+from hidimstat.statistical_tools.multiple_testing import fdr_threshold
+
+
+def _selection_generic(
+    values,
+    k_best=None,
+    k_lowest=None,
+    percentile=None,
+    threshold_max=None,
+    threshold_min=None,
+):
+    """
+    Helper function for selecting features based on one of several criteria.
+
+    Parameters
+    ----------
+    values : array-like of shape (n_features,)
+        Values to use for feature selection (e.g., importance scores or p-values).
+    k_best : int, default=None
+        Selects the top k features based on values.
+    k_lowest : int, default=None
+        Selects the lowest k features based on values.
+    percentile : float, default=None
+        Selects features based on a specified percentile of values.
+    threshold_max : float, default=None
+        Selects features with values below the specified maximum threshold.
+    threshold_min : float, default=None
+        Selects features with values above the specified minimum threshold.
+
+    Returns
+    -------
+    selection : array-like of shape (n_features,)
+        Boolean array indicating the selected features.
+    """
+    n_criteria = np.sum(
+        [
+            criteria is not None
+            for criteria in [k_best, k_lowest, percentile, threshold_max, threshold_min]
+        ]
+    )
+    assert n_criteria <= 1, "Only one selection criterion is supported."
+    if k_best is not None:
+        assert k_best >= 1, "k_best needs to be positive or None"
+        if k_best > values.shape[0]:
+            warnings.warn(
+                f"k={k_best} is greater than n_features={values.shape[0]}. "
+                "All the features will be returned."
+            )
+        mask_k_best = np.zeros_like(values, dtype=bool)
+
+        # based on SelectKBest in Scikit-Learn
+        # Request a stable sort. Mergesort takes more memory (~40MB per
+        # megafeature on x86-64).
+        mask_k_best[np.argsort(values, kind="mergesort")[-k_best:]] = 1
+        return mask_k_best
+    elif k_lowest is not None:
+        assert k_lowest >= 1, "k_lowest needs to be positive or None"
+        if k_lowest > values.shape[0]:
+            warnings.warn(
+                f"k={k_lowest} is greater than n_features={values.shape[0]}. "
+                "All the features will be returned."
+            )
+        mask_k_lowest = np.zeros_like(values, dtype=bool)
+
+        # based on SelectKBest in Scikit-Learn
+        # Request a stable sort. Mergesort takes more memory (~40MB per
+        # megafeature on x86-64).
+        mask_k_lowest[np.argsort(values, kind="mergesort")[:k_lowest]] = 1
+        return mask_k_lowest
+    elif percentile is not None:
+        assert (
+            0 < percentile < 100
+        ), "percentile must be between 0 and 100 (exclusive). Got {}.".format(
+            percentile
+        )
+        # based on SelectPercentile in Scikit-Learn
+        threshold_percentile = np.percentile(values, 100 - percentile)
+        mask_percentile = values > threshold_percentile
+        ties = np.where(values == threshold_percentile)[0]
+        if len(ties):
+            max_feats = int(len(values) * percentile / 100)
+            kept_ties = ties[: max_feats - mask_percentile.sum()]
+            mask_percentile[kept_ties] = True
+        return mask_percentile
+    elif threshold_max is not None:
+        mask_threshold_max = values < threshold_max
+        return mask_threshold_max
+    elif threshold_min is not None:
+        mask_threshold_min = values > threshold_min
+        return mask_threshold_min
+    else:
+        no_mask = np.ones_like(values, dtype=bool)
+        return no_mask


 class BaseVariableImportance(BaseEstimator):
@@ -21,8 +114,6 @@ class BaseVariableImportance(BaseEstimator):
         The computed importance scores for each feature.
     pvalues_ : array-like of shape (n_features,), default=None
         The computed p-values for each feature.
-    selections_ : array-like of shape (n_features,), default=None
-        Binary mask indicating selected features.

     Methods
     -------
@@ -37,26 +128,32 @@ def __init__(self):
         super().__init__()
         self.importances_ = None
         self.pvalues_ = None
-        self.selections_ = None

-    def selection(
-        self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
+    def _check_importance(self):
+        """
+        Checks if the importance scores have been computed.
+        """
+        if self.importances_ is None:
+            raise ValueError(
+                "The importances need to be called before calling this method"
+            )
+
+    def importance_selection(
+        self, k_best=None, percentile=None, threshold_max=None, threshold_min=None
     ):
         """
         Selects features based on variable importance.
-        In case several arguments are different from None,
-        the returned selection is the conjunction of all of them.

         Parameters
         ----------
-        k_best : int, optional, default=None
+        k_best : int, default=None
             Selects the top k features based on importance scores.
-        percentile : float, optional, default=None
+        percentile : float, default=None
             Selects features based on a specified percentile of importance scores.
-        threshold : float, optional, default=None
-            Selects features with importance scores above the specified threshold.
-        threshold_pvalue : float, optional, default=None
-            Selects features with p-values below the specified threshold.
+        threshold_max : float, default=None
+            Selects features with importance scores below the specified maximum threshold.
+        threshold_min : float, default=None
+            Selects features with importance scores above the specified minimum threshold.

         Returns
         -------
@@ -64,77 +161,145 @@ def selection(
             Binary array indicating the selected features.
         """
         self._check_importance()
-        if k_best is not None:
-            if not isinstance(k_best, str) and k_best > self.importances_.shape[1]:
-                warnings.warn(
-                    f"k={k_best} is greater than n_features={self.importances_.shape[1]}. "
-                    "All the features will be returned."
-                )
-            assert k_best > 0, "k_best needs to be positive and not null"
-        if percentile is not None:
-            assert (
-                0 < percentile and percentile < 100
-            ), "percentile needs to be between 0 and 100"
-        if threshold_pvalue is not None:
-            assert (
-                0 < threshold_pvalue and threshold_pvalue < 1
-            ), "threshold_pvalue needs to be between 0 and 1"
-
-        # base on SelectKBest of Scikit-Learn
-        if k_best == "all":
-            mask_k_best = np.ones(self.importances_.shape, dtype=bool)
-        elif k_best == 0:
-            mask_k_best = np.zeros(self.importances_.shape, dtype=bool)
-        elif k_best is not None:
-            mask_k_best = np.zeros(self.importances_.shape, dtype=bool)
-
-            # Request a stable sort. Mergesort takes more memory (~40MB per
-            # megafeature on x86-64).
-            mask_k_best[np.argsort(self.importances_, kind="mergesort")[-k_best:]] = 1
-        else:
-            mask_k_best = np.ones(self.importances_.shape, dtype=bool)
-
-        # base on SelectPercentile of Scikit-Learn
-        if percentile == 100:
-            mask_percentile = np.ones(len(self.importances_), dtype=bool)
-        elif percentile == 0:
-            mask_percentile = np.zeros(len(self.importances_), dtype=bool)
-        elif percentile is not None:
-            threshold = np.percentile(self.importances_, 100 - percentile)
-            mask_percentile = self.importances_ > threshold
-            ties = np.where(self.importances_ == threshold)[0]
-            if len(ties):
-                max_feats = int(len(self.importances_) * percentile / 100)
-                kept_ties = ties[: max_feats - mask_percentile.sum()]
-                mask_percentile[kept_ties] = True
-        else:
-            mask_percentile = np.ones(self.importances_.shape, dtype=bool)
+        return _selection_generic(
+            self.importances_,
+            k_best=k_best,
+            percentile=percentile,
+            threshold_max=threshold_max,
+            threshold_min=threshold_min,
+        )

-        if threshold is not None:
-            mask_threshold = self.importances_ < threshold
-        else:
-            mask_threshold = np.ones(self.importances_.shape, dtype=bool)
+    def pvalue_selection(
+        self,
+        k_lowest=None,
+        percentile=None,
+        threshold_max=0.05,
+        threshold_min=None,
+        alternative_hypothesis=False,
+    ):
+        """
+        Selects features based on p-values.

-        # base on SelectFpr of Scikit-Learn
-        if threshold_pvalue is not None:
-            mask_threshold_pvalue = self.pvalues_ < threshold_pvalue
-        else:
-            mask_threshold_pvalue = np.ones(self.importances_.shape, dtype=bool)
+        Parameters
+        ----------
+        k_lowest : int, default=None
+            Selects the k features with the lowest p-values.
+        percentile : float, default=None
+            Selects features based on a specified percentile of p-values.
+        threshold_max : float, default=0.05
+            Selects features with p-values below the specified maximum threshold (0 to 1).
+        threshold_min : float, default=None
+            Selects features with p-values above the specified minimum threshold (0 to 1).
+        alternative_hypothesis : bool, default=False
+            If True, selects based on 1-pvalues instead of p-values.

-        self.selections_ = (
-            mask_k_best & mask_percentile & mask_threshold & mask_threshold_pvalue
+        Returns
+        -------
+        selection : array-like of shape (n_features,)
+            Binary array indicating the selected features (True for selected).
+        """
+        self._check_importance()
+        assert (
+            self.pvalues_ is not None
+        ), "The selection on p-value can't be done because the current method does not compute p-values."
+        if threshold_min is not None:
+            assert (
+                0 < threshold_min < 1
+            ), "threshold_min needs to be between 0 and 1"
+        if threshold_max is not None:
+            assert (
+                0 < threshold_max < 1
+            ), "threshold_max needs to be between 0 and 1"
+        assert alternative_hypothesis is None or isinstance(
+            alternative_hypothesis, bool
+        ), "alternative_hypothesis can have only three values: True, False and None."
+        return _selection_generic(
+            self.pvalues_ if not alternative_hypothesis else 1 - self.pvalues_,
+            k_lowest=k_lowest,
+            percentile=percentile,
+            threshold_max=threshold_max,
+            threshold_min=threshold_min,
         )
-        return self.selections_
-
-    def _check_importance(self):
+
+    def fdr_selection(
+        self,
+        fdr,
+        fdr_control="bhq",
+        reshaping_function=None,
+        alternative_hypothesis=False,
+    ):
         """
-        Checks if the importance scores have been computed.
+        Performs feature selection based on False Discovery Rate (FDR) control.
+
+        Parameters
+        ----------
+        fdr : float
+            The target false discovery rate level (between 0 and 1).
+        fdr_control : {'bhq', 'bhy'}, default='bhq'
+            The FDR control method to use:
+            - 'bhq': Benjamini-Hochberg procedure
+            - 'bhy': Benjamini-Hochberg-Yekutieli procedure
+        reshaping_function : callable or None, default=None
+            Optional reshaping function for FDR control methods.
+            If None, defaults to sum of reciprocals for 'bhy'.
+        alternative_hypothesis : bool or None, default=False
+            If False, selects features with small p-values.
+            If True, selects features with large p-values (close to 1).
+            If None, selects features that have either small or large p-values.
+
+        Returns
+        -------
+        selected : ndarray of bool
+            Boolean mask of selected features.
+            True indicates selected features, False indicates non-selected features.
+
+        Raises
+        ------
+        ValueError
+            If `importances_` has not been computed yet.
+        AssertionError
+            If `pvalues_` are missing or `fdr_control` is invalid.
         """
-        if self.importances_ is None:
-            raise ValueError(
-                "The importances need to be called before calling this method"
+        self._check_importance()
+        assert 0 < fdr < 1, "FDR needs to be strictly between 0 and 1"
+        assert (
+            self.pvalues_ is not None
+        ), "FDR-based selection requires p-values to be computed first. The current method does not support p-values."
+        assert (
+            fdr_control == "bhq" or fdr_control == "bhy"
+        ), "only 'bhq' and 'bhy' are supported"
+        assert alternative_hypothesis is None or isinstance(
+            alternative_hypothesis, bool
+        ), "alternative_hypothesis can have only three values: True, False and None."
+
+        # selection on pvalue
+        if alternative_hypothesis is None or not alternative_hypothesis:
+            threshold_pvalues = fdr_threshold(
+                self.pvalues_,
+                fdr=fdr,
+                method=fdr_control,
+                reshaping_function=reshaping_function,
             )
+            selected_pvalues = self.pvalues_ <= threshold_pvalues
+        else:
+            selected_pvalues = np.zeros_like(self.pvalues_, dtype=bool)
+
+        # selection on 1-pvalue
+        if alternative_hypothesis is None or alternative_hypothesis:
+            threshold_one_minus_pvalues = fdr_threshold(
+                1 - self.pvalues_,
+                fdr=fdr,
+                method=fdr_control,
+                reshaping_function=reshaping_function,
+            )
+            selected_one_minus_pvalues = (
+                1 - self.pvalues_
+            ) <= threshold_one_minus_pvalues
+        else:
+            selected_one_minus_pvalues = np.zeros_like(self.pvalues_, dtype=bool)
+
+        selected = selected_pvalues | selected_one_minus_pvalues
+        return selected

     def plot_importance(
         self,
diff --git a/src/hidimstat/distilled_conditional_randomization_test.py b/src/hidimstat/distilled_conditional_randomization_test.py
index 7cf58a315..c3cd1102d 100644
--- a/src/hidimstat/distilled_conditional_randomization_test.py
+++ b/src/hidimstat/distilled_conditional_randomization_test.py
@@ -664,10 +664,11 @@ def d0crt(
     scaled_statistics=False,
     random_state=None,
     reuse_screening_model=True,
-    k_best=None,
+    k_lowest=None,
     percentile=None,
-    threshold=None,
-    threshold_pvalue=None,
+    threshold_min=None,
+    threshold_max=None,
+    alternative_hypothesis=False,
 ):
     methods = D0CRT(
         estimator=estimator,
@@ -687,11 +688,12 @@ def d0crt(
         random_state=random_state,
     )
     methods.fit_importance(X, y, cv=cv)
-    selection = methods.selection(
-        k_best=k_best,
+    selection = methods.pvalue_selection(
+        k_lowest=k_lowest,
         percentile=percentile,
-        threshold=threshold,
-        threshold_pvalue=threshold_pvalue,
+        threshold_min=threshold_min,
+        threshold_max=threshold_max,
+        alternative_hypothesis=alternative_hypothesis,
     )
     return selection, methods.importances_, methods.pvalues_
@@ -702,7 +704,7 @@ def d0crt(
         D0CRT.__doc__,
         D0CRT.__init__.__doc__,
         D0CRT.fit_importance.__doc__,
-        D0CRT.selection.__doc__,
+        D0CRT.pvalue_selection.__doc__,
     ],
     """
     Returns
diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py
index e48538d30..ec8b739d1 100644
--- a/src/hidimstat/knockoffs.py
+++ b/src/hidimstat/knockoffs.py
@@ -458,13 +458,13 @@ def _stat_coefficient_diff(X, X_tilde, y, estimator, fdr, preconfigure_estimator
     test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:])

     # Compute the threshold level and select the important variables
-    ko_thr = _knockoff_threshold(test_score, fdr=fdr)
+    ko_thr = _fdr_threshold_on_symmetric_null(test_score, fdr=fdr)
     selected = np.where(test_score >= ko_thr)[0]

     return test_score, ko_thr, selected


-def _knockoff_threshold(test_score, fdr=0.1):
+def _fdr_threshold_on_symmetric_null(test_score, fdr=0.1):
     """
     Calculate the knockoff threshold based on the procedure stated in the article.
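For reference, the renamed knockoff helper implements the usual knockoff+ thresholding rule on a symmetric null; a self-contained sketch of that rule, assuming the Barber-Candès definition the docstring refers to (the function's internals are not shown in this patch):

    import numpy as np

    def knockoff_plus_threshold(test_score, fdr=0.1):
        # Smallest t among the nonzero |W_j| such that
        # (1 + #{j : W_j <= -t}) / max(1, #{j : W_j >= t}) <= fdr.
        for t in np.sort(np.abs(test_score[test_score != 0])):
            ratio = (1 + np.sum(test_score <= -t)) / max(1, np.sum(test_score >= t))
            if ratio <= fdr:
                return t
        return np.inf  # no threshold reaches the target FDR; select nothing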
Each row represents a sampling instance and each column a hypothesis test. - gamma : float, default=0.05 + gamma : float, default=0.5 Quantile level for aggregation. Must be in range (0,1]. adaptive : bool, default=False If True, uses adaptive quantile aggregation which optimizes over multiple gamma values. diff --git a/test/test_base_importance.py b/test/test_base_importance.py deleted file mode 100644 index f50487fbb..000000000 --- a/test/test_base_importance.py +++ /dev/null @@ -1,74 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import pytest - -from hidimstat.base_variable_importance import BaseVariableImportance - - -def test_plot_importance_axis(): - """Test argument axis of plot function""" - n_features = 10 - vi = BaseVariableImportance() - # Make the plot independent of data / randomness to test only the plotting function - vi.importances_ = np.arange(n_features) - ax_1 = vi.plot_importance(ax=None) - assert isinstance(ax_1, plt.Axes) - - _, ax_2 = plt.subplots() - vi.importances_ = np.random.standard_normal((3, n_features)) - ax_2_bis = vi.plot_importance(ax=ax_2) - assert isinstance(ax_2_bis, plt.Axes) - assert ax_2_bis == ax_2 - - -def test_plot_importance_ascending(): - """Test argument ascending of plot function""" - n_features = 10 - vi = BaseVariableImportance() - - # Make the plot independent of data / randomness to test only the plotting function - vi.importances_ = np.arange(n_features) - np.random.shuffle(vi.importances_) - - ax_decending = vi.plot_importance(ascending=False) - assert np.all( - ax_decending.containers[0].datavalues == np.flip(np.sort(vi.importances_)) - ) - - ax_ascending = vi.plot_importance(ascending=True) - assert np.all(ax_ascending.containers[0].datavalues == np.sort(vi.importances_)) - - -def test_plot_importance_feature_names(): - """Test argument feature of plot function""" - n_features = 10 - vi = BaseVariableImportance() - - # Make the plot independent of data / randomness to test only the plotting function - vi.importances_ = np.arange(n_features) - np.random.shuffle(vi.importances_) - - features_name = [str(j) for j in np.flip(np.argsort(vi.importances_))] - ax_none = vi.plot_importance(feature_names=None) - assert np.all( - np.array([label.get_text() for label in ax_none.get_yticklabels()]) - == features_name - ) - - features_name = ["features_" + str(j) for j in np.flip(np.sort(vi.importances_))] - ax_setup = vi.plot_importance(feature_names=features_name) - assert np.all( - np.array([label.get_text() for label in ax_setup.get_yticklabels()]) - == np.flip(np.array(features_name)[np.argsort(vi.importances_)]) - ) - - vi.features_groups = {str(j * 2): [] for j in np.flip(np.sort(vi.importances_))} - features_name = [str(j * 2) for j in np.flip(np.sort(vi.importances_))] - ax_none_group = vi.plot_importance(feature_names=None) - assert np.all( - np.array([label.get_text() for label in ax_none_group.get_yticklabels()]) - == np.flip(np.array(features_name)[np.argsort(vi.importances_)]) - ) - - with pytest.raises(ValueError, match="feature_names should be a list"): - ax_none_group = vi.plot_importance(feature_names="ttt") diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py new file mode 100644 index 000000000..a507f26de --- /dev/null +++ b/test/test_base_variable_importance.py @@ -0,0 +1,393 @@ +import matplotlib.pyplot as plt +import numpy as np +import pytest + +from hidimstat.base_variable_importance import BaseVariableImportance + + +@pytest.fixture +def set_100_variable_sorted(): + 
"""Create a BaseVariableImportance instance with test data for testing purposes. + + Parameters + ---------- + pvalues : bool + If True, generate random p-values for testing. + test_score : bool + If True, generate random test scores for testing. + seed : int + Random seed for reproducibility. + + Returns + ------- + BaseVariableImportance + A BaseVariableImportance instance with test data. + """ + seed = 0 + n_features = 100 + rng = np.random.RandomState(seed) + vi = BaseVariableImportance() + vi.importances_ = np.arange(n_features) + rng.shuffle(vi.importances_) + vi.pvalues_ = np.flip(np.sort(rng.random(n_features)))[vi.importances_] + return vi + + +class TestSelection: + """Test selection based on importance""" + + def test_selection_k_best(self, set_100_variable_sorted): + "test selection of the k_best" + vi = set_100_variable_sorted + true_value = vi.importances_ >= 95 + selection = vi.importance_selection(k_best=5) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_k_best_none(self, set_100_variable_sorted): + "test selection when there none" + vi = set_100_variable_sorted + true_value = np.ones_like(vi.importances_, dtype=bool) + selection = vi.importance_selection(k_best=None) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_k_lowest(self, set_100_variable_sorted): + "test selection of the k_lowest" + vi = set_100_variable_sorted + true_value = vi.pvalues_ < vi.pvalues_[np.argsort(vi.pvalues_)[5]] + selection = vi.pvalue_selection(k_lowest=5, threshold_max=None) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_k_lowest_none(self, set_100_variable_sorted): + "test selection when there none" + vi = set_100_variable_sorted + true_value = np.ones_like(vi.pvalues_ > 0, dtype=bool) + selection = vi.pvalue_selection(k_lowest=None, threshold_max=None) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile(self, set_100_variable_sorted): + "test selection bae on percentile" + vi = set_100_variable_sorted + true_value = vi.importances_ >= 50 + selection = vi.importance_selection(percentile=50) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile_all(self, set_100_variable_sorted): + "test selection when percentile is 100" + vi = set_100_variable_sorted + true_value = np.ones_like(vi.importances_, dtype=bool) + true_value[np.argsort(vi.importances_)[0]] = False + selection = vi.importance_selection(percentile=99.99) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile_none(self, set_100_variable_sorted): + "test selection when percentile is 0" + vi = set_100_variable_sorted + true_value = np.zeros_like(vi.importances_, dtype=bool) + true_value[np.argsort(vi.importances_)[-1:]] = True + selection = vi.importance_selection(percentile=0.1) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile_threshols_value(self, set_100_variable_sorted): + "test selection when percentile when the percentile equal on value" + vi = set_100_variable_sorted + mask = np.ones_like(vi.importances_, dtype=bool) + mask[np.where(vi.importances_ == 99)] = False + vi.importances_ = vi.importances_[mask] + true_value = vi.importances_ >= 50 + selection = vi.importance_selection(percentile=50) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_threshold_min(self, set_100_variable_sorted): + "test threshold minimal on importance" + vi = set_100_variable_sorted + true_value = 
vi.importances_ > 5 + selection = vi.importance_selection(threshold_min=5) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_threshold_max(self, set_100_variable_sorted): + "test threshold maximal on importance" + vi = set_100_variable_sorted + true_value = vi.importances_ < 5 + selection = vi.importance_selection(threshold_max=5) + np.testing.assert_array_equal(true_value, selection) + + +class TestSelectionFDR: + """Test selection based on fdr""" + + def test_selection_fdr_default(self, set_100_variable_sorted): + "test selection of the default" + vi = set_100_variable_sorted + selection = vi.fdr_selection(0.2) + assert np.all( + [ + i >= (vi.importances_ - np.sum(selection)) + for i in vi.importances_[selection] + ] + ) + + def test_selection_fdr_default_1(self, set_100_variable_sorted): + "test selection of the default" + vi = set_100_variable_sorted + vi.pvalues_ = np.random.rand(vi.importances_.shape[0]) * 30 + true_value = np.zeros_like(vi.importances_, dtype=bool) # selected any + selection = vi.fdr_selection(0.2) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_fdr_bhy(self, set_100_variable_sorted): + "test selection with bhy" + vi = set_100_variable_sorted + selection = vi.fdr_selection(0.2, fdr_control="bhy") + assert np.all( + [ + i >= (vi.importances_ - np.sum(selection)) + for i in vi.importances_[selection] + ] + ) + + def test_selection_fdr_alternative_hypothesis(self, set_100_variable_sorted): + "test selection fdr_control wrong" + vi = set_100_variable_sorted + with pytest.raises( + AssertionError, + match="alternative_hippothesis can have only three values: True, False and None.", + ): + vi.fdr_selection(fdr=0.1, alternative_hypothesis="alt") + + def test_selection_fdr_pvalue(self, set_100_variable_sorted): + "test selection fdr without 1-pvalue" + vi = set_100_variable_sorted + true_value = np.arange(100) <= 4 + selection = vi.fdr_selection(fdr=0.9, alternative_hypothesis=False) + np.testing.assert_equal( + true_value, np.flip(selection[np.argsort(vi.importances_)]) + ) + + def test_selection_fdr_one_minus_pvalue(self, set_100_variable_sorted): + "test selection fdr without 1-pvalue" + vi = set_100_variable_sorted + true_value = np.arange(100) >= 34 + selection = vi.fdr_selection(fdr=0.9, alternative_hypothesis=True) + np.testing.assert_equal( + true_value, np.flip(selection[np.argsort(vi.importances_)]) + ) + + def test_selection_fdr_two_side(self, set_100_variable_sorted): + "test selection fdr without 1-pvalue" + vi = set_100_variable_sorted + true_value = np.logical_or(np.arange(100) <= 4, np.arange(100) >= 34) + selection = vi.fdr_selection(fdr=0.9, alternative_hypothesis=None) + np.testing.assert_equal( + true_value, np.flip(selection[np.argsort(vi.importances_)]) + ) + + +class TestBVIExceptions: + """Test class for BVI Exception""" + + def test_not_fit(self): + "test detection unfit" + vi = BaseVariableImportance() + with pytest.raises( + ValueError, + match="The importances need to be called before calling this method", + ): + vi._check_importance() + with pytest.raises( + ValueError, + match="The importances need to be called before calling this method", + ): + vi.importance_selection() + + def test_selection_k_best(self, set_100_variable_sorted): + "test selection k_best wrong" + vi = set_100_variable_sorted + with pytest.raises(AssertionError, match="k_best needs to be positive"): + vi.importance_selection(k_best=-10) + with pytest.warns(Warning, match="k=1000 is greater than n_features="): + 
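+
+# Note on the fixture: pvalues_ is built as
+#     np.flip(np.sort(rng.random(n_features)))[importances_]
+# so p-values strictly decrease as importances increase, i.e.
+#     np.argsort(vi.pvalues_) == np.flip(np.argsort(vi.importances_))
+# The FDR tests above rely on this invariant when they reorder a selection
+# by importance before comparing it against a fixed expected mask.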
+
+
+class TestBVIExceptions:
+    """Test class for BVI Exception"""
+
+    def test_not_fit(self):
+        "test detection of an unfitted instance"
+        vi = BaseVariableImportance()
+        with pytest.raises(
+            ValueError,
+            match="The importances need to be called before calling this method",
+        ):
+            vi._check_importance()
+        with pytest.raises(
+            ValueError,
+            match="The importances need to be called before calling this method",
+        ):
+            vi.importance_selection()
+
+    def test_selection_k_best(self, set_100_variable_sorted):
+        "test selection with a wrong k_best"
+        vi = set_100_variable_sorted
+        with pytest.raises(AssertionError, match="k_best needs to be positive"):
+            vi.importance_selection(k_best=-10)
+        with pytest.warns(Warning, match="k=1000 is greater than n_features="):
+            vi.importance_selection(k_best=1000)
+
+    def test_selection_k_lowest(self, set_100_variable_sorted):
+        "test selection with a wrong k_lowest"
+        vi = set_100_variable_sorted
+        with pytest.raises(AssertionError, match="k_lowest needs to be positive"):
+            vi.pvalue_selection(k_lowest=-10, threshold_max=None)
+        with pytest.warns(Warning, match="k=1000 is greater than n_features="):
+            vi.pvalue_selection(k_lowest=1000, threshold_max=None)
+
+    def test_selection_percentile(self, set_100_variable_sorted):
+        "test selection with a wrong percentile"
+        vi = set_100_variable_sorted
+        with pytest.raises(
+            AssertionError,
+            match=r"percentile must be between 0 and 100 \(exclusive\). Got -1.",
+        ):
+            vi.importance_selection(percentile=-1)
+        with pytest.raises(
+            AssertionError,
+            match=r"percentile must be between 0 and 100 \(exclusive\). Got 102.",
+        ):
+            vi.importance_selection(percentile=102)
+        with pytest.raises(
+            AssertionError,
+            match=r"percentile must be between 0 and 100 \(exclusive\). Got 0.",
+        ):
+            vi.importance_selection(percentile=0)
+        with pytest.raises(
+            AssertionError,
+            match=r"percentile must be between 0 and 100 \(exclusive\). Got 100",
+        ):
+            vi.importance_selection(percentile=100)
+
+    def test_selection_pvalue_None(self, set_100_variable_sorted):
+        "test selection on p-values when they are missing"
+        vi = set_100_variable_sorted
+        vi.pvalues_ = None
+        with pytest.raises(
+            AssertionError,
+            match="The selection on p-value can't be done because the current method does not compute p-values.",
+        ):
+            vi.pvalue_selection(threshold_min=-1)
+
+    def test_selection_threshold(self, set_100_variable_sorted):
+        "test selection with wrong thresholds"
+        vi = set_100_variable_sorted
+        with pytest.raises(
+            AssertionError, match="threshold_min needs to be between 0 and 1"
+        ):
+            vi.pvalue_selection(threshold_min=-1)
+        with pytest.raises(
+            AssertionError, match="threshold_min needs to be between 0 and 1"
+        ):
+            vi.pvalue_selection(threshold_min=1.1)
+        with pytest.raises(
+            AssertionError, match="threshold_max needs to be between 0 and 1"
+        ):
+            vi.pvalue_selection(threshold_max=-1)
+        with pytest.raises(
+            AssertionError, match="threshold_max needs to be between 0 and 1"
+        ):
+            vi.pvalue_selection(threshold_max=1.1)
+        with pytest.raises(
+            AssertionError, match="Only one selection criterion is supported."
+        ):
+            vi.pvalue_selection(threshold_max=0.5, threshold_min=0.9)
+
+
+class TestSelectionFDRExceptions:
+    """Test exceptions of the fdr selection"""
+
+    def test_not_fit(self):
+        "test detection of an unfitted instance"
+        vi = BaseVariableImportance()
+        with pytest.raises(
+            ValueError,
+            match="The importances need to be called before calling this method",
+        ):
+            vi.fdr_selection(0.1)
+
+    def test_selection_fdr_wrong_fdr(self, set_100_variable_sorted):
+        "test fdr selection with a wrong fdr"
+        vi = set_100_variable_sorted
+        with pytest.raises(
+            AssertionError,
+            match="FDR needs to be strictly between 0 and 1",
+        ):
+            vi.fdr_selection(fdr=0.0)
+        with pytest.raises(
+            AssertionError,
+            match="FDR needs to be strictly between 0 and 1",
+        ):
+            vi.fdr_selection(fdr=1.0)
+        with pytest.raises(
+            AssertionError,
+            match="FDR needs to be strictly between 0 and 1",
+        ):
+            vi.fdr_selection(fdr=-1.0)
+
+    def test_selection_fdr_pvalue_None(self, set_100_variable_sorted):
+        "test fdr selection when p-values are missing"
+        vi = set_100_variable_sorted
+        vi.pvalues_ = None
+        with pytest.raises(
+            AssertionError,
+            match="FDR-based selection requires p-values to be computed first. The current method does not support p-values.",
+        ):
+            vi.fdr_selection(fdr=0.1)
+
+    def test_selection_fdr_fdr_control(self, set_100_variable_sorted):
+        "test fdr selection with a wrong fdr_control"
+        vi = set_100_variable_sorted
+        with pytest.raises(
+            AssertionError,
+            match="only 'bhq' and 'bhy' are supported",
+        ):
+            vi.fdr_selection(fdr=0.1, fdr_control="ehb")
+
+
+def test_plot_importance_axis():
+    """Test argument axis of plot function"""
+    n_features = 10
+    vi = BaseVariableImportance()
+    # Make the plot independent of data / randomness to test only the plotting function
+    vi.importances_ = np.arange(n_features)
+    ax_1 = vi.plot_importance(ax=None)
+    assert isinstance(ax_1, plt.Axes)
+
+    _, ax_2 = plt.subplots()
+    vi.importances_ = np.random.standard_normal((3, n_features))
+    ax_2_bis = vi.plot_importance(ax=ax_2)
+    assert isinstance(ax_2_bis, plt.Axes)
+    assert ax_2_bis == ax_2
+
+
+def test_plot_importance_ascending():
+    """Test argument ascending of plot function"""
+    n_features = 10
+    vi = BaseVariableImportance()
+
+    # Make the plot independent of data / randomness to test only the plotting function
+    vi.importances_ = np.arange(n_features)
+    np.random.shuffle(vi.importances_)
+
+    ax_descending = vi.plot_importance(ascending=False)
+    assert np.all(
+        ax_descending.containers[0].datavalues == np.flip(np.sort(vi.importances_))
+    )
+
+    ax_ascending = vi.plot_importance(ascending=True)
+    assert np.all(ax_ascending.containers[0].datavalues == np.sort(vi.importances_))
+
+
+def test_plot_importance_feature_names():
+    """Test argument feature_names of plot function"""
+    n_features = 10
+    vi = BaseVariableImportance()
+
+    # Make the plot independent of data / randomness to test only the plotting function
+    vi.importances_ = np.arange(n_features)
+    np.random.shuffle(vi.importances_)
+
+    features_name = [str(j) for j in np.flip(np.argsort(vi.importances_))]
+    ax_none = vi.plot_importance(feature_names=None)
+    assert np.all(
+        np.array([label.get_text() for label in ax_none.get_yticklabels()])
+        == features_name
+    )
+
+    features_name = ["features_" + str(j) for j in np.flip(np.sort(vi.importances_))]
+    ax_setup = vi.plot_importance(feature_names=features_name)
+    assert np.all(
+        np.array([label.get_text() for label in ax_setup.get_yticklabels()])
+        == np.flip(np.array(features_name)[np.argsort(vi.importances_)])
+    )
+
+    vi.features_groups = {str(j * 2): [] for j in np.flip(np.sort(vi.importances_))}
+    features_name = [str(j * 2) for j in np.flip(np.sort(vi.importances_))]
+    ax_none_group = vi.plot_importance(feature_names=None)
+    assert np.all(
+        np.array([label.get_text() for label in ax_none_group.get_yticklabels()])
+        == np.flip(np.array(features_name)[np.argsort(vi.importances_)])
+    )
+
+    with pytest.raises(ValueError, match="feature_names should be a list"):
+        ax_none_group = vi.plot_importance(feature_names="ttt")
diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py
index 9f6d82163..459525131 100644
--- a/test/test_conditional_feature_importance.py
+++ b/test/test_conditional_feature_importance.py
@@ -552,7 +552,6 @@ def test_incompatible_imputer(self, data_generator):
                 imputation_model_continuous="invalid_imputer",
                 method="predict",
             )
-            cfi.fit(X, y)

         with pytest.raises(AssertionError, match="Categorial imputation model invalid"):
             cfi = CFI(
@@ -560,7 +559,6 @@ def test_incompatible_imputer(self, data_generator):
                 imputation_model_categorical="invalid_imputer",
                 method="predict",
             )
-            cfi.fit(X, y)

     def test_invalid_groups_format(self, data_generator):
         """Test when groups are provided in invalid format"""
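All the test changes below follow a single mechanical pattern; for reviewers, the old and new calls map as follows (sketch; `d0crt_model` stands for any fitted D0CRT instance):

    # before this patch
    sv = d0crt_model.selection(threshold_pvalue=0.05)
    # after this patch: p-value-based selection is explicit, and the
    # cutoff is named as an upper bound on the p-values
    sv = d0crt_model.pvalue_selection(threshold_max=0.05)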
"""Test when groups are provided in invalid format""" diff --git a/test/test_distilled_conditional_randomization_test.py b/test/test_distilled_conditional_randomization_test.py index 53a8d2035..9b40565df 100644 --- a/test/test_distilled_conditional_randomization_test.py +++ b/test/test_distilled_conditional_randomization_test.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from numpy.random import RandomState from sklearn.covariance import LedoitWolf from sklearn.datasets import make_classification, make_regression from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor @@ -33,13 +32,13 @@ def test_dcrt_lasso_screening(generate_regression_dataset): screening_threshold=None, ) pvalue_no_screening = d0crt_no_screening.fit_importance(X, y) - sv_no_screening = d0crt_no_screening.selection(threshold_pvalue=0.05) + sv_no_screening = d0crt_no_screening.pvalue_selection(threshold_max=0.05) d0crt_screening = D0CRT( estimator=LassoCV(n_jobs=1), screening_threshold=10, ) pvalue_screening = d0crt_screening.fit_importance(X, y) - sv_screening = d0crt_screening.selection(threshold_pvalue=0.05) + sv_screening = d0crt_screening.pvalue_selection(threshold_max=0.05) assert np.sum(d0crt_no_screening.importances_ != 0) <= 10 assert np.sum(d0crt_screening.importances_ != 0) <= 10 assert len(sv_no_screening) <= 10 @@ -57,7 +56,7 @@ def test_dcrt_lasso_screening(generate_regression_dataset): ) d0crt_no_screening.fit_importance(X, y) pvalue_no_screening = d0crt_no_screening.importance(X, y) - sv_no_screening = d0crt_no_screening.selection(threshold_pvalue=0.05) + sv_no_screening = d0crt_no_screening.pvalue_selection(threshold_max=0.05) assert len(sv_no_screening) <= 10 assert len(pvalue_no_screening) == 10 assert len(d0crt_no_screening.importances_) == 10 @@ -79,7 +78,7 @@ def test_dcrt_lasso_with_estimed_coefficient(generate_regression_dataset): ) d0crt.fit(X, y) pvalue = d0crt.importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt.importances_) == 10 @@ -97,7 +96,7 @@ def test_dcrt_lasso_with_refit(generate_regression_dataset): screening_threshold=None, ) pvalue = d0crt_refit.fit_importance(X, y) - sv = d0crt_refit.selection(threshold_pvalue=0.05) + sv = d0crt_refit.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt_refit.importances_) == 10 @@ -115,7 +114,7 @@ def test_dcrt_lasso_with_no_cv(generate_regression_dataset): screening_threshold=None, ) pvalue = d0crt_use_cv.fit_importance(X, y) - sv = d0crt_use_cv.selection(threshold_pvalue=0.05) + sv = d0crt_use_cv.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt_use_cv.importances_) == 10 @@ -135,7 +134,7 @@ def test_dcrt_lasso_with_covariance(generate_regression_dataset): screening_threshold=None, ) pvalue = d0crt_covariance.fit_importance(X, y) - sv = d0crt_covariance.selection(threshold_pvalue=0.05) + sv = d0crt_covariance.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt_covariance.importances_) == 10 @@ -153,7 +152,7 @@ def test_dcrt_lasso_center(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt.importances_) == 10 @@ -171,7 +170,7 @@ def test_dcrt_lasso_refit(): scaled_statistics=True, ) pvalue = 
d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt.importances_) == 10 @@ -203,7 +202,7 @@ def test_dcrt_distillation_x_different(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.where(d0crt.importances_ != 0)[0].shape[0] <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -221,7 +220,7 @@ def test_dcrt_distillation_y_different(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.where(d0crt.importances_ != 0)[0].shape[0] <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -241,7 +240,7 @@ def test_dcrt_lasso_fit_with_no_cv(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.sum(d0crt.importances_ != 0) <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -261,7 +260,7 @@ def test_dcrt_RF_regression(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.where(d0crt.importances_ != 0)[0].shape[0] <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -280,7 +279,7 @@ def test_dcrt_RF_classification(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.where(d0crt.importances_ != 0)[0].shape[0] <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -464,7 +463,7 @@ def test_d0crt_linear(): screening_threshold=90, ) importances = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.mean(importances[important_ids]) > np.mean(importances[~important_ids]) assert np.array_equal(np.where(sv)[0], important_ids) @@ -490,7 +489,7 @@ def test_d0crt_rf(): random_state=0, ) importances = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.mean(importances[important_ids]) > np.mean(importances[~important_ids]) assert np.array_equal(np.where(sv)[0], important_ids)
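End to end, the updated functional wrapper can be driven as below — a sketch only: the leading positional parameters of `d0crt` (estimator, X, y) are assumed from the call sites above, and the data setup is illustrative:

    from sklearn.datasets import make_regression
    from sklearn.linear_model import LassoCV

    from hidimstat import d0crt  # assumed import path

    X, y = make_regression(n_samples=100, n_features=10, random_state=0)
    selection, importances, pvalues = d0crt(
        LassoCV(n_jobs=1),
        X,
        y,
        threshold_max=0.05,            # replaces the old threshold_pvalue
        alternative_hypothesis=False,  # one-sided selection on small p-values
    )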