From 9c2d77cdc2aa6994f3ca27224cae087039c7154a Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Thu, 28 Aug 2025 10:48:16 +0200
Subject: [PATCH 01/93] add method for selection based on FDR

---
 src/hidimstat/base_variable_importance.py | 198 +++++++++++++++++++++-
 1 file changed, 189 insertions(+), 9 deletions(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index b4f539024..e4967a727 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -3,6 +3,9 @@
 from sklearn.base import BaseEstimator
 import numpy as np
 
+from hidimstat.statistical_tools.multiple_testing import fdr_threshold
+from hidimstat.statistical_tools.aggregation import quantile_aggregation
+
 
 class BaseVariableImportance(BaseEstimator):
     """
@@ -34,15 +37,22 @@ def __init__(self):
         self.importances_ = None
         self.pvalues_ = None
         self.selections_ = None
+        self.test_scores_ = None
+
+    def _check_importance(self):
+        """
+        Checks if the importance scores and p-values have been computed.
+        """
+        if self.importances_ is None:
+            raise ValueError(
+                "The importances need to be called before calling this method"
+            )
 
     def selection(
         self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
     ):
         """
         Selects features based on variable importance.
-        In case several arguments are different from None,
-        the returned selection is the conjunction of all of them.
-
         Parameters
         ----------
         k_best : int, optional, default=None
@@ -53,7 +63,6 @@ def selection(
             Selects features with importance scores above the specified threshold.
         threshold_pvalue : float, optional, default=None
            Selects features with p-values below the specified threshold.
-
        Returns
        -------
        selection : array-like of shape (n_features,)
@@ -123,11 +132,182 @@ def selection(
 
         return self.selections_
 
-    def _check_importance(self):
+    def selection_fdr(
+        self,
+        fdr,
+        fdr_control="bhq",
+        evalues=False,
+        reshaping_function=None,
+        adaptive_aggregation=False,
+        gamma=0.5,
+    ):
         """
-        Checks if the importance scores and p-values have been computed.
+        Performs feature selection based on False Discovery Rate (FDR) control.
+
+        This method selects features by controlling the FDR using either p-values
+        or e-values derived from test scores. It supports different FDR control
+        methods and optional adaptive aggregation of the statistical values.
+
+        Parameters
+        ----------
+        fdr : float
+            The target false discovery rate level (between 0 and 1).
+        fdr_control : str, default="bhq"
+            The FDR control method to use. Options are:
+            - "bhq": Benjamini-Hochberg procedure
+            - "bhy": Benjamini-Hochberg-Yekutieli procedure
+            - "ebh": e-BH procedure (only for e-values)
+        evalues : bool, default=False
+            If True, uses e-values for selection. If False, uses p-values.
+        reshaping_function : callable, default=None
+            Reshaping function for the BHY method; by default the sum of
+            reciprocals is used.
+        adaptive_aggregation : bool, default=False
+            If True, uses adaptive weights for p-value aggregation.
+            Only applicable when evalues=False.
+        gamma : float, default=0.5
+            The gamma parameter for quantile aggregation of p-values.
+            Only used when evalues=False.
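+            See ``quantile_aggregation`` for the exact aggregation rule.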
+ + Returns + ------- + numpy.ndarray + Boolean array indicating selected features (True for selected, False for not selected) + + Raises + ------ + AssertionError + If test_scores_ is None or if incompatible combinations of parameters are provided """ - if self.importances_ is None: - raise ValueError( - "The importances need to be called before calling this method" + self._check_importance() + assert ( + self.test_scores_ is not None + ), "this method doesn't support selection base on FDR" + + if not evalues: + assert fdr_control != "ebh", "for p-value, the fdr control can't be 'ebh'" + pvalues = np.array( + [ + _empirical_pval(self.test_scores_[i]) + for i in range(len(self.test_scores_)) + ] ) + aggregated_pval = quantile_aggregation( + pvalues, gamma=gamma, adaptive=adaptive_aggregation + ) + threshold = fdr_threshold( + aggregated_pval, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) + selected = aggregated_pval <= threshold + else: + assert fdr_control == "ebh", "for e-value, the fdr control need to be 'ebh'" + ko_threshold = [] + for test_score in self.test_scores_: + ko_threshold.append(_estimated_threshold(test_score, fdr=fdr)) + evalues = np.array( + [ + _empirical_eval(self.test_scores_[i], ko_threshold[i]) + for i in range(len(self.test_scores_)) + ] + ) + aggregated_eval = np.mean(evalues, axis=0) + threshold = fdr_threshold( + aggregated_eval, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) + selected = aggregated_eval >= threshold + return selected + + +def _estimated_threshold(test_score, fdr=0.1): + """ + Calculate the threshold based on the procedure stated in the knockoff article. + Original code: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + Vector of test statistic. + fdr : float + Desired controlled FDR (false discovery rate) level. + Returns + ------- + threshold : float or np.inf + Threshold level. + """ + offset = 1 # Offset equals 1 is the knockoff+ procedure. + + threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) + np.concatenate( + [[0], threshold_mesh, [np.inf]] + ) # if there is no solution, the threshold is inf + # find the right value of t for getting a good fdr + # Equation 1.8 of barber2015controlling and 3.10 in Candès 2018 + threshold = 0.0 + for threshold in threshold_mesh: + false_pos = np.sum(test_score <= -threshold) + selected = np.sum(test_score >= threshold) + if (offset + false_pos) / np.maximum(selected, 1) <= fdr: + break + return threshold + + +def _empirical_pval(test_score): + """ + Compute the empirical p-values from the test based on knockoff+. + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + Vector of test statistics. + Returns + ------- + pvals : 1D ndarray, shape (n_features, ) + Vector of empirical p-values. + """ + pvals = [] + n_features = test_score.size + + offset = 1 # Offset equals 1 is the knockoff+ procedure. + + test_score_inv = -test_score + for i in range(n_features): + if test_score[i] <= 0: + pvals.append(1) + else: + pvals.append( + (offset + np.sum(test_score_inv >= test_score[i])) / n_features + ) + + return np.array(pvals) + + +def _empirical_eval(test_score, ko_threshold): + """ + Compute the empirical e-values from the test based on knockoff. + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + Vector of test statistics. + ko_threshold : float + Threshold level. 
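+        Typically obtained from ``_estimated_threshold`` on the same
+        vector of test statistics.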
+ Returns + ------- + evals : 1D ndarray, shape (n_features, ) + Vector of empirical e-values. + """ + evals = [] + n_features = test_score.size + + offset = 1 # Offset equals 1 is the knockoff+ procedure. + + for i in range(n_features): + if test_score[i] < ko_threshold: + evals.append(0) + else: + evals.append(n_features / (offset + np.sum(test_score <= -ko_threshold))) + + return np.array(evals) From 5314c37d15e3e3a607c90bf7c86748b68ea508cf Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 19:40:17 +0200 Subject: [PATCH 02/93] fix default of the qunatile aggragation --- src/hidimstat/statistical_tools/aggregation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/statistical_tools/aggregation.py b/src/hidimstat/statistical_tools/aggregation.py index a9a85a4e3..21aa44f3c 100644 --- a/src/hidimstat/statistical_tools/aggregation.py +++ b/src/hidimstat/statistical_tools/aggregation.py @@ -1,7 +1,7 @@ import numpy as np -def quantile_aggregation(pvals, gamma=0.05, adaptive=False): +def quantile_aggregation(pvals, gamma=0.5, adaptive=False): """ Implements the quantile aggregation method for p-values. @@ -15,7 +15,7 @@ def quantile_aggregation(pvals, gamma=0.05, adaptive=False): pvals : ndarray of shape (n_sampling*2, n_test) Matrix of p-values to aggregate. Each row represents a sampling instance and each column a hypothesis test. - gamma : float, default=0.05 + gamma : float, default=0.5 Quantile level for aggregation. Must be in range (0,1]. adaptive : bool, default=False If True, uses adaptive quantile aggregation which optimizes over multiple gamma values. From be837e097d208034167475ad9beac7ae1463c3d7 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 19:41:07 +0200 Subject: [PATCH 03/93] fix selection --- src/hidimstat/base_variable_importance.py | 20 +- test/test_base_variable_importance.py | 226 ++++++++++++++++++++++ 2 files changed, 239 insertions(+), 7 deletions(-) create mode 100644 test/test_base_variable_importance.py diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index e4967a727..b6e02eaa0 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -70,17 +70,23 @@ def selection( """ self._check_importance() if k_best is not None: - if not isinstance(k_best, str) and k_best > self.importances_.shape[1]: + if not isinstance(k_best, str) and k_best > self.importances_.shape[0]: warnings.warn( - f"k={k_best} is greater than n_features={self.importances_.shape[1]}. " + f"k={k_best} is greater than n_features={self.importances_.shape[0]}. " "All the features will be returned." 
) - assert k_best > 0, "k_best needs to be positive and not null" + if isinstance(k_best, str): + assert k_best == "all" + else: + assert k_best >= 0, "k_best needs to be positive or null" if percentile is not None: assert ( - 0 < percentile and percentile < 100 + 0 <= percentile and percentile <= 100 ), "percentile needs to be between 0 and 100" if threshold_pvalue is not None: + assert ( + self.pvalues_ is not None + ), "This method doesn't support a threshold on p-values" assert ( 0 < threshold_pvalue and threshold_pvalue < 1 ), "threshold_pvalue needs to be between 0 and 1" @@ -105,9 +111,9 @@ def selection( elif percentile == 0: mask_percentile = np.zeros(len(self.importances_), dtype=bool) elif percentile is not None: - threshold = np.percentile(self.importances_, 100 - percentile) - mask_percentile = self.importances_ > threshold - ties = np.where(self.importances_ == threshold)[0] + threshold_percentile = np.percentile(self.importances_, 100 - percentile) + mask_percentile = self.importances_ > threshold_percentile + ties = np.where(self.importances_ == threshold_percentile)[0] if len(ties): max_feats = int(len(self.importances_) * percentile / 100) kept_ties = ties[: max_feats - mask_percentile.sum()] diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py new file mode 100644 index 000000000..e3395c4ea --- /dev/null +++ b/test/test_base_variable_importance.py @@ -0,0 +1,226 @@ +import pytest +import numpy as np + +from hidimstat import BaseVariableImportance + + +@pytest.fixture +def set_BaseVariableImportance(pvalues, test_score, seed): + nb_features = 100 + rng = np.random.RandomState(seed) + vi = BaseVariableImportance() + vi.importances_ = np.arange(nb_features) + rng.shuffle(vi.importances_) + if pvalues or test_score: + vi.pvalues_ = np.sort(rng.rand(nb_features))[vi.importances_] + if test_score: + vi.test_scores_ = [] + for i in range(10): + score = np.random.rand(nb_features) * 30 + vi.test_scores_.append(score) + for i in range(1, 30): + score = np.random.rand(nb_features) + 1 + score[-i:] = np.arange(30 - i, 30) * 2 + score[:i] = -np.arange(30 - i, 30) + vi.test_scores_.append(score[vi.importances_]) + return vi + + +@pytest.mark.parametrize( + "pvalues, test_score, seed", + [(False, False, 0), (True, False, 1), (True, True, 2)], + ids=["only importance", "p-value", "test-score"], +) +class TestSelection: + """Test selection base on importance""" + + def test_selection_k_best(self, set_BaseVariableImportance): + "test selection of the k_best" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 95 + selection = vi.selection(k_best=5) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_k_best_all(self, set_BaseVariableImportance): + "test selection to all base on string" + vi = set_BaseVariableImportance + true_value = np.ones_like(vi.importances_, dtype=bool) + selection = vi.selection(k_best="all") + np.testing.assert_array_equal(true_value, selection) + + def test_selection_k_best_none(self, set_BaseVariableImportance): + "test selection when there none" + vi = set_BaseVariableImportance + true_value = np.zeros_like(vi.importances_, dtype=bool) + selection = vi.selection(k_best=0) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile(self, set_BaseVariableImportance): + "test selection bae on percentile" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 50 + selection = vi.selection(percentile=50) + np.testing.assert_array_equal(true_value, 
selection) + + def test_selection_percentile_all(self, set_BaseVariableImportance): + "test selection when percentile is 100" + vi = set_BaseVariableImportance + true_value = np.ones_like(vi.importances_, dtype=bool) + selection = vi.selection(percentile=100) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile_none(self, set_BaseVariableImportance): + "test selection when percentile is 0" + vi = set_BaseVariableImportance + true_value = np.zeros_like(vi.importances_, dtype=bool) + selection = vi.selection(percentile=0) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile_threshols_value(self, set_BaseVariableImportance): + "test selection when percentile when the percentile equal on value" + vi = set_BaseVariableImportance + mask = np.ones_like(vi.importances_, dtype=bool) + mask[np.where(vi.importances_ == 99)] = False + vi.importances_ = vi.importances_[mask] + true_value = vi.importances_ >= 50 + selection = vi.selection(percentile=50) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_threshold(self, set_BaseVariableImportance): + "test threshold on importance" + vi = set_BaseVariableImportance + true_value = vi.importances_ < 5 + selection = vi.selection(threshold=5) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_threshold_pvalue(self, set_BaseVariableImportance): + "test threshold vbse on pvalues" + vi = set_BaseVariableImportance + if vi.pvalues_ is not None: + true_value = vi.importances_ < 5 + print(vi.pvalues_) + selection = vi.selection( + threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]] + ) + np.testing.assert_array_equal(true_value, selection) + + +@pytest.mark.parametrize( + "pvalues, test_score, seed", [(True, True, 10)], ids=["default"] +) +class TestSelectionFDR: + """Test selection base on fdr""" + + def test_selection_fdr_default(self, set_BaseVariableImportance): + "test selection of the default" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 85 + selection = vi.selection_fdr(0.2) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_fdr_adaptation(self, set_BaseVariableImportance): + "test selection of the adaptation" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 85 + selection = vi.selection_fdr(0.2, adaptive_aggregation=True) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_fdr_bhy(self, set_BaseVariableImportance): + "test selection of the adaptation" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 85 + selection = vi.selection_fdr(0.8, fdr_control="bhy") + np.testing.assert_array_equal(true_value, selection) + + def test_selection_fdr_ebh(self, set_BaseVariableImportance): + "test selection of the adaptation" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 2 + selection = vi.selection_fdr(0.037, fdr_control="ebh", evalues=True) + np.testing.assert_array_equal(true_value, selection) + + +@pytest.mark.parametrize( + "pvalues, test_score, seed", + [(False, False, 0), (True, False, 0), (True, True, 0)], + ids=["only importance", "p-value", "test-score"], +) +class TestBVIExceptions: + """Test class for BVI Exception""" + + def test_not_fit(self, pvalues, test_score, seed): + "test detection unfit" + vi = BaseVariableImportance() + with pytest.raises( + ValueError, + match="The importances need to be called before calling this method", + ): + vi._check_importance() + with pytest.raises( + 
ValueError, + match="The importances need to be called before calling this method", + ): + vi.selection() + with pytest.raises( + ValueError, + match="The importances need to be called before calling this method", + ): + vi.selection_fdr(0.1) + + def test_selection_k_best(self, set_BaseVariableImportance): + "test selection k_best wrong" + vi = set_BaseVariableImportance + with pytest.raises(AssertionError, match="k_best needs to be positive or null"): + vi.selection(k_best=-10) + with pytest.warns(Warning, match="k=1000 is greater than n_features="): + vi.selection(k_best=1000) + + def test_selection_percentile(self, set_BaseVariableImportance): + "test selection percentile wrong" + vi = set_BaseVariableImportance + with pytest.raises( + AssertionError, match="percentile needs to be between 0 and 100" + ): + vi.selection(percentile=-1) + with pytest.raises( + AssertionError, match="percentile needs to be between 0 and 100" + ): + vi.selection(percentile=102) + + def test_selection_threshold(self, set_BaseVariableImportance): + "test selection threshold wrong" + vi = set_BaseVariableImportance + if vi.pvalues_ is None: + with pytest.raises( + AssertionError, + match="This method doesn't support a threshold on p-values", + ): + vi.selection(threshold_pvalue=-1) + else: + with pytest.raises( + AssertionError, match="threshold_pvalue needs to be between 0 and 1" + ): + vi.selection(threshold_pvalue=-1) + with pytest.raises( + AssertionError, match="threshold_pvalue needs to be between 0 and 1" + ): + vi.selection(threshold_pvalue=1.1) + + def test_selection_fdr_fdr_control(self, set_BaseVariableImportance): + "test selection fdr_control wrong" + vi = set_BaseVariableImportance + if vi.test_scores_ is None: + with pytest.raises( + AssertionError, + match="this method doesn't support selection base on FDR", + ): + vi.selection_fdr(fdr=0.1) + else: + with pytest.raises( + AssertionError, match="for e-value, the fdr control need to be 'ebh'" + ): + vi.selection_fdr(fdr=0.1, evalues=True) + with pytest.raises( + AssertionError, match="for p-value, the fdr control can't be 'ebh'" + ): + vi.selection_fdr(fdr=0.1, fdr_control="ebh", evalues=False) From 1a4259200f2e26c5b4bb9b0028470e8f8a42e417 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 19:46:36 +0200 Subject: [PATCH 04/93] update docstring --- test/test_base_variable_importance.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index e3395c4ea..0d4ff4a81 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -6,6 +6,22 @@ @pytest.fixture def set_BaseVariableImportance(pvalues, test_score, seed): + """Create a BaseVariableImportance instance with test data for testing purposes. + + Parameters + ---------- + pvalues : bool + If True, generate random p-values for testing. + test_score : bool + If True, generate random test scores for testing. + seed : int + Random seed for reproducibility. + + Returns + ------- + BaseVariableImportance + A BaseVariableImportance instance with test data. + """ nb_features = 100 rng = np.random.RandomState(seed) vi = BaseVariableImportance() @@ -14,6 +30,7 @@ def set_BaseVariableImportance(pvalues, test_score, seed): if pvalues or test_score: vi.pvalues_ = np.sort(rng.rand(nb_features))[vi.importances_] if test_score: + # TODO: this can be improved. 
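+        # Inferred intent of the synthetic scores: 10 rows of pure-noise
+        # statistics, then 29 rows in which the largest importances get
+        # strongly positive, knockoff-like statistics and the smallest
+        # ones get negative statistics.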
vi.test_scores_ = [] for i in range(10): score = np.random.rand(nb_features) * 30 @@ -32,7 +49,7 @@ def set_BaseVariableImportance(pvalues, test_score, seed): ids=["only importance", "p-value", "test-score"], ) class TestSelection: - """Test selection base on importance""" + """Test selection based on importance""" def test_selection_k_best(self, set_BaseVariableImportance): "test selection of the k_best" @@ -109,7 +126,7 @@ def test_selection_threshold_pvalue(self, set_BaseVariableImportance): "pvalues, test_score, seed", [(True, True, 10)], ids=["default"] ) class TestSelectionFDR: - """Test selection base on fdr""" + """Test selection based on fdr""" def test_selection_fdr_default(self, set_BaseVariableImportance): "test selection of the default" @@ -126,14 +143,14 @@ def test_selection_fdr_adaptation(self, set_BaseVariableImportance): np.testing.assert_array_equal(true_value, selection) def test_selection_fdr_bhy(self, set_BaseVariableImportance): - "test selection of the adaptation" + "test selection with bhy" vi = set_BaseVariableImportance true_value = vi.importances_ >= 85 selection = vi.selection_fdr(0.8, fdr_control="bhy") np.testing.assert_array_equal(true_value, selection) def test_selection_fdr_ebh(self, set_BaseVariableImportance): - "test selection of the adaptation" + "test selection with e-values" vi = set_BaseVariableImportance true_value = vi.importances_ >= 2 selection = vi.selection_fdr(0.037, fdr_control="ebh", evalues=True) From 5854f2e7fdc3dffcb4a305504070a85529f1d082 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 19:58:21 +0200 Subject: [PATCH 05/93] fix docstring --- src/hidimstat/base_variable_importance.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index b6e02eaa0..9e0f4dbb5 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -53,16 +53,18 @@ def selection( ): """ Selects features based on variable importance. + Parameters ---------- - k_best : int, optional, default=None + k_best : int, default=None Selects the top k features based on importance scores. - percentile : float, optional, default=None + percentile : float, default=None Selects features based on a specified percentile of importance scores. - threshold : float, optional, default=None + threshold : float, default=None Selects features with importance scores above the specified threshold. - threshold_pvalue : float, optional, default=None + threshold_pvalue : float, default=None Selects features with p-values below the specified threshold. 
+ Returns ------- selection : array-like of shape (n_features,) @@ -182,7 +184,7 @@ def selection_fdr( Raises ------ AssertionError - If test_scores_ is None or if incompatible combinations of parameters are provided + If test_scores\_ is None or if incompatible combinations of parameters are provided """ self._check_importance() assert ( From 3c08f757398e799e964b4d82f2b3a83e1a71a0f6 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 18:46:54 +0200 Subject: [PATCH 06/93] Add test for 1 test_score --- src/hidimstat/base_variable_importance.py | 40 +++++++++++------------ test/test_base_variable_importance.py | 9 +++++ 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 9e0f4dbb5..911435a8e 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -38,6 +38,9 @@ def __init__(self): self.pvalues_ = None self.selections_ = None self.test_scores_ = None + self.threshold_fdr_ = None + self.aggregated_pval_ = None + self.aggregated_eval_ = None def _check_importance(self): """ @@ -191,43 +194,38 @@ def selection_fdr( self.test_scores_ is not None ), "this method doesn't support selection base on FDR" - if not evalues: + if self.test_scores_.shape[0] == 1: + self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr) + selected = self.test_scores_[0] >= self.threshold_fdr_ + elif not evalues: assert fdr_control != "ebh", "for p-value, the fdr control can't be 'ebh'" pvalues = np.array( - [ - _empirical_pval(self.test_scores_[i]) - for i in range(len(self.test_scores_)) - ] + [_empirical_pval(test_score) for test_score in self.test_scores_] ) - aggregated_pval = quantile_aggregation( + self.aggregated_pval_ = quantile_aggregation( pvalues, gamma=gamma, adaptive=adaptive_aggregation ) - threshold = fdr_threshold( - aggregated_pval, + self.threshold_fdr_ = fdr_threshold( + self.aggregated_pval_, fdr=fdr, method=fdr_control, reshaping_function=reshaping_function, ) - selected = aggregated_pval <= threshold + selected = self.aggregated_pval_ <= self.threshold_fdr_ else: assert fdr_control == "ebh", "for e-value, the fdr control need to be 'ebh'" - ko_threshold = [] + evalues = [] for test_score in self.test_scores_: - ko_threshold.append(_estimated_threshold(test_score, fdr=fdr)) - evalues = np.array( - [ - _empirical_eval(self.test_scores_[i], ko_threshold[i]) - for i in range(len(self.test_scores_)) - ] - ) - aggregated_eval = np.mean(evalues, axis=0) - threshold = fdr_threshold( - aggregated_eval, + ko_threshold = _estimated_threshold(test_score, fdr=fdr) + evalues.append(_empirical_eval(test_score, ko_threshold)) + self.aggregated_eval_ = np.mean(evalues, axis=0) + self.threshold_fdr_ = fdr_threshold( + self.aggregated_eval_, fdr=fdr, method=fdr_control, reshaping_function=reshaping_function, ) - selected = aggregated_eval >= threshold + selected = self.aggregated_eval_ >= self.threshold_fdr_ return selected diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 0d4ff4a81..5f154f91a 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -40,6 +40,7 @@ def set_BaseVariableImportance(pvalues, test_score, seed): score[-i:] = np.arange(30 - i, 30) * 2 score[:i] = -np.arange(30 - i, 30) vi.test_scores_.append(score[vi.importances_]) + vi.test_scores_ = np.array(vi.test_scores_) return vi @@ -135,6 +136,14 @@ def test_selection_fdr_default(self, 
set_BaseVariableImportance):
         selection = vi.selection_fdr(0.2)
         np.testing.assert_array_equal(true_value, selection)
 
+    def test_selection_fdr_default_1(self, set_BaseVariableImportance):
+        "test selection with a single row of test scores"
+        vi = set_BaseVariableImportance
+        vi.test_scores_ = np.array([vi.test_scores_[0, :]])
+        true_value = vi.importances_ > -1  # all selected
+        selection = vi.selection_fdr(0.2)
+        np.testing.assert_array_equal(true_value, selection)
+
     def test_selection_fdr_adaptation(self, set_BaseVariableImportance):
         "test selection of the adaptation"
         vi = set_BaseVariableImportance

From 7f3a117c00100a08cd86a375125ee502208b8295 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Mon, 1 Sep 2025 12:02:38 +0200
Subject: [PATCH 07/93] change the usage of the FDR test without aggregation

---
 src/hidimstat/base_variable_importance.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index 911435a8e..cd8d64b2a 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -195,7 +195,12 @@ def selection_fdr(
         ), "this method doesn't support selection base on FDR"
 
         if self.test_scores_.shape[0] == 1:
-            self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr)
+            self.threshold_fdr_ = fdr_threshold(
+                self.pvalues_,
+                fdr=fdr,
+                method=fdr_control,
+                reshaping_function=reshaping_function,
+            )
             selected = self.test_scores_[0] >= self.threshold_fdr_

From 21250b4725c94dfd6bb4027c575f6c3d1654f4e8 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Mon, 1 Sep 2025 18:51:11 +0200
Subject: [PATCH 08/93] remove a print in test

---
 test/test_base_variable_importance.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py
index 5f154f91a..3f18b78dc 100644
--- a/test/test_base_variable_importance.py
+++ b/test/test_base_variable_importance.py
@@ -116,7 +116,6 @@ def test_selection_threshold_pvalue(self, set_BaseVariableImportance):
         vi = set_BaseVariableImportance
         if vi.pvalues_ is not None:
             true_value = vi.importances_ < 5
-            print(vi.pvalues_)
             selection = vi.selection(
                 threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]]
             )
             np.testing.assert_array_equal(true_value, selection)

From 17d9d9506bd71d604633621fad084794bb822b28 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Tue, 2 Sep 2025 11:32:26 +0200
Subject: [PATCH 09/93] Update selection

---
 src/hidimstat/base_variable_importance.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index cd8d64b2a..e2d699ead 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -194,13 +194,20 @@ def selection_fdr(
             self.test_scores_ is not None
         ), "this method doesn't support selection base on FDR"
 
-        if self.test_scores_.shape[0] == 1:
-            self.threshold_fdr_ = fdr_threshold(
-                self.pvalues_,
-                fdr=fdr,
-                method=fdr_control,
-                reshaping_function=reshaping_function,
-            )
+        if self.test_scores_ is None:
+            if self.pvalues_ is None:
+                raise ValueError(
+                    "Selection with FDR control requires a method that computes p-values or test scores."
+                )
+            else:
+                self.threshold_fdr_ = fdr_threshold(
+                    self.pvalues_,
+                    fdr=fdr,
+                    method=fdr_control,
+                    reshaping_function=reshaping_function,
+                )
+        elif self.test_scores_.shape[0] == 1:
+            self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr)
             selected = self.test_scores_[0] >= self.threshold_fdr_

From 4b7abd6a271460bbbc7d66bfe157718cb2b9a328 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Thu, 4 Sep 2025 21:09:54 +0200
Subject: [PATCH 10/93] update Desparsified Lasso

---
 docs/src/api.rst                              |   4 +-
 src/hidimstat/__init__.py                     |  10 +-
 src/hidimstat/desparsified_lasso.py           | 867 ++++++++++--------
 src/hidimstat/ensemble_clustered_inference.py |  95 +-
 src/hidimstat/noise_std.py                    | 110 +--
 test/test_desparsified_lasso.py               | 100 +-
 test/test_ensemble_clustered_inference.py     |  63 +-
 test/test_noise_std.py                        |  33 +-
 8 files changed, 664 insertions(+), 618 deletions(-)

diff --git a/docs/src/api.rst b/docs/src/api.rst
index 51074f1dc..0e14b6106 100644
--- a/docs/src/api.rst
+++ b/docs/src/api.rst
@@ -19,9 +19,6 @@ Functions
    quantile_aggregation
    clustered_inference
    clustered_inference_pvalue
-   desparsified_lasso
-   desparsified_lasso_pvalue
-   desparsified_group_lasso_pvalue
    ensemble_clustered_inference
    ensemble_clustered_inference_pvalue
    model_x_knockoff
@@ -40,3 +37,4 @@ Classes
    CFI
    PFI
    D0CRT
+   DesparsifiedLasso

diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py
index 81d5a0cce..63d3e438c 100644
--- a/src/hidimstat/__init__.py
+++ b/src/hidimstat/__init__.py
@@ -8,11 +8,7 @@
     ensemble_clustered_inference,
     ensemble_clustered_inference_pvalue,
 )
-from .desparsified_lasso import (
-    desparsified_lasso,
-    desparsified_lasso_pvalue,
-    desparsified_group_lasso_pvalue,
-)
+from .desparsified_lasso import desparsified_lasso, DesparsifiedLasso
 from .distilled_conditional_randomization_test import d0crt, D0CRT
 from .conditional_feature_importance import CFI
 from .knockoffs import (
@@ -41,9 +37,8 @@
     "d0crt",
     "D0CRT",
     "desparsified_lasso",
-    "desparsified_lasso_pvalue",
-    "desparsified_group_lasso_pvalue",
-    "reid",
+    "DesparsifiedLasso",
+    "reid",
     "model_x_knockoff",
     "model_x_knockoff_pvalue",
     "model_x_knockoff_bootstrap_quantile",

diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py
index 427e89229..3fcd89ba3 100644
--- a/src/hidimstat/desparsified_lasso.py
+++ b/src/hidimstat/desparsified_lasso.py
@@ -1,39 +1,30 @@
+import warnings
+
 import numpy as np
 from joblib import Parallel, delayed
 from numpy.linalg import multi_dot
 from scipy import stats
 from scipy.linalg import inv
+from sklearn.base import clone
 from sklearn.linear_model import Lasso
-from sklearn.utils.validation import check_memory
+from sklearn.utils.validation import check_memory, check_is_fitted
+from sklearn.linear_model import LassoCV, MultiTaskLassoCV
+from sklearn.model_selection import KFold
+from sklearn.exceptions import NotFittedError
+from sklearn.utils import check_random_state
+from sklearn.preprocessing import StandardScaler
 
+from hidimstat.base_variable_importance import BaseVariableImportance
 from hidimstat.noise_std import reid
 from hidimstat.statistical_tools.p_values import (
     pval_from_two_sided_pval_and_sign,
     pval_from_cb,
 )
 from hidimstat._utils.regression import _alpha_max
+from hidimstat._utils.docstring import _aggregate_docstring
 
 
-def desparsified_lasso(
-    X,
-    y,
-    dof_ajdustement=False,
-    max_iteration=5000,
-    tolerance=1e-3,
-    alpha_max_fraction=0.01,
-    epsilon=1e-2,
-    tolerance_reid=1e-4,
-    n_splits=5,
-    n_jobs=1,
-    seed=0,
- memory=None, - verbose=0, - multioutput=False, - covariance=None, - noise_method="AR", - order=1, - stationary=True, -): +class DesparsifiedLasso(BaseVariableImportance): """ Desparsified Lasso @@ -42,61 +33,34 @@ def desparsified_lasso( Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Input data matrix. - - y : ndarray, shape (n_samples,) or (n_samples, n_times) - Target vector for single response or matrix for multiple - responses. - - dof_ajdustement : bool, optional (default=False) - If True, applies degrees of freedom adjustment from :footcite:t:`bellec2022biasing`. - If False, computes original Desparsified Lasso estimator. - - max_iteration : int, optional (default=5000) - Maximum iterations for Nodewise Lasso regressions. + lasso_cv : LassoCV or MultiTaskLassoCV instance, default=LassoCV() + CV object used for initial Lasso fit. - tolerance : float, optional (default=1e-3) - Convergence tolerance for optimization. + lasso : Lasso instance, default=Lasso() + Base Lasso estimator used for nodewise regressions. - alpha_max_fraction : float, optional (default=0.01) - Fraction of max alpha used for Lasso regularization. + centered : bool, default=True + Whether to center X and y. - epsilon : float, optional (default=1e-2) - Small constant used in noise estimation. - - tolerance_reid : float, optional (default=1e-4) - Tolerance for variance estimation with the Reid method. - - n_splits : int, optional (default=5) - Number of splits for cross-validation in Reid procedure. - - n_jobs : int, optional (default=1) - Number of parallel jobs. Use -1 for all CPUs. - - seed : int, default=0 - Random seed for reproducibility. + dof_ajdustement : bool, default=False + If True, applies degrees of freedom adjustment from :footcite:t:`bellec2022biasing`. - memory : str or joblib.Memory object, optional (default=None) - Used to cache the output of the computation of the Nodewise Lasso. - By default, no caching is done. If a string is given, it is the path - to the caching directory. + alpha_max_fraction : float, default=0.01 + Fraction of max alpha used for nodewise Lasso regularization. - verbose : int, default=0 - Verbosity level for logging. + tolerance_reid : float, default=1e-4 + Tolerance for Reid variance estimation method. - multioutput : bool, default=False - If True, use group Lasso for multiple responses. + random_state : int, RandomState instance or None, default=None + Controls randomization in CV splitter and Lasso fits. - covariance : ndarray, shape (n_times, n_times), default=None - Temporal covariance matrix of the noise. - If None, it is estimated. + covariance : ndarray of shape (n_times, n_times) or None, default=None + Temporal noise covariance matrix. If None, estimated from data. noise_method : {'AR', 'median'}, default='AR' Method to estimate noise covariance: - - 'median': Uses median correlation between consecutive - timepoints - - 'AR': Fits autoregressive model of specified order + - 'AR': Autoregressive model + - 'median': Median correlation between consecutive timepoints order : int, default=1 Order of AR model when noise_method='AR'. Must be < n_times. @@ -104,350 +68,403 @@ def desparsified_lasso( stationary : bool, default=True Whether to assume stationary noise in estimation. - Returns - ------- - beta_hat : ndarray, shape (n_features,) or (n_features, n_times) - Desparsified Lasso coefficient estimates. 
- - sigma_hat/theta_hat : float or ndarray, shape (n_times, n_times) - Estimated noise level (single response) or precision matrix - (multiple responses). - - precision_diagonal : ndarray, shape (n_features,) - Diagonal elements of the precision matrix. - - Notes - ----- - The columns of `X` and `y` are always centered, this ensures that - the intercepts of the Nodewise Lasso problems are all equal to zero - and the intercept of the noise model is also equal to zero. Since - the values of the intercepts are not of interest, the centering avoids - the consideration of unecessary additional parameters. - Also, you may consider to center and scale `X` beforehand, notably if - the data contained in `X` has not been prescaled from measurements. - Other relevant references: :footcite:t:`van2014asymptotically`, - :footcite:t:`zhang2014confidence`. - - References - ---------- - .. footbibliography:: - """ - memory = check_memory(memory) - - X_ = np.asarray(X) - - n_samples, n_features = X_.shape - if multioutput: - n_times = y.shape[1] - if covariance is not None and covariance.shape != (n_times, n_times): - raise ValueError( - f'Shape of "cov" should be ({n_times}, {n_times}),' - + f' the shape of "cov" was ({covariance.shape}) instead' - ) - - # centering the data and the target variable - y_ = y - np.mean(y) - X_ = X_ - np.mean(X_, axis=0) - - # Lasso regression and noise standard deviation estimation - sigma_hat, beta_reid = memory.cache(reid, ignore=["n_jobs"])( - X_, - y_, - epsilon=epsilon, - tolerance=tolerance_reid, - max_iterance=max_iteration, - n_splits=n_splits, - n_jobs=n_jobs, - seed=seed, - # for group - multioutput=multioutput, - method=noise_method, - order=order, - stationary=stationary, - ) - - # define the alphas for the Nodewise Lasso - list_alpha_max = _alpha_max(X_, X_, fill_diagonal=True, axis=0) - alphas = alpha_max_fraction * list_alpha_max - - # Calculating precision matrix (Nodewise Lasso) - Z, precision_diagonal = memory.cache( - _compute_all_residuals, ignore=["n_jobs", "verbose"] - )( - X_, - alphas, - np.dot(X_.T, X_), # Gram matrix - max_iteration=max_iteration, - tolerance=tolerance, - n_jobs=n_jobs, - verbose=verbose, - ) - - # Computing the degrees of freedom adjustement - if dof_ajdustement: - coefficient_max = np.max(np.abs(beta_reid)) - support = np.sum(np.abs(beta_reid) > 0.01 * coefficient_max) - support = min(support, n_samples - 1) - dof_factor = n_samples / (n_samples - support) - else: - dof_factor = 1 - - # Computing Desparsified Lasso estimator and confidence intervals - # Estimating the coefficient vector - beta_bias = dof_factor * np.dot(y_.T, Z) / np.sum(X_ * Z, axis=0) - - # beta hat - P = (np.dot(X_.T, Z) / np.sum(X_ * Z, axis=0)).T - P_nodiagonal = P - np.diag(np.diag(P)) - Id = np.identity(n_features) - P_nodiagonal = dof_factor * P_nodiagonal + (dof_factor - 1) * Id - beta_hat = beta_bias.T - P_nodiagonal.dot(beta_reid.T) - # confidence intervals - precision_diagonal = precision_diagonal * dof_factor**2 - - if not multioutput: - return beta_hat, sigma_hat, precision_diagonal - else: - covariance_hat = sigma_hat - if covariance is not None: - covariance_hat = covariance - theta_hat = n_samples * inv(covariance_hat) - return beta_hat, theta_hat, precision_diagonal - - -def desparsified_lasso_pvalue( - n_samples, - beta_hat, - sigma_hat, - precision_diagonal, - confidence=0.95, - distribution="norm", - epsilon=1e-14, -): - """ - Calculate confidence intervals and p-values for desparsified lasso estimators. 
- - This function computes confidence intervals for the desparsified lasso - estimator beta_hat. - It can also return p-values derived from these confidence intervals. - - Parameters - ---------- - n_samples : float - The number of samples - beta_hat : ndarray, shape (n_features,) - The desparsified lasso coefficient estimates. - sigma_hat : float - Estimated noise level. - precision_diagonal : ndarray, shape (n_features,) - Diagonal elements of the precision matrix estimate. confidence : float, default=0.95 Confidence level for intervals, must be in [0, 1]. - distribution : str, default="norm" - Distribution to use for p-value calculation. - Currently only "norm" supported. - epsilon : float, default=1e-14 - Small value to avoid numerical issues in p-value calculation. - - Returns - ------- - pval : ndarray, shape (n_features,) - P-values - pval_corr : ndarray, shape (n_features,) - Corrected p-values - one_minus_pval : ndarray, shape (n_features,) - 1 - p-values - one_minus_pval_corr : ndarray, shape (n_features,) - 1 - corrected p-values - confidence_bound_min : ndarray, shape (n_features,) - Lower bounds of confidence intervals - confidence_bound_max : ndarray, shape (n_features,) - Upper bounds of confidence intervals - """ - # define the quantile for the confidence intervals - quantile = stats.norm.ppf(1 - (1 - confidence) / 2) - # see definition of lower and upper bound in algorithm 1 - # in `chevalier2020statisticalthesis`: - # quantile_(1-alpha/2) * (n**(-1/2)) * sigma * (precision_diagonal**(1/2)) - confint_radius = np.abs( - quantile * sigma_hat * np.sqrt(precision_diagonal) / np.sqrt(n_samples) - ) - confidence_bound_max = beta_hat + confint_radius - confidence_bound_min = beta_hat - confint_radius - - pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb( - confidence_bound_min, - confidence_bound_max, - confidence=confidence, - distribution=distribution, - eps=epsilon, - ) - return ( - pval, - pval_corr, - one_minus_pval, - one_minus_pval_corr, - confidence_bound_min, - confidence_bound_max, - ) - - -def desparsified_group_lasso_pvalue( - beta_hat, theta_hat, precision_diagonal, test="chi2" -): - """ - Compute p-values for the desparsified group Lasso estimator using - chi-squared or F tests - - Parameters - ---------- - beta_hat : ndarray, shape (n_features, n_times) - Estimated parameter matrix from desparsified group Lasso. - theta_hat : ndarray, shape (n_times, n_times) - Estimated precision matrix (inverse covariance). + distribution : str, default='norm' + Distribution for p-value calculation. Only 'norm' supported. - precision_diagonal : ndarray, shape (n_features,) - Diagonal elements of the precision matrix. + epsilon_pvalue : float, default=1e-14 + Small value to avoid numerical issues in p-values. test : {'chi2', 'F'}, default='chi2' - Statistical test for computing p-values: - - 'chi2': Chi-squared test (recommended for large samples) - - 'F': F-test + Test for p-values: + - 'chi2': Chi-squared test (large samples) + - 'F': F-test (small samples) - Returns - ------- - pval : ndarray, shape (n_features,) - Raw p-values, numerically accurate for positive effects - (p-values close to 0). - - pval_corr : ndarray, shape (n_features,) - P-values corrected for multiple testing using - Benjamini-Hochberg procedure. - - one_minus_pval : ndarray, shape (n_features,) - 1 - p-values, numerically accurate for negative effects - (p-values close to 1). - - one_minus_pval_corr : ndarray, shape (n_features,) - 1 - corrected p-values. 
- - Notes - ----- - The Chi-squared test assumes asymptotic normality while the F-test - is preferable for small sample sizes. - P-values are computed based on score statistics from the estimated - coefficients and precision matrix. - """ - n_features, n_times = beta_hat.shape - n_samples = precision_diagonal.shape[0] - - # Compute the two-sided p-values - if test == "chi2": - chi2_scores = ( - np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / precision_diagonal - ) - two_sided_pval = np.minimum(2 * stats.chi2.sf(chi2_scores, df=n_times), 1.0) - elif test == "F": - f_scores = ( - np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) - / precision_diagonal - / n_times - ) - two_sided_pval = np.minimum( - 2 * stats.f.sf(f_scores, dfd=n_samples, dfn=n_times), 1.0 - ) - else: - raise ValueError(f"Unknown test '{test}'") - - # Compute the p-values - sign_beta = np.sign(np.sum(beta_hat, axis=1)) - pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - pval_from_two_sided_pval_and_sign(two_sided_pval, sign_beta) - ) + n_jobs : int, default=1 + Number of parallel jobs. -1 means all CPUs. - return pval, pval_corr, one_minus_pval, one_minus_pval_corr + memory : str or Memory object, default=None + Used to cache nodewise Lasso computations. + verbose : int, default=0 + Verbosity level. -def _compute_all_residuals( - X, alphas, gram, max_iteration=5000, tolerance=1e-3, n_jobs=1, verbose=0 -): - """ - Nodewise Lasso for computing residuals and precision matrix diagonal. - - For each feature, fits a Lasso regression against all other features - to estimate the precision matrix and residuals needed for the - desparsified Lasso estimator. - - Parameters + Attributes ---------- - X : ndarray, shape (n_samples, n_features) - Input data matrix. - - alphas : ndarray, shape (n_features,) - Lasso regularization parameters, one per feature. - - gram : ndarray, shape (n_features, n_features) - Precomputed Gram matrix X.T @ X to speed up computations. + importances_ : ndarray of shape (n_features,) or (n_features, n_times) + Desparsified Lasso coefficient estimates. - max_itereration : int, optional (default=5000) - Maximum number of iterations for Lasso optimization. + pvalues_ : ndarray of shape (n_features,) + Two-sided p-values. - tolerance : float, optional (default=1e-3) - Convergence tolerance for Lasso optimization. + pvalues_corr_ : ndarray of shape (n_features,) + Multiple testing corrected p-values. - n_jobs : int or None, optional (default=1) - Number of parallel jobs. None means using all processors. + sigma_hat_ : float or ndarray of shape (n_times, n_times) + Estimated noise level or precision matrix. - verbose : int, optional (default=0) - Controls the verbosity when fitting the models: - 0 = silent - 1 = progress bar - >1 = more detailed output + confidence_bound_min_ : ndarray of shape (n_features,) + Lower confidence bounds. - Returns - ------- - Z : ndarray, shape (n_samples, n_features) - Matrix of residuals from nodewise regressions. - - precision_diagonal : ndarray, shape (n_features,) - Diagonal entries of the precision matrix estimate. + confidence_bound_max_ : ndarray of shape (n_features,) + Upper confidence bounds. Notes ----- - This implements the nodewise Lasso procedure from :footcite:t:`chevalier2020statisticalthesis` - for estimating entries of the precision matrix needed in the - desparsified Lasso. The procedure regresses each feature against - all others using Lasso to obtain residuals and precision matrix estimates. + X and y are always centered. 
Consider pre-scaling X if not already scaled. + Chi-squared test assumes asymptotic normality, F-test preferred for small samples. References ---------- .. footbibliography:: """ - n_samples, n_features = X.shape + def __init__( + self, + lasso_cv=LassoCV( + eps=1e-2, + fit_intercept=False, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-4, + max_iter=5000, + random_state=1, + n_jobs=1, + ), + lasso=Lasso(max_iter=5000, tol=1e-3), + centered=True, + dof_ajdustement=False, + alpha_max_fraction=0.01, + tolerance_reid=1e-4, + random_state=None, + covariance=None, + noise_method="AR", + order=1, + stationary=True, + confidence=0.95, + distribution="norm", + epsilon_pvalue=1e-14, + test="chi2", + n_jobs=1, + memory=None, + verbose=0, + ): + + assert issubclass( + Lasso, lasso.__class__ + ), "lasso needs to be a Lasso or a MultiTaskLassoCV" + self.lasso = lasso + if issubclass(LassoCV, lasso_cv.__class__): + self.n_times_ = 1 + elif issubclass(MultiTaskLassoCV, lasso_cv.__class__): + self.n_times_ = -1 + else: + raise ValueError("lasso_cv need to be a Lasso or a MultiTaskLassoCV") + self.lasso_cv = lasso_cv + self.centered = centered + self.dof_ajdustement = dof_ajdustement + self.alpha_max_fraction = alpha_max_fraction + self.tolerance_reid = tolerance_reid + self.covariance = covariance + self.noise_method = noise_method + self.order = order + self.stationary = stationary + self.confidence = confidence + self.distribution = distribution + self.epsilon_pvalue = epsilon_pvalue + assert test == "chi2" or test == "F", f"Unknown test '{test}'" + self.test = test + self.n_jobs = n_jobs + self.random_state = random_state + self.memory = memory + self.verbose = verbose + + self.sigma_hat_ = None + self.confidence_bound_min_ = None + self.confidence_bound_max_ = None + self.pvalues_corr_ = None + + def fit(self, X, y): + """ + Fit the Desparsified Lasso model. + + This method fits the Desparsified Lasso model, which provides debiased estimates + and statistical inference for high-dimensional linear models through a two-step + procedure involving initial Lasso estimation followed by bias correction. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data matrix. + y : array-like of shape (n_samples,) or (n_samples, n_times) + Target values. For single task, y should be 1D or (n_samples, 1). + For multi-task, y should be (n_samples, n_times). + + Returns + ------- + self : object + Returns the fitted instance. + + Notes + ----- + Main steps: + 1. Optional data centering + 2. Initial Lasso fit using cross-validation + 3. Computation of residuals + 4. Estimation of noise standard deviation + 5. 
Preparation for subsequent importance score calculation + """ + memory = check_memory(self.memory) + if self.n_times_ == -1: + self.n_times_ = y.shape[1] + + # centering the data and the target variable + if self.centered: + X_ = StandardScaler(with_std=False).fit_transform(X) + y_ = y - np.mean(y) + else: + X_ = X + y_ = y + _, n_features = X_.shape + + try: + check_is_fitted(self.lasso_cv) + except NotFittedError: + # check if max_iter is large enough + if self.lasso_cv.max_iter // self.lasso_cv.cv.n_splits <= n_features: + self.lasso_cv.set_params(max_iter=n_features * self.lasso_cv.n_splits) + Warning( + f"'max_iter' has been increased to {self.lasso_cv.max_iterance}" + ) + # use the cross-validation for define the best alpha of Lasso + self.lasso_cv.set_params(n_jobs=self.n_jobs) + self.lasso_cv.fit(X_, y_) + + # Estimate the support of the variable importance + residual = self.lasso_cv.predict(X_) - y_ + + # Lasso regression and noise standard deviation estimation + self.sigma_hat_ = memory.cache(reid, ignore=["n_jobs"])( + self.lasso_cv.coef_, + residual, + tolerance=self.tolerance_reid, + # for group + multioutput=self.n_times_ > 1, + method=self.noise_method, + order=self.order, + stationary=self.stationary, + ) - results = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(_compute_residuals)( - X=X, - id_column=i, - alpha=alphas[i], - gram=gram, - max_iteration=max_iteration, - tolerance=tolerance, + return self + + def _check_fit(self): + """ + Check if the model has been fit properly. + + This method verifies that the model has been fitted by checking + essential attributes (sigma_hat_ and lasso_cv). + + Raises + ------ + ValueError + If model hasn't been fit or required attributes are missing. + """ + if self.sigma_hat_ is None: + raise ValueError( + "The Desparsified Lasso requires to be fit before any analysis" + ) + try: + check_is_fitted(self.lasso_cv) + except NotFittedError: + raise ValueError( + "The Desparsified Lasso requires to be fit before any analysis" + ) + + def importance(self, X, y): + """ + Compute desparsified lasso estimates and confidence intervals. + + Calculates debiased coefficients, confidence intervals and p-values + using the desparsified lasso method. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data matrix. + y : array-like of shape (n_samples,) or (n_samples, n_times) + Target values. For single task, y should be 1D or (n_samples, 1). + For multi-task, y should be (n_samples, n_times). + + Returns + ------- + importances_ : ndarray of shape (n_features,) or (n_features, n_times) + Desparsified lasso coefficient estimates. + + Attributes + ---------- + importances_ : same as return value + pvalues_ : ndarray of shape (n_features,) + Two-sided p-values for each feature. + pvalues_corr_ : ndarray of shape (n_features,) + Multiple testing corrected p-values. + confidence_bound_min_ : ndarray of shape (n_features,) + Lower confidence bounds (only for single task). + confidence_bound_max_ : ndarray of shape (n_features,) + Upper confidence bounds (only for single task). + + Notes + ----- + The method: + 1. Performs nodewise lasso regressions to estimate precision matrix + 2. Debiases initial lasso estimates + 3. Computes confidence intervals and p-values + 4. 
For multi-task case, uses chi-squared or F test + """ + self._check_fit() + rng = check_random_state(self.random_state) + + # centering the data and the target variable + if self.centered: + X_ = StandardScaler(with_std=False).fit_transform(X) + y_ = y - np.mean(y) + else: + X_ = X + y_ = y + n_samples, n_features = X_.shape + assert X_.shape[1] == self.lasso_cv.coef_.shape[-1] + assert self.n_times_ == 1 or self.n_times_ == y.shape[1] + if self.n_times_ > 1: + if self.covariance is not None and self.covariance.shape != ( + self.n_times_, + self.n_times_, + ): + raise ValueError( + f'Shape of "cov" should be ({self.n_times_}, {self.n_times_}),' + + f' the shape of "cov" was ({self.covariance.shape}) instead' + ) + assert y_.shape[1] == self.lasso_cv.coef_.shape[0] + + # define the alphas for the Nodewise Lasso + list_alpha_max = _alpha_max(X_, X_, fill_diagonal=True, axis=0) + alphas = self.alpha_max_fraction * list_alpha_max + gram = np.dot(X_.T, X_) # Gram matrix + + # base on the recomendation of numpy for paralellization of random generator + # see https://numpy.org/doc/stable/reference/random/parallel.html + streams = np.random.SeedSequence(rng.randint(np.iinfo(np.int32).max)).spawn( + n_features ) - for i in range(n_features) - ) - # Unpacking the results - results = np.asarray(results, dtype=object) - Z = np.stack(results[:, 0], axis=1) - precision_diagonal = np.stack(results[:, 1]) + # Calculating precision matrix (Nodewise Lasso) + results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( + delayed(_compute_residuals)( + X=X_, + id_column=i, + clf=clone(self.lasso).set_params( + alpha=alphas[i], + precompute=np.delete(np.delete(gram, i, axis=0), i, axis=1), + random_state=np.random.RandomState( + np.random.default_rng(streams[i]).bit_generator + ), + ), + ) + for i in range(n_features) + ) + # Unpacking the results + results = np.asarray(results, dtype=object) + Z = np.stack(results[:, 0], axis=1) + precision_diagonal = np.stack(results[:, 1]) + + # Computing the degrees of freedom adjustement + if self.dof_ajdustement: + coefficient_max = np.max(np.abs(self.lasso_cv.coef_)) + support = np.sum(np.abs(self.lasso_cv.coef_) > 0.01 * coefficient_max) + support = min(support, n_samples - 1) + dof_factor = n_samples / (n_samples - support) + else: + dof_factor = 1 + + # Computing Desparsified Lasso estimator and confidence intervals + # Estimating the coefficient vector + beta_bias = dof_factor * np.dot(y_.T, Z) / np.sum(X_ * Z, axis=0) + + # beta hat + P = (np.dot(X_.T, Z) / np.sum(X_ * Z, axis=0)).T + P_nodiagonal = P - np.diag(np.diag(P)) + Id = np.identity(n_features) + P_nodiagonal = dof_factor * P_nodiagonal + (dof_factor - 1) * Id + beta_hat = beta_bias.T - P_nodiagonal.dot(self.lasso_cv.coef_.T) + # confidence intervals + precision_diagonal = precision_diagonal * dof_factor**2 + + if self.n_times_ == 1: + # define the quantile for the confidence intervals + quantile = stats.norm.ppf(1 - (1 - self.confidence) / 2) + # see definition of lower and upper bound in algorithm 1 + # in `chevalier2020statisticalthesis`: + # quantile_(1-alpha/2) * (n**(-1/2)) * sigma * (precision_diagonal**(1/2)) + confint_radius = np.abs( + quantile + * self.sigma_hat_ + * np.sqrt(precision_diagonal) + / np.sqrt(n_samples) + ) + self.confidence_bound_max_ = beta_hat + confint_radius + self.confidence_bound_min_ = beta_hat - confint_radius + + pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb( + self.confidence_bound_min_, + self.confidence_bound_max_, + confidence=self.confidence, + 
distribution=self.distribution, + eps=self.epsilon_pvalue, + ) + else: + covariance_hat = self.sigma_hat_ + if self.covariance is not None: + covariance_hat = self.covariance + theta_hat = n_samples * inv(covariance_hat) + # Compute the two-sided p-values + if self.test == "chi2": + chi2_scores = ( + np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) + / precision_diagonal + ) + two_sided_pval = np.minimum( + 2 * stats.chi2.sf(chi2_scores, df=self.n_times_), 1.0 + ) + elif self.test == "F": + f_scores = ( + np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) + / precision_diagonal + / self.n_times_ + ) + two_sided_pval = np.minimum( + 2 * stats.f.sf(f_scores, dfd=n_samples, dfn=self.n_times_), 1.0 + ) + else: + raise ValueError(f"Unknown test '{self.test}'") + + # Compute the p-values + sign_beta = np.sign(np.sum(beta_hat, axis=1)) + pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( + pval_from_two_sided_pval_and_sign(two_sided_pval, sign_beta) + ) - return Z, precision_diagonal + self.importances_ = beta_hat + self.pvalues_ = pval + self.pvalues_corr_ = pval_corr + return self.importances_ + def fit_importance(self, X, y, cv=None): + if cv is not None: + warnings.warn("cv won't be used") + self.fit(X, y) + return self.importance(X, y) -def _compute_residuals(X, id_column, alpha, gram, max_iteration=5000, tolerance=1e-3): + +def _compute_residuals(X, id_column, clf): """ Compute nodewise Lasso regression for desparsified Lasso estimation @@ -488,17 +505,12 @@ def _compute_residuals(X, id_column, alpha, gram, max_iteration=5000, tolerance= Uses sklearn's Lasso with precomputed Gram matrix for efficiency. """ - n_samples, n_features = X.shape + n_samples, _ = X.shape # Removing the column to regress against the others X_minus_i = np.delete(X, id_column, axis=1) X_i = np.copy(X[:, id_column]) - # Method used for computing the residuals of the Nodewise Lasso. 
- # here we use the Lasso method - gram_ = np.delete(np.delete(gram, id_column, axis=0), id_column, axis=1) - clf = Lasso(alpha=alpha, precompute=gram_, max_iter=max_iteration, tol=tolerance) - # Fitting the Lasso model and computing the residuals clf.fit(X_minus_i, X_i) z = X_i - clf.predict(X_minus_i) @@ -508,3 +520,90 @@ def _compute_residuals(X, id_column, alpha, gram, max_iteration=5000, tolerance= precision_diagonal_i = n_samples * np.sum(z**2) / np.dot(X_i, z) ** 2 return z, precision_diagonal_i + + +def desparsified_lasso( + X, + y, + cv=None, + lasso_cv=LassoCV( + eps=1e-2, + fit_intercept=False, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-3, + max_iter=5000, + random_state=0, + ), + lasso=Lasso(max_iter=5000, tol=1e-3), + centered=True, + dof_ajdustement=False, + alpha_max_fraction=0.01, + tolerance_reid=1e-4, + random_state=None, + covariance=None, + noise_method="AR", + order=1, + stationary=True, + confidence=0.95, + distribution="norm", + epsilon_pvalue=1e-14, + test="chi2", + n_jobs=1, + memory=None, + verbose=0, + k_best=None, + percentile=None, + threshold=None, + threshold_pvalue=None, +): + methods = DesparsifiedLasso( + lasso_cv=lasso_cv, + lasso=lasso, + centered=centered, + dof_ajdustement=dof_ajdustement, + alpha_max_fraction=alpha_max_fraction, + tolerance_reid=tolerance_reid, + random_state=random_state, + covariance=covariance, + noise_method=noise_method, + order=order, + stationary=stationary, + confidence=confidence, + distribution=distribution, + epsilon_pvalue=epsilon_pvalue, + test=test, + n_jobs=n_jobs, + memory=memory, + verbose=verbose, + ) + methods.fit_importance(X, y, cv=cv) + selection = methods.selection( + k_best=k_best, + percentile=percentile, + threshold=threshold, + threshold_pvalue=threshold_pvalue, + ) + return selection, methods.importances_, methods.pvalues_ + + +# use the docstring of the class for the function +desparsified_lasso.__doc__ = _aggregate_docstring( + [ + DesparsifiedLasso.__doc__, + DesparsifiedLasso.__init__.__doc__, + DesparsifiedLasso.fit_importance.__doc__, + DesparsifiedLasso.selection.__doc__, + ], + """ + Returns + ------- + selection : ndarray of shape (n_features,) + Boolean array indicating selected features (True = selected) + importances : ndarray of shape (n_features,) + Feature importance scores/test statistics. For features not selected + during screening, scores are set to 0. + pvalues : ndarray of shape (n_features,) + Two-sided p-values for each feature under Gaussian null hypothesis. + For features not selected during screening, p-values are set to 1. 
+ """, +) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index f0a79db4e..7151b190a 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -3,12 +3,10 @@ from joblib import Parallel, delayed from sklearn.utils.validation import check_memory from sklearn.cluster import FeatureAgglomeration +from sklearn.linear_model import LassoCV, MultiTaskLassoCV +from sklearn.model_selection import KFold -from hidimstat.desparsified_lasso import ( - desparsified_lasso, - desparsified_lasso_pvalue, - desparsified_group_lasso_pvalue, -) +from hidimstat.desparsified_lasso import DesparsifiedLasso from hidimstat._utils.bootstrap import _subsampling from hidimstat.statistical_tools.aggregation import quantile_aggregation from hidimstat.statistical_tools.multiple_testing import fdr_threshold @@ -259,24 +257,45 @@ def clustered_inference( X_reduced = clone(scaler_sampling).fit_transform(X_reduced) # inference methods - beta_hat, theta_hat, precision_diag = memory.cache( - desparsified_lasso, ignore=["n_jobs", "verbose", "memory"] + multitasklassoCV = MultiTaskLassoCV( + eps=1e-2, + fit_intercept=False, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-4, + max_iter=5000, + random_state=1, + n_jobs=1, + ) + lasso_cv = LassoCV( + eps=1e-2, + fit_intercept=False, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-4, + max_iter=5000, + random_state=1, + n_jobs=1, + ) + desparsified_lassos = memory.cache( + DesparsifiedLasso( + lasso_cv=( + multitasklassoCV if len(y.shape) > 1 and y.shape[1] > 1 else lasso_cv + ), + n_jobs=n_jobs, + memory=memory, + verbose=verbose, + **kwargs, + ).fit, + ignore=["n_jobs", "verbose", "memory"], )( X_reduced, y, - multioutput=len(y.shape) > 1 and y.shape[1] > 1, # detection of multiOutput - n_jobs=n_jobs, - memory=memory, - verbose=verbose, - **kwargs, ) + desparsified_lassos.importance(X_reduced, y) - return ward_, beta_hat, theta_hat, precision_diag + return ward_, desparsified_lassos -def clustered_inference_pvalue( - n_samples, group, ward, beta_hat, theta_hat, precision_diag, **kwargs -): +def clustered_inference_pvalue(n_samples, group, ward, desparsified_lassos, **kwargs): """ Compute corrected p-values at the cluster level and transform them back to feature level. 
@@ -312,31 +331,15 @@ def clustered_inference_pvalue( 1 - corrected p-values """ # corrected cluster-wise p-values - if not group: - pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = ( - desparsified_lasso_pvalue( - n_samples, - beta_hat, - theta_hat, - precision_diag, - **kwargs, - ) - ) - else: - pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - desparsified_group_lasso_pvalue( - beta_hat, theta_hat, precision_diag, **kwargs - ) - ) # De-grouping beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = _degrouping( ward, - beta_hat, - pval, - pval_corr, - one_minus_pval, - one_minus_pval_corr, + desparsified_lassos.importances_, + desparsified_lassos.pvalues_, + desparsified_lassos.pvalues_corr_, + 1 - desparsified_lassos.pvalues_, + 1 - desparsified_lassos.pvalues_corr_, ) return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr @@ -485,22 +488,18 @@ def ensemble_clustered_inference( ) for i in np.arange(seed, seed + n_bootstraps) ) - list_ward, list_beta_hat, list_theta_hat, list_precision_diag = [], [], [], [] - for ward, beta_hat, theta_hat, precision_diag in results: + list_ward, list_desparsified_lassos = [], [] + for ward, desparsified_lassos in results: list_ward.append(ward) - list_beta_hat.append(beta_hat) - list_theta_hat.append(theta_hat) - list_precision_diag.append(precision_diag) - return list_ward, list_beta_hat, list_theta_hat, list_precision_diag + list_desparsified_lassos.append(desparsified_lassos) + return list_ward, list_desparsified_lassos def ensemble_clustered_inference_pvalue( n_samples, group, list_ward, - list_beta_hat, - list_theta_hat, - list_precision_diag, + list_desparsified_lassos, fdr=0.1, fdr_control="bhq", reshaping_function=None, @@ -571,9 +570,7 @@ def ensemble_clustered_inference_pvalue( n_samples, group, list_ward[i], - list_beta_hat[i], - list_theta_hat[i], - list_precision_diag[i], + list_desparsified_lassos[i], **kwargs, ) for i in range(len(list_ward)) diff --git a/src/hidimstat/noise_std.py b/src/hidimstat/noise_std.py index 3696704e6..6d2fb1369 100644 --- a/src/hidimstat/noise_std.py +++ b/src/hidimstat/noise_std.py @@ -1,116 +1,68 @@ import numpy as np from numpy.linalg import norm from scipy.linalg import solve, toeplitz -from sklearn.linear_model import LassoCV, MultiTaskLassoCV -from sklearn.model_selection import KFold def reid( - X, - y, - epsilon=1e-2, + beta_hat, + residual, tolerance=1e-4, - max_iterance=10000, - n_splits=5, - n_jobs=1, - seed=0, multioutput=False, stationary=True, method="median", order=1, ): """ - Residual sum of squares based estimators for noise standard deviation + Residual sum of squares-based estimators for noise standard deviation estimation. This implementation follows the procedure described in - :footcite:t:`fan2012variance` and :footcite:t:`reid2016study`. It uses Lasso with - cross-validation to estimate both the noise standard deviation and model - coefficients. + :footcite:t:`fan2012variance` and :footcite:t:`reid2016study`. + The beta_hat should correspond to the coefficient of Lasso with + cross-validation, and the residual is based on this model. - For group, the implementation is based on the procedure + For groups, the implementation is based on the procedure from :footcite:t:`chevalier2020statistical`. Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Input data matrix. - - y : ndarray, shape (n_samples,)/(n_samples, n_times) - Target vector. The time means the presence of groups. 
- - epsilon : float, optional (default=1e-2) - Length of the cross-validation path, where alpha_min / alpha_max = eps. - Smaller values create a finer grid. - - tolerance : float, optional (default=1e-4) - Tolerance for optimization convergence. The algorithm stops - when updates are smaller than tol and dual gap is smaller than tol. - - max_iteration : int, optional (default=10000) - Maximum number of iterations for the optimization algorithm. - - n_splits : int, optional (default=5) - Number of folds for cross-validation. - - n_jobs : int, optional (default=1) - Number of parallel jobs for cross-validation. - -1 means using all processors. - - seed : int, optional (default=0) - Random seed for reproducible cross-validation splits. - - stationary : bool, (default=True) + beta_hat : ndarray, shape (n_features,) or (n_times, n_features) + Estimated sparse coefficient vector from regression. + residual : ndarray, shape (n_samples,) or (n_samples, n_times) + Residuals from the regression model. + tolerance : float, default=1e-4 + Threshold for considering coefficients as non-zero. + multioutput : bool, default=False + If True, handles multiple outputs (group case). + stationary : bool, default=True Whether noise has constant magnitude across time steps. - - method : {'median', 'AR'}, (default='simple') - Covariance estimation method: + method : {'median', 'AR'}, default='median' + Method for covariance estimation in multioutput case: - 'median': Uses median correlation between consecutive time steps - 'AR': Uses Yule-Walker method with specified order - order : int, default=1 Order of AR model when method='AR'. Must be < n_times. Returns ------- - sigma_hat/cov_hat : float/ndarray, shape (n_times, n_times) - Estimated noise standard deviation based on residuals - or estimated covariance matrix for group. + sigma_hat_raw or covariance_hat : float or ndarray + For single output: estimated noise standard deviation + For multiple outputs: estimated (n_times, n_times) covariance matrix - beta_hat : ndarray, shape (n_features,)/(n_features, n_times) - Estimated sparse coefficient vector from Lasso regression. + Notes + ----- + Implementation based on :footcite:t:`reid2016study` for single output + and :footcite:t:`chevalier2020statistical` for multiple outputs. References ---------- .. 
footbibliography:: """ - - X_ = np.asarray(X) - n_samples, n_features = X_.shape if multioutput: - n_times = y.shape[1] - - # check if max_iter is large enough - if max_iterance // n_splits <= n_features: - max_iterance = n_features * n_splits - print(f"'max_iter' has been increased to {max_iterance}") - - # use the cross-validation for define the best alpha of Lasso - cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed) - Refit_CV = MultiTaskLassoCV if multioutput else LassoCV - clf_cv = Refit_CV( - eps=epsilon, - fit_intercept=False, - cv=cv, - tol=tolerance, - max_iter=max_iterance, - n_jobs=n_jobs, - ) - clf_cv.fit(X_, y) - - # Estimate the support of the variable importance - beta_hat = clf_cv.coef_ - residual = clf_cv.predict(X_) - y + n_times = beta_hat.shape[0] + else: + n_times = None + n_samples = residual.shape[0] # get the number of non-zero coefficients # we consider that the coefficient with a value under @@ -129,7 +81,7 @@ def reid( sigma_hat_raw = norm(residual, axis=0) / np.sqrt(n_samples - size_support) if not multioutput: - return sigma_hat_raw, beta_hat + return sigma_hat_raw ## Computation of the covariance matrix for group else: @@ -214,7 +166,7 @@ def reid( # COV(X_t, X_t) = COR(X_t, X_t) * \sigma^2 covariance_hat = np.outer(sigma_hat, sigma_hat) * correlation_hat - return covariance_hat, beta_hat + return covariance_hat def empirical_snr(X, y, beta, noise=None): diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 44d24634b..61fd31b3f 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -6,12 +6,10 @@ import numpy as np from numpy.testing import assert_almost_equal, assert_equal from scipy.linalg import toeplitz +from sklearn.linear_model import MultiTaskLassoCV +from sklearn.model_selection import KFold -from hidimstat.desparsified_lasso import ( - desparsified_lasso, - desparsified_lasso_pvalue, - desparsified_group_lasso_pvalue, -) +from hidimstat.desparsified_lasso import DesparsifiedLasso, desparsified_lasso from hidimstat._utils.scenario import multivariate_simulation @@ -37,27 +35,23 @@ def test_desparsified_lasso(): expected_pval_corr = np.ones_like(beta) * 0.5 expected_pval_corr[beta != 0] = 0.0 - beta_hat, sigma_hat, precision_diag = desparsified_lasso(X, y) - pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = ( - desparsified_lasso_pvalue( - X.shape[0], beta_hat, sigma_hat, precision_diag, confidence=0.99 - ) - ) - assert_almost_equal(beta_hat, beta, decimal=1) - assert_equal(cb_min < beta, True) - assert_equal(cb_max > beta, True) - assert_almost_equal(pval_corr, expected_pval_corr, decimal=1) - - beta_hat, sigma_hat, precision_diag = desparsified_lasso(X, y, dof_ajdustement=True) - pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = ( - desparsified_lasso_pvalue( - X.shape[0], beta_hat, sigma_hat, precision_diag, confidence=0.99 - ) + desparsified_lasso = DesparsifiedLasso(confidence=0.99, random_state=2).fit(X, y) + importances = desparsified_lasso.importance(X, y) + + assert_almost_equal(importances, beta, decimal=1) + assert_equal(desparsified_lasso.confidence_bound_min_ < beta, True) + assert_equal(desparsified_lasso.confidence_bound_max_ > beta, True) + assert_almost_equal(desparsified_lasso.pvalues_corr_, expected_pval_corr, decimal=1) + + desparsified_lasso = DesparsifiedLasso(dof_ajdustement=True, confidence=0.99).fit( + X, y ) - assert_almost_equal(beta_hat, beta, decimal=1) - assert_equal(cb_min < beta, True) - assert_equal(cb_max 
> beta, True) - assert_almost_equal(pval_corr, expected_pval_corr, decimal=1) + importances = desparsified_lasso.importance(X, y) + + assert_almost_equal(importances, beta, decimal=1) + assert_equal(desparsified_lasso.confidence_bound_min_ < beta, True) + assert_equal(desparsified_lasso.confidence_bound_max_ > beta, True) + assert_almost_equal(desparsified_lasso.pvalues_corr_, expected_pval_corr, decimal=1) def test_desparsified_group_lasso(): @@ -73,8 +67,17 @@ def test_desparsified_group_lasso(): signal_noise_ratio = 5000 rho_serial = 0.9 corr = toeplitz(np.geomspace(1, rho_serial ** (n_target - 1), n_target)) + multitasklassoCV = MultiTaskLassoCV( + eps=1e-2, + fit_intercept=False, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-4, + max_iter=5000, + random_state=1, + n_jobs=1, + ) - X, Y, beta, noise = multivariate_simulation( + X, y, beta, noise = multivariate_simulation( n_samples=n_samples, n_features=n_features, n_targets=n_target, @@ -84,32 +87,39 @@ def test_desparsified_group_lasso(): seed=10, ) - beta_hat, theta_hat, precision_diag = desparsified_lasso( - X, Y, multioutput=True, covariance=corr - ) - pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - desparsified_group_lasso_pvalue(beta_hat, theta_hat, precision_diag) - ) + desparsified_lasso = DesparsifiedLasso( + lasso_cv=multitasklassoCV, covariance=corr + ).fit(X, y) + importances = desparsified_lasso.importance(X, y) + + assert_almost_equal(importances, beta, decimal=1) expected_pval_corr = np.ones_like(beta[:, 0]) * 0.5 expected_pval_corr[beta[:, 0] != 0] = 0.0 - assert_almost_equal(beta_hat, beta, decimal=1) - assert_almost_equal(pval_corr, expected_pval_corr, decimal=1) + assert_almost_equal(importances, beta, decimal=1) + assert_almost_equal(desparsified_lasso.pvalues_corr_, expected_pval_corr, decimal=1) - beta_hat, theta_hat, precision_diag = desparsified_lasso(X, Y, multioutput=True) - pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - desparsified_group_lasso_pvalue(beta_hat, theta_hat, precision_diag, test="F") + desparsified_lasso = DesparsifiedLasso(lasso_cv=multitasklassoCV, test="F").fit( + X, y ) + importances = desparsified_lasso.importance(X, y) - assert_almost_equal(beta_hat, beta, decimal=1) - assert_almost_equal(pval_corr, expected_pval_corr, decimal=1) + assert_almost_equal(importances, beta, decimal=1) + assert_almost_equal(desparsified_lasso.pvalues_corr_, expected_pval_corr, decimal=1) # Testing error is raised when the covariance matrix has wrong shape bad_cov = np.delete(corr, 0, axis=1) - np.testing.assert_raises( - ValueError, desparsified_lasso, X=X, y=Y, multioutput=True, covariance=bad_cov - ) - - with pytest.raises(ValueError, match="Unknown test 'r2'"): - desparsified_group_lasso_pvalue(beta_hat, theta_hat, precision_diag, test="r2") + # np.testing.assert_raises( + # ValueError, desparsified_lasso, X=X, y=y, multioutput=True, covariance=bad_cov + # ) + desparsified_lasso = DesparsifiedLasso( + lasso_cv=multitasklassoCV, covariance=bad_cov + ).fit(X, y) + with pytest.raises(ValueError): + desparsified_lasso.importance(X, y) + + with pytest.raises(AssertionError, match="Unknown test 'r2'"): + DesparsifiedLasso(lasso_cv=multitasklassoCV, covariance=bad_cov, test="r2").fit( + X, y + ) diff --git a/test/test_ensemble_clustered_inference.py b/test/test_ensemble_clustered_inference.py index 2642fed9b..76e33bb7c 100644 --- a/test/test_ensemble_clustered_inference.py +++ b/test/test_ensemble_clustered_inference.py @@ -58,14 +58,12 @@ def 
test_clustered_inference_no_temporal(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - ward_, beta_hat, theta_hat, precision_diag = clustered_inference( + ward_, desparsified_lassos = clustered_inference( X_init, y, ward, n_clusters, scaler_sampling=StandardScaler() ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - clustered_inference_pvalue( - n_samples, None, ward_, beta_hat, theta_hat, precision_diag - ) + clustered_inference_pvalue(n_samples, None, ward_, desparsified_lassos) ) expected = 0.5 * np.ones(n_features) @@ -115,14 +113,12 @@ def test_clustered_inference_temporal(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - ward_, beta_hat, theta_hat, precision_diag = clustered_inference( + ward_, desparsified_lassos = clustered_inference( X, y, ward, n_clusters, scaler_sampling=StandardScaler() ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - clustered_inference_pvalue( - n_samples, True, ward_, beta_hat, theta_hat, precision_diag - ) + clustered_inference_pvalue(n_samples, True, ward_, desparsified_lassos) ) expected = 0.5 * np.ones(n_features) @@ -186,13 +182,13 @@ def test_clustered_inference_no_temporal_groups(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - ward_, beta_hat, theta_hat, precision_diag = clustered_inference( + ward_, desparsified_lassos = clustered_inference( X_, y_, ward, n_clusters, groups=groups, scaler_sampling=StandardScaler() ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( clustered_inference_pvalue( - n_groups * n_samples, False, ward_, beta_hat, theta_hat, precision_diag + n_groups * n_samples, False, ward_, desparsified_lassos ) ) @@ -243,23 +239,16 @@ def test_ensemble_clustered_inference(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - list_ward, list_beta_hat, list_theta_hat, list_precision_diag = ( - ensemble_clustered_inference( - X_init, - y, - ward, - n_clusters, - scaler_sampling=StandardScaler(), - n_bootstraps=n_bootstraps, - ) + list_ward, list_desparsified_lassos = ensemble_clustered_inference( + X_init, + y, + ward, + n_clusters, + scaler_sampling=StandardScaler(), + n_bootstraps=n_bootstraps, ) beta_hat, selected = ensemble_clustered_inference_pvalue( - n_samples, - False, - list_ward, - list_beta_hat, - list_theta_hat, - list_precision_diag, + n_samples, False, list_ward, list_desparsified_lassos ) expected = np.zeros(n_features) @@ -308,23 +297,19 @@ def test_ensemble_clustered_inference_temporal_data(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - list_ward, list_beta_hat, list_theta_hat, list_precision_diag = ( - ensemble_clustered_inference( - X, - y, - ward, - n_clusters, - scaler_sampling=StandardScaler(), - n_bootstraps=n_bootstraps, - ) + list_ward, list_desparsified_lassos = ensemble_clustered_inference( + X, + y, + ward, + n_clusters, + scaler_sampling=StandardScaler(), + n_bootstraps=n_bootstraps, ) beta_hat, selected = ensemble_clustered_inference_pvalue( n_samples, True, list_ward, - list_beta_hat, - list_theta_hat, - list_precision_diag, + list_desparsified_lassos, fdr_control="bhq", ) @@ -343,9 +328,7 @@ def test_ensemble_clustered_inference_temporal_data(): n_samples, True, list_ward, - list_beta_hat, - list_theta_hat, - list_precision_diag, + list_desparsified_lassos, fdr_control="bhy", ) diff --git a/test/test_noise_std.py b/test/test_noise_std.py index de4f04637..be2a56132 100644 --- a/test/test_noise_std.py +++ b/test/test_noise_std.py @@ -10,6 +10,9 
@@ from hidimstat.noise_std import empirical_snr, reid from hidimstat._utils.scenario import multivariate_simulation +from sklearn.linear_model import LassoCV, MultiTaskLassoCV +from sklearn.model_selection import KFold + def test_reid(): """Estimating noise standard deviation in two scenarios. @@ -31,9 +34,11 @@ def test_reid(): signal_noise_ratio=signal_noise_ratio, seed=0, ) + lasso_cv = LassoCV(n_jobs=1).fit(X, y) + residual = lasso_cv.predict(X) - y # max_iter=1 to get a better coverage - sigma_hat, _ = reid(X, y, tolerance=1e-3, max_iterance=1) + sigma_hat = reid(lasso_cv.coef_, residual, tolerance=1e-3) expected_sigma = support_size / signal_noise_ratio error_relative = np.abs(sigma_hat - expected_sigma) / expected_sigma assert error_relative < 0.3 @@ -49,8 +54,10 @@ def test_reid(): signal_noise_ratio=signal_noise_ratio, seed=2, ) + lasso_cv = LassoCV(n_jobs=1).fit(X, y) + residual = lasso_cv.predict(X) - y - sigma_hat, _ = reid(X, y) + sigma_hat = reid(lasso_cv.coef_, residual) expected_sigma = 1.0 # when there is no signal, the variance is 1.0 error_relative = np.abs(sigma_hat - expected_sigma) / expected_sigma assert error_relative < 0.2 @@ -70,7 +77,7 @@ def test_group_reid(): # First expe # ########## support_size = 2 - X, Y, beta, noise = multivariate_simulation( + X, y, beta, noise = multivariate_simulation( n_samples=n_samples, n_features=n_features, n_targets=n_target, @@ -83,16 +90,19 @@ def test_group_reid(): corr = toeplitz(np.geomspace(1, rho_serial ** (n_target - 1), n_target)) cov = support_size / signal_noise_ratio * corr + lasso_cv = MultiTaskLassoCV(n_jobs=1).fit(X, y) + residual = lasso_cv.predict(X) - y + # max_iter=1 to get a better coverage - cov_hat, _ = reid(X, Y, multioutput=True, tolerance=1e-3, max_iterance=1) + cov_hat = reid(lasso_cv.coef_, residual, multioutput=True, tolerance=1e-3) error_relative = np.abs(cov_hat - cov) / cov assert np.max(error_relative) < 0.3 - cov_hat, _ = reid(X, Y, multioutput=True, method="AR") + cov_hat = reid(lasso_cv.coef_, residual, multioutput=True, method="AR") error_relative = np.abs(cov_hat - cov) / cov assert np.max(error_relative) < 0.3 - cov_hat, _ = reid(X, Y, multioutput=True, stationary=False) + cov_hat = reid(lasso_cv.coef_, residual, multioutput=True, stationary=False) error_relative = np.abs(cov_hat - cov) / cov assert np.max(error_relative) > 0.3 @@ -111,7 +121,7 @@ def test_group_reid_2(): # Second expe # ########### support_size = 0 - X, Y, beta, noise = multivariate_simulation( + X, y, beta, noise = multivariate_simulation( n_samples=n_samples, n_features=n_features, n_targets=n_target, @@ -124,15 +134,18 @@ def test_group_reid_2(): corr = toeplitz(rho_serial ** np.arange(0, n_target)) # covariance matrix of time cov = 1.0 * corr - cov_hat, _ = reid(X, Y, multioutput=True) + lasso_cv = MultiTaskLassoCV(n_jobs=1).fit(X, y) + residual = lasso_cv.predict(X) - y + + cov_hat = reid(lasso_cv.coef_, residual, multioutput=True) error_relative = np.abs(cov_hat - cov) / cov assert np.max(error_relative) < 0.3 - cov_hat, _ = reid(X, Y, multioutput=True, method="AR") + cov_hat = reid(lasso_cv.coef_, residual, multioutput=True, method="AR") error_relative = np.abs(cov_hat - cov) / cov assert np.max(error_relative) < 0.3 - cov_hat, _ = reid(X, Y, multioutput=True, stationary=False) + cov_hat = reid(lasso_cv.coef_, residual, multioutput=True, stationary=False) error_relative = np.abs(cov_hat - cov) / cov assert np.max(error_relative) > 0.3 From ac987d8e975597f459e2e908b381b86f5c989a61 Mon Sep 17 00:00:00 2001 From: kusch 
lionel Date: Fri, 5 Sep 2025 13:44:03 +0200 Subject: [PATCH 11/93] fix desparsified lasso and the example --- examples/plot_2D_simulation_example.py | 40 +++++++++++--------------- src/hidimstat/desparsified_lasso.py | 6 ++-- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index 82b888146..fc0b437e2 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -55,11 +55,9 @@ from sklearn.cluster import FeatureAgglomeration from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction import image +from sklearn.linear_model import MultiTaskLassoCV -from hidimstat.desparsified_lasso import ( - desparsified_lasso, - desparsified_lasso_pvalue, -) +from hidimstat import DesparsifiedLasso from hidimstat.ensemble_clustered_inference import ( clustered_inference, clustered_inference_pvalue, @@ -242,18 +240,18 @@ def plot(maps, titles): # and referred to as Desparsified Lasso. # compute desparsified lasso -beta_hat, sigma_hat, precision_diagonal = desparsified_lasso(X_init, y, n_jobs=n_jobs) -pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = ( - desparsified_lasso_pvalue(X_init.shape[0], beta_hat, sigma_hat, precision_diagonal) -) +desparsified_lasso = DesparsifiedLasso(n_jobs=n_jobs).fit(X_init, y) +desparsified_lasso.importance(X_init, y) + # compute estimated support (first method) -zscore = zscore_from_pval(pval, one_minus_pval) +zscore = zscore_from_pval(desparsified_lasso.pvalues_, 1 - desparsified_lasso.pvalues_) selected_dl = zscore > thr_nc # use the "no clustering threshold" # compute estimated support (second method) selected_dl = np.logical_or( - pval_corr < fwer_target / 2, one_minus_pval_corr < fwer_target / 2 + desparsified_lasso.pvalues_corr_ < fwer_target / 2, + 1 - desparsified_lasso.pvalues_corr_ < fwer_target / 2, ) ############################################################################# @@ -269,11 +267,11 @@ def plot(maps, titles): ) # clustered desparsified lasso (CluDL) -ward_, beta_hat, theta_hat, omega_diag = clustered_inference( +ward_, desparsified_lasso = clustered_inference( X_init, y, ward, n_clusters, scaler_sampling=StandardScaler() ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - clustered_inference_pvalue(n_samples, False, ward_, beta_hat, theta_hat, omega_diag) + clustered_inference_pvalue(n_samples, False, ward_, desparsified_lasso) ) # compute estimated support (first method) @@ -293,22 +291,18 @@ def plot(maps, titles): # solutions are then aggregated into one. 
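The aggregation step mentioned above can be pictured with the quantile_aggregation
helper already used by this module; a toy sketch (made-up p-values; with gamma=0.5
the aggregate is, roughly, the per-feature median p-value scaled by 1/gamma):

    import numpy as np
    from hidimstat.statistical_tools.aggregation import quantile_aggregation

    # one row per bootstrapped clustering, one column per feature
    pvals = np.array([[0.01, 0.40, 0.20],
                      [0.03, 0.35, 0.25],
                      [0.02, 0.50, 0.15]])
    print(quantile_aggregation(pvals, gamma=0.5))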
# ensemble of clustered desparsified lasso (EnCluDL) -list_ward, list_beta_hat, list_theta_hat, list_omega_diag = ( - ensemble_clustered_inference( - X_init, - y, - ward, - n_clusters, - scaler_sampling=StandardScaler(), - ) +list_ward, list_desparsified_lasso = ensemble_clustered_inference( + X_init, + y, + ward, + n_clusters, + scaler_sampling=StandardScaler(), ) beta_hat, selected_ecdl = ensemble_clustered_inference_pvalue( n_samples, False, list_ward, - list_beta_hat, - list_theta_hat, - list_omega_diag, + list_desparsified_lasso, fdr=fwer_target, ) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 3fcd89ba3..76c9e1ea7 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -233,10 +233,10 @@ def fit(self, X, y): except NotFittedError: # check if max_iter is large enough if self.lasso_cv.max_iter // self.lasso_cv.cv.n_splits <= n_features: - self.lasso_cv.set_params(max_iter=n_features * self.lasso_cv.n_splits) - Warning( - f"'max_iter' has been increased to {self.lasso_cv.max_iterance}" + self.lasso_cv.set_params( + max_iter=n_features * self.lasso_cv.cv.n_splits ) + Warning(f"'max_iter' has been increased to {self.lasso_cv.max_iter}") # use the cross-validation for define the best alpha of Lasso self.lasso_cv.set_params(n_jobs=self.n_jobs) self.lasso_cv.fit(X_, y_) From bfa57185c92cdf756529e3791819c0b4bc2688a5 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 16:25:14 +0200 Subject: [PATCH 12/93] fix example --- examples/plot_fmri_data_example.py | 67 +++++++++++-------- src/hidimstat/desparsified_lasso.py | 2 +- src/hidimstat/ensemble_clustered_inference.py | 44 ++++++------ 3 files changed, 62 insertions(+), 51 deletions(-) diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 6ca4d1d14..fa8055724 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -40,7 +40,9 @@ from matplotlib.pyplot import get_cmap from nilearn import datasets from nilearn.image import mean_img +from sklearn.linear_model import LassoCV from nilearn.maskers import NiftiMasker +from sklearn.model_selection import KFold from nilearn.plotting import plot_stat_map, show from sklearn.cluster import FeatureAgglomeration from sklearn.preprocessing import StandardScaler @@ -55,10 +57,7 @@ ensemble_clustered_inference, ensemble_clustered_inference_pvalue, ) -from hidimstat.desparsified_lasso import ( - desparsified_lasso, - desparsified_lasso_pvalue, -) +from hidimstat.desparsified_lasso import DesparsifiedLasso from hidimstat.statistical_tools.p_values import zscore_from_pval @@ -144,6 +143,16 @@ def preprocess_haxby(subject=2, memory=None): # Making the inference with several algorithms # -------------------------------------------- +estimator = LassoCV( + eps=1e-2, + fit_intercept=False, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-2, + max_iter=4000, + random_state=1, + n_jobs=1, +) + ############################################################################# # First, we try to recover the discriminative pattern by computing # p-values from desparsified lasso. @@ -151,12 +160,10 @@ def preprocess_haxby(subject=2, memory=None): # of 5 G for memory. To handle this problem, the following methods use some # feature aggregation methods. 
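A rough back-of-envelope check of the memory figure quoted above, assuming on the
order of 25,000 voxels survive the mask (an assumed size, for illustration only):
the Nodewise Lasso manipulates a p x p Gram matrix in float64,

    n_voxels = 25_000                 # assumed mask size
    gram_bytes = n_voxels**2 * 8      # one float64 p x p matrix
    print(gram_bytes / 1e9)           # ~5.0, i.e. about 5 GB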
try: - beta_hat, sigma_hat, precision_diagonal = desparsified_lasso( - X, y, noise_method="median", max_iteration=1000 - ) - pval_dl, _, one_minus_pval_dl, _, cb_min, cb_max = desparsified_lasso_pvalue( - X.shape[0], beta_hat, sigma_hat, precision_diagonal - ) + desparsified_lasso = DesparsifiedLasso(noise_method="median", lasso_cv=estimator) + desparsified_lasso.fit_importance(X, y) + pval_dl = desparsified_lasso.pvalues_ + one_minus_pval_dl = 1 - pval_dl except MemoryError as err: pval_dl = None one_minus_pval_dl = None @@ -165,11 +172,16 @@ def preprocess_haxby(subject=2, memory=None): ############################################################################# # Now, the clustered inference algorithm which combines parcellation # and high-dimensional inference (c.f. References). -ward_, beta_hat, theta_hat, omega_diag = clustered_inference( - X, y, ward, n_clusters, scaler_sampling=StandardScaler(), tolerance=1e-2 +ward_, cl_desparsified_lasso = clustered_inference( + X, + y, + ward, + n_clusters, + scaler_sampling=StandardScaler(), + lasso_cv=estimator, # , tolerance=1e-2 ) beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue( - X.shape[0], None, ward_, beta_hat, theta_hat, omega_diag + X.shape[0], None, ward_, cl_desparsified_lasso ) ############################################################################# @@ -180,27 +192,24 @@ def preprocess_haxby(subject=2, memory=None): # then 5 statistical maps are produced and aggregated into one. # However you might benefit from clustering randomization taking # `n_bootstraps=25` or `n_bootstraps=100`, also we set `n_jobs=2`. -list_ward, list_beta_hat, list_theta_hat, list_omega_diag = ( - ensemble_clustered_inference( - X, - y, - ward, - n_clusters, - groups=groups, - scaler_sampling=StandardScaler(), - n_bootstraps=5, - max_iteration=6000, - tolerance=1e-2, - n_jobs=2, - ) +list_ward, list_cl_desparsified_lasso = ensemble_clustered_inference( + X, + y, + ward, + n_clusters, + groups=groups, + scaler_sampling=StandardScaler(), + n_bootstraps=5, + lasso_cv=estimator, + # max_iteration=6000, + # tolerance=1e-2, + n_jobs=2, ) beta_hat, selected = ensemble_clustered_inference_pvalue( X.shape[0], False, list_ward, - list_beta_hat, - list_theta_hat, - list_omega_diag, + list_cl_desparsified_lasso, fdr=0.1, ) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 76c9e1ea7..7b65e8be6 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -160,7 +160,7 @@ def __init__( elif issubclass(MultiTaskLassoCV, lasso_cv.__class__): self.n_times_ = -1 else: - raise ValueError("lasso_cv need to be a Lasso or a MultiTaskLassoCV") + raise ValueError("lasso_cv need to be a LassoCV or a MultiTaskLassoCV") self.lasso_cv = lasso_cv self.centered = centered self.dof_ajdustement = dof_ajdustement diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 7151b190a..25ba91688 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -257,29 +257,31 @@ def clustered_inference( X_reduced = clone(scaler_sampling).fit_transform(X_reduced) # inference methods - multitasklassoCV = MultiTaskLassoCV( - eps=1e-2, - fit_intercept=False, - cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-4, - max_iter=5000, - random_state=1, - n_jobs=1, - ) - lasso_cv = LassoCV( - eps=1e-2, - fit_intercept=False, - cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-4, 
-        max_iter=5000,
-        random_state=1,
-        n_jobs=1,
-    )
+    if "lasso_cv" in kwargs and kwargs["lasso_cv"] is not None:
+        pass
+    elif len(y.shape) > 1 and y.shape[1] > 1:
+        kwargs["lasso_cv"] = MultiTaskLassoCV(
+            eps=1e-2,
+            fit_intercept=False,
+            cv=KFold(n_splits=5, shuffle=True, random_state=0),
+            tol=1e-4,
+            max_iter=5000,
+            random_state=1,
+            n_jobs=1,
+        )
+    else:
+        kwargs["lasso_cv"] = LassoCV(
+            eps=1e-2,
+            fit_intercept=False,
+            cv=KFold(n_splits=5, shuffle=True, random_state=0),
+            tol=1e-4,
+            max_iter=5000,
+            random_state=1,
+            n_jobs=1,
+        )
+
     desparsified_lassos = memory.cache(
         DesparsifiedLasso(
-            lasso_cv=(
-                multitasklassoCV if len(y.shape) > 1 and y.shape[1] > 1 else lasso_cv
-            ),
             n_jobs=n_jobs,
             memory=memory,
             verbose=verbose,

From fc1f941a86ad80cd570501a9a3836db63d5fc7ad Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Mon, 8 Sep 2025 17:33:14 +0200
Subject: [PATCH 13/93] add tests

---
 src/hidimstat/desparsified_lasso.py |  25 ++++++-
 test/test_desparsified_lasso.py     | 101 ++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 3 deletions(-)

diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py
index 7b65e8be6..e31d14c68 100644
--- a/src/hidimstat/desparsified_lasso.py
+++ b/src/hidimstat/desparsified_lasso.py
@@ -153,14 +153,14 @@ def __init__(
 
         assert issubclass(
             Lasso, lasso.__class__
-        ), "lasso needs to be a Lasso or a MultiTaskLassoCV"
+        ), "lasso needs to be a Lasso or a MultiTaskLasso"
         self.lasso = lasso
         if issubclass(LassoCV, lasso_cv.__class__):
             self.n_times_ = 1
         elif issubclass(MultiTaskLassoCV, lasso_cv.__class__):
             self.n_times_ = -1
         else:
-            raise ValueError("lasso_cv need to be a LassoCV or a MultiTaskLassoCV")
+            raise AssertionError("lasso_cv needs to be a LassoCV or a MultiTaskLassoCV")
         self.lasso_cv = lasso_cv
         self.centered = centered
         self.dof_ajdustement = dof_ajdustement
@@ -236,7 +236,9 @@ def fit(self, X, y):
                 self.lasso_cv.set_params(
                     max_iter=n_features * self.lasso_cv.cv.n_splits
                 )
-            Warning(f"'max_iter' has been increased to {self.lasso_cv.max_iter}")
+                warnings.warn(
+                    f"'max_iter' has been increased to {self.lasso_cv.max_iter}"
+                )
             # use the cross-validation for define the best alpha of Lasso
             self.lasso_cv.set_params(n_jobs=self.n_jobs)
             self.lasso_cv.fit(X_, y_)
@@ -458,6 +460,23 @@ def importance(self, X, y):
         return self.importances_
 
     def fit_importance(self, X, y, cv=None):
+        """Fit and compute variable importance in one step.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data matrix.
+        y : array-like of shape (n_samples,) or (n_samples, n_times)
+            Target values. For single task, y should be 1D or (n_samples, 1).
+            For multi-task, y should be (n_samples, n_times).
+        cv : object
+            Not used. Cross-validation is controlled by lasso_cv parameter.
+
+        Returns
+        -------
+        importances_ : ndarray of shape (n_features,) or (n_features, n_times)
+            Desparsified lasso coefficient estimates.
+ """ if cv is not None: warnings.warn("cv won't be used") self.fit(X, y) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 61fd31b3f..22e9fda87 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -8,6 +8,8 @@ from scipy.linalg import toeplitz from sklearn.linear_model import MultiTaskLassoCV from sklearn.model_selection import KFold +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LassoCV from hidimstat.desparsified_lasso import DesparsifiedLasso, desparsified_lasso from hidimstat._utils.scenario import multivariate_simulation @@ -123,3 +125,102 @@ def test_desparsified_group_lasso(): DesparsifiedLasso(lasso_cv=multitasklassoCV, covariance=bad_cov, test="r2").fit( X, y ) + + +def test_exception(): + n_samples = 50 + n_features = 100 + n_target = 10 + support_size = 2 + signal_noise_ratio = 5000 + rho_serial = 0.9 + corr = toeplitz(np.geomspace(1, rho_serial ** (n_target - 1), n_target)) + multitasklassoCV = MultiTaskLassoCV( + eps=1e-2, + fit_intercept=False, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-4, + max_iter=5000, + random_state=1, + n_jobs=1, + ) + + X, y, beta, noise = multivariate_simulation( + n_samples=n_samples, + n_features=n_features, + n_targets=n_target, + support_size=support_size, + rho_serial=rho_serial, + signal_noise_ratio=signal_noise_ratio, + seed=10, + ) + + with pytest.raises( + AssertionError, match="lasso needs to be a Lasso or a MultiTaskLasso" + ): + DesparsifiedLasso(lasso=RandomForestClassifier()) + with pytest.raises( + AssertionError, match="lasso_cv needs to be a LassoCV or a MultiTaskLassoCV" + ): + DesparsifiedLasso(lasso_cv=RandomForestClassifier()) + with pytest.raises(AssertionError, match="Unknown test 'r2'"): + DesparsifiedLasso(test="r2") + desparsified_lasso = DesparsifiedLasso(lasso_cv=multitasklassoCV) + with pytest.raises( + ValueError, + match="The Desparsified Lasso requires to be fit before any analysis", + ): + desparsified_lasso.importance(X, y) + desparsified_lasso.sigma_hat_ = [] + with pytest.raises( + ValueError, + match="The Desparsified Lasso requires to be fit before any analysis", + ): + desparsified_lasso.importance(X, y) + + desparsified_lasso = DesparsifiedLasso(lasso_cv=multitasklassoCV).fit(X, y) + with pytest.raises(ValueError, match="Unknown test 'r2'"): + desparsified_lasso.test = "r2" + desparsified_lasso.importance(X, y) + + +def test_warning(): + n_samples, n_features = 52, 50 + support_size = 1 + signal_noise_ratio = 50 + rho = 0.0 + + X, y, beta, noise = multivariate_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + signal_noise_ratio=signal_noise_ratio, + rho=rho, + shuffle=False, + seed=10, + ) + desparsified_lasso = DesparsifiedLasso( + lasso_cv=LassoCV(cv=KFold(n_splits=2), max_iter=10) + ) + with pytest.warns(Warning, match="'max_iter' has been increased to"): + with pytest.warns(Warning, match="cv won't be used"): + desparsified_lasso.fit_importance(X, y, cv=[]) + + +def test_function_not_center(): + "Test function and not centered" + n_samples, n_features = 52, 50 + support_size = 1 + signal_noise_ratio = 50 + rho = 0.0 + + X, y, beta, noise = multivariate_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + signal_noise_ratio=signal_noise_ratio, + rho=rho, + shuffle=False, + seed=10, + ) + selection, importances, pvalues = desparsified_lasso(X, y, centered=False) From 
3e645884546b880cbf8e50474551b882a34b7375 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 17:51:58 +0200 Subject: [PATCH 14/93] Add Encldel and Cluster --- docs/src/api.rst | 6 +- src/hidimstat/__init__.py | 19 +- src/hidimstat/ensemble_clustered_inference.py | 647 +++++++++++------- test/test_ensemble_clustered_inference.py | 136 ++-- 4 files changed, 476 insertions(+), 332 deletions(-) diff --git a/docs/src/api.rst b/docs/src/api.rst index 0e14b6106..24c85e9bf 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -17,10 +17,6 @@ Functions :template: function.rst quantile_aggregation - clustered_inference - clustered_inference_pvalue - ensemble_clustered_inference - ensemble_clustered_inference_pvalue model_x_knockoff reid @@ -38,3 +34,5 @@ Classes PFI D0CRT DesparsifiedLasso + ClusteredInference + EnsembleClusteredInference diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index 63d3e438c..201fcb89e 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -1,13 +1,7 @@ from .base_variable_importance import BaseVariableImportance from .base_perturbation import BasePerturbation -from .ensemble_clustered_inference import ( - clustered_inference, - clustered_inference_pvalue, -) -from .ensemble_clustered_inference import ( - ensemble_clustered_inference, - ensemble_clustered_inference_pvalue, -) +from .ensemble_clustered_inference import ClusteredInference +from .ensemble_clustered_inference import EnsembleClusteredInference from .desparsified_lasso import desparsified_lasso, DesparsifiedLasso from .distilled_conditional_randomization_test import d0crt, D0CRT from .conditional_feature_importance import CFI @@ -30,14 +24,13 @@ __all__ = [ "quantile_aggregation", - "clustered_inference", - "clustered_inference_pvalue", - "ensemble_clustered_inference", - "ensemble_clustered_inference_pvalue", + "ClusteredInference", + "EnsembleClusteredInference", "d0crt", "D0CRT", "desparsified_lasso", - "DesparsifiedLasso" "reid", + "DesparsifiedLasso", + "reid", "model_x_knockoff", "model_x_knockoff_pvalue", "model_x_knockoff_bootstrap_quantile", diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 25ba91688..0498cffe6 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -1,15 +1,17 @@ +import warnings + import numpy as np from sklearn.base import clone from joblib import Parallel, delayed from sklearn.utils.validation import check_memory +from sklearn.exceptions import NotFittedError from sklearn.cluster import FeatureAgglomeration -from sklearn.linear_model import LassoCV, MultiTaskLassoCV -from sklearn.model_selection import KFold +from sklearn.base import check_is_fitted +from sklearn.utils import check_random_state from hidimstat.desparsified_lasso import DesparsifiedLasso +from hidimstat.base_variable_importance import BaseVariableImportance from hidimstat._utils.bootstrap import _subsampling -from hidimstat.statistical_tools.aggregation import quantile_aggregation -from hidimstat.statistical_tools.multiple_testing import fdr_threshold def _ungroup_beta(beta_hat, n_features, ward): @@ -141,20 +143,7 @@ def _ward_clustering(X_init, ward, train_index): return X_reduced, ward -def clustered_inference( - X_init, - y, - ward, - n_clusters, - scaler_sampling=None, - train_size=1.0, - groups=None, - seed=0, - n_jobs=1, - memory=None, - verbose=1, - **kwargs, -): +class ClusteredInference(BaseVariableImportance): """ Clustered inference 
algorithm for statistical analysis of high-dimensional data.
@@ -231,31 +220,73 @@ def clustered_inference(
     3. Transform data to cluster space
     4. Perform statistical inference using desparsified lasso
     """
-    memory = check_memory(memory=memory)
-    assert issubclass(
-        ward.__class__, FeatureAgglomeration
-    ), "ward need to an instance of sklearn.cluster.FeatureAgglomeration"
-
-    n_samples, n_features = X_init.shape
-
-    if verbose > 0:
-        print(
-            f"Clustered inference: n_clusters = {n_clusters}, "
-            + f"inference method desparsified lasso, seed = {seed},"
-            + f"groups = {groups is not None} "
+
+    def __init__(
+        self,
+        ward,
+        n_clusters,
+        variable_importance=DesparsifiedLasso(),
+        scaler_sampling=None,
+        train_size=1.0,
+        groups=None,
+        seed=0,
+        n_jobs=1,
+        memory=None,
+        verbose=1,
+    ):
+        self.ward = ward
+        self.n_clusters = n_clusters
+        self.variable_importance = variable_importance
+        self.scaler_sampling = scaler_sampling
+        self.train_size = train_size
+        self.groups = groups
+        self.seed = seed
+        self.n_jobs = n_jobs
+        self.memory = memory
+        self.verbose = verbose
+
+        # to be generalized to all the generated attributes
+        self.pvalues_corr_ = None
+
+    def fit(self, X_init, y):
+        memory = check_memory(memory=self.memory)
+        assert issubclass(
+            self.ward.__class__, FeatureAgglomeration
+        ), "ward needs to be an instance of sklearn.cluster.FeatureAgglomeration"
+
+        n_samples, n_features = X_init.shape
+
+        if self.verbose > 0:
+            print(
+                f"Clustered inference: n_clusters = {self.n_clusters}, "
+                + f"inference method desparsified lasso, seed = {self.seed},"
+                + f"groups = {self.groups is not None} "
+            )
+
+        ## These are the 3 steps of the first loop of algorithm 2 in [1]
+        # sampling rows of X
+        train_index = _subsampling(
+            n_samples, self.train_size, groups=self.groups, seed=self.seed
         )
 
-    ## This are the 3 step in first loop of the algorithm 2 of [1]
-    # sampling row of X
-    train_index = _subsampling(n_samples, train_size, groups=groups, seed=seed)
+        # transformation matrix
+        X_reduced, self.ward = memory.cache(_ward_clustering)(
+            X_init, clone(self.ward), train_index
+        )
 
-    # transformation matrix
-    X_reduced, ward_ = memory.cache(_ward_clustering)(X_init, clone(ward), train_index)
+        # Preprocessing
+        if self.scaler_sampling is not None:
+            self.scaler_sampling = clone(self.scaler_sampling)
+            X_reduced = self.scaler_sampling.fit_transform(X_reduced)
 
-    # Preprocessing
-    if scaler_sampling is not None:
-        X_reduced = clone(scaler_sampling).fit_transform(X_reduced)
+        # inference methods
+        self.variable_importance = memory.cache(self.variable_importance.fit)(
+            X_reduced,
+            y,
+        )
+        return self
 
-    # inference methods
-    if "lasso_cv" in kwargs and kwargs["lasso_cv"] is not None:
-        pass
-    elif len(y.shape) > 1 and y.shape[1] > 1:
-        kwargs["lasso_cv"] = MultiTaskLassoCV(
-            eps=1e-2,
-            fit_intercept=False,
-            cv=KFold(n_splits=5, shuffle=True, random_state=0),
-            tol=1e-4,
-            max_iter=5000,
-            random_state=1,
-            n_jobs=1,
-        )
-    else:
-        kwargs["lasso_cv"] = LassoCV(
-            eps=1e-2,
-            fit_intercept=False,
-            cv=KFold(n_splits=5, shuffle=True, random_state=0),
-            tol=1e-4,
-            max_iter=5000,
-            random_state=1,
-            n_jobs=1,
-        )
-
-    desparsified_lassos = memory.cache(
-        DesparsifiedLasso(
-            n_jobs=n_jobs,
-            memory=memory,
-            verbose=verbose,
-            **kwargs,
-        ).fit,
-        ignore=["n_jobs", "verbose", "memory"],
-    )(
-        X_reduced,
-        y,
-    )
-    desparsified_lassos.importance(X_reduced, y)
-
-    return ward_, desparsified_lassos
-
-
-def clustered_inference_pvalue(n_samples, group, ward, desparsified_lassos, **kwargs):
-    """
-    Compute corrected p-values at the cluster level and transform them
-    back to feature level.
-
-    Parameters
-    ----------
-    n_samples : int
-        Number of samples in the dataset
-    group : bool
-        If True, uses group lasso p-values for multivariate outcomes
-    ward : AgglomerativeClustering
-        Fitted clustering object
-    beta_hat : ndarray
-        Estimated coefficients at cluster level
-    theta_hat : ndarray
-        Estimated precision matrix
-    precision_diag : ndarray
-        Diagonal elements of the covariance matrix
-    **kwargs : dict
-        Additional arguments passed to p-value computation functions
-
-    Returns
-    -------
-    beta_hat : ndarray
-        Degrouped coefficients at feature level
-    pval : ndarray
-        P-values for each feature
-    pval_corr : ndarray
-        Multiple testing corrected p-values
-    one_minus_pval : ndarray
-        1 - p-values for numerical stability
-    one_minus_pval_corr : ndarray
-        1 - corrected p-values
-    """
-    # corrected cluster-wise p-values
-
-    # De-grouping
-    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = _degrouping(
-        ward,
-        desparsified_lassos.importances_,
-        desparsified_lassos.pvalues_,
-        desparsified_lassos.pvalues_corr_,
-        1 - desparsified_lassos.pvalues_,
-        1 - desparsified_lassos.pvalues_corr_,
-    )
-
-    return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr
-
+    def _check_fit(self):
+        """
+        Check if the model has been fit before performing analysis.
+
+        This private method verifies that the components produced by `fit` are
+        available, namely:
+        - the fitted ward clustering
+        - the fitted scaler (when scaler_sampling is used)
+        - the fitted variable importance method
+
+        Raises
+        ------
+        ValueError
+            If any of the required attributes are missing, indicating the model
+            hasn't been fit.
+        """
+        self.variable_importance._check_fit()
+        try:
+            check_is_fitted(self.ward)
+            if self.scaler_sampling is not None:
+                check_is_fitted(self.scaler_sampling)
+        except NotFittedError:
+            raise ValueError(
+                "The ClusteredInference requires to be fit before any analysis"
+            )
+
+    def importance(self, X, y):
+        """
+        Compute feature-level importance scores from the cluster-level inference.
+
+        Runs the variable importance method in the cluster space, then de-groups
+        the resulting statistics and p-values back to the feature level.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data matrix.
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        importances_ : ndarray of shape (n_features,)
+            De-grouped coefficient estimates for each feature.
+
+        Attributes
+        ----------
+        importances_ : same as return value
+        pvalues_ : ndarray of shape (n_features,)
+            Two-sided p-values for each feature under Gaussian null.
+
+        Notes
+        -----
+        For each selected feature j:
+        1. Computes residuals from regressing X_j on other features
+        2. 
Computes residuals from regressing y on other features + 3. Calculates test statistic from correlation of residuals + 4. Computes p-value assuming standard normal distribution + """ + self._check_fit() + X_reduced = self.ward.transform(X) + if self.scaler_sampling is not None: + X_reduced = self.scaler_sampling.transform(X_reduced) + + self.variable_importance.importance(X_reduced, y) + beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = _degrouping( + self.ward, + self.variable_importance.importances_, + self.variable_importance.pvalues_, + self.variable_importance.pvalues_corr_, + 1 - self.variable_importance.pvalues_, + 1 - self.variable_importance.pvalues_corr_, + ) -def ensemble_clustered_inference( - X_init, - y, - ward, - n_clusters, - scaler_sampling=None, - train_size=0.3, - groups=None, - seed=0, - n_bootstraps=25, - n_jobs=None, - verbose=1, - memory=None, - **kwargs, -): + self.importances_ = beta_hat + self.pvalues_ = pval + self.pvalues_corr_ = pval_corr + return self.importances_ + + def fit_importance(self, X, y, cv=None): + """ + Fits the model to the data and computes feature importance. + + A convenience method that combines fit() and importance() into a single call. + First fits the dCRT model to the data, then calculates importance scores. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data matrix. + y : array-like of shape (n_samples,) + Target values. + cv : None or int, optional (default=None) + Not used. Included for compatibility. A warning will be issued if provided. + + Returns + ------- + importance : ndarray of shape (n_features,) + Feature importance scores/test statistics. + For features not selected during screening, scores are set to 0. + + Notes + ----- + Also sets the importances\_ and pvalues\_ attributes on the instance. + See fit() and importance() for details on the underlying computations. + """ + if cv is not None: + warnings.warn("cv won't be used") + self.fit(X, y) + return self.importance(X, y) + + +class EnsembleClusteredInference(BaseVariableImportance): """ Ensemble clustered inference algorithm for high-dimensional statistical inference, as described in :cite:`chevalier2022spatially`. @@ -467,152 +583,183 @@ def ensemble_clustered_inference( ---------- .. footbibliography:: """ - memory = check_memory(memory=memory) - assert issubclass( - ward.__class__, FeatureAgglomeration - ), "ward need to an instance of sklearn.cluster.FeatureAgglomeration" - - # Clustered inference algorithms - results = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(clustered_inference)( - X_init, - y, - clone(ward), - n_clusters, - scaler_sampling=scaler_sampling, - train_size=train_size, - groups=groups, - seed=i, - n_jobs=1, - verbose=verbose, - memory=memory, - **kwargs, - ) - for i in np.arange(seed, seed + n_bootstraps) - ) - list_ward, list_desparsified_lassos = [], [] - for ward, desparsified_lassos in results: - list_ward.append(ward) - list_desparsified_lassos.append(desparsified_lassos) - return list_ward, list_desparsified_lassos - - -def ensemble_clustered_inference_pvalue( - n_samples, - group, - list_ward, - list_desparsified_lassos, - fdr=0.1, - fdr_control="bhq", - reshaping_function=None, - adaptive_aggregation=False, - gamma=0.5, - n_jobs=None, - verbose=0, - **kwargs, -): - """ - Compute and aggregate p-values across multiple bootstrap iterations - using an aggregation method. 
- - This function performs statistical inference on each bootstrap sample - and combines the results using a specified aggregation method to obtain - robust estimates. - The implementation follows the methodology in :footcite:`chevalier2022spatially`. - Parameters - ---------- - n_samples : int - Number of samples in the dataset - group : bool - If True, uses group lasso p-values for multivariate outcomes - list_ward : list of AgglomerativeClustering - List of fitted clustering objects from bootstraps - list_beta_hat : list of ndarray - List of estimated coefficients at cluster level from each bootstrap - list_theta_hat : list of ndarray - List of estimated precision matrices from each bootstrap - list_precision_diag : list of ndarray - List of diagonal elements of covariance matrices from each bootstrap - fdr : float, default=0.1 - False discovery rate threshold for multiple testing correction - fdr_control : str, default="bhq" - Method for FDR control ('bhq' for Benjamini-Hochberg) - Available methods are: - * 'bhq': Standard Benjamini-Hochberg :footcite:`benjamini1995controlling,bhy_2001` - * 'bhy': Benjamini-Hochberg-Yekutieli :footcite:p:`bhy_2001` - * 'ebh': e-Benjamini-Hochberg :footcite:`wang2022false` - reshaping_function : callable, optional (default=None) - Function to reshape data before FDR control - adaptive_aggregation : bool, default=False - Whether to use adaptive quantile aggregation - gamma : float, default=0.5 - Quantile level for aggregation - n_jobs : int or None, optional (default=None) - Number of parallel jobs. None means using all processors. - verbose : int, default=0 - Verbosity level for computation progress - **kwargs : dict - Additional arguments passed to p-value computation functions - - Returns - ------- - beta_hat : ndarray, shape (n_features,) or (n_features, n_times) - Averaged coefficients across bootstraps - selected : ndarray, shape (n_features,) - Selected features: 1 for positive effects, -1 for negative effects, - 0 for non-selected features - - References - ---------- - .. footbibliography:: - """ - results = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(clustered_inference_pvalue)( - n_samples, - group, - list_ward[i], - list_desparsified_lassos[i], - **kwargs, + def __init__( + self, + variable_importance, + n_bootstraps=25, + n_jobs=None, + verbose=1, + memory=None, + random_state=None, + ): + self.variable_importance = variable_importance + self.n_bootstraps = n_bootstraps + self.n_jobs = n_jobs + self.verbose = verbose + self.memory = memory + self.random_state = random_state + + self.list_variable_importances_ = None + + def fit(self, X, y): + """ + Fit the dCRT model. + + This method fits the Distilled Conditional Randomization Test (DCRT) model + as described in :footcite:t:`liu2022fast`. It performs optional feature + screening using Lasso, computes coefficients, and prepares the model for + importance and p-value computation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data matrix. + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : object + Returns the fitted instance. + + Notes + ----- + Main steps: + 1. Optional data centering with StandardScaler + 2. Lasso screening of variables (if no estimated coefficients provided) + 3. Feature selection based on coefficient magnitudes + 4. Model refitting on selected features (if refit=True) + 5. 
Fit model for future distillation + + The screening threshold controls which features are kept based on their + Lasso coefficients. Features with coefficients below the threshold are + set to zero. + + References + ---------- + .. footbibliography:: + """ + rng = check_random_state(self.random_state) + seed = rng.randint(1) + + def run_fit(variable_importance, X, y, random_state): + return variable_importance(random_state=random_state, n_jobs=1).fit(X, y) + + self.list_variable_importances_ = Parallel( + n_jobs=self.n_jobs, verbose=self.verbose + )( + delayed(run_fit)(clone(self.variable_importance), X, y, i) + for i in np.arange(seed, seed + self.n_bootstraps) + ) + return self + + def _check_fit(self): + """ + Check if the model has been fit before performing analysis. + + This private method verifies that all necessary attributes have been set + during the fitting process. + These attributes include: + - clf_x_ + - clf_y_ + - coefficient_ + - non_selection_ + + Raises + ------ + ValueError + If any of the required attributes are missing, indicating the model + hasn't been fit. + """ + if self.list_variable_importances_ is None: + raise ValueError("The D0CRT requires to be fit before any analysis") + + def importance(self, X, y): + """ + Compute feature importance scores using distilled CRT. + + Calculates test statistics and p-values for each feature using residual + correlations after the distillation process. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data matrix. + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + importances_ : ndarray of shape (n_features,) + Test statistics/importance scores for each feature. For unselected features, + the score is set to 0. + + Attributes + ---------- + importances_ : same as return value + pvalues_ : ndarray of shape (n_features,) + Two-sided p-values for each feature under Gaussian null. + + Notes + ----- + For each selected feature j: + 1. Computes residuals from regressing X_j on other features + 2. Computes residuals from regressing y on other features + 3. Calculates test statistic from correlation of residuals + 4. 
Computes p-value assuming standard normal distribution + """ + self._check_fit() + + def run_importance(variable_importance, X, y): + variable_importance.importance(X, y) + return None + + parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose) + parallel( + delayed(run_importance)(variable_importance, X, y) + for variable_importance in self.list_variable_importances_ ) - for i in range(len(list_ward)) - ) - # Collecting results - list_beta_hat = [] - list_pval, list_pval_corr = [], [] - list_one_minus_pval, list_one_minus_pval_corr = [], [] - for beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr in results: - list_beta_hat.append(beta_hat) - list_pval.append(pval) - list_pval_corr.append(pval_corr) - list_one_minus_pval.append(one_minus_pval) - list_one_minus_pval_corr.append(one_minus_pval_corr) - - # Ensembling - beta_hat = np.mean(list_beta_hat, axis=0) - # pvalue selection - aggregated_pval = quantile_aggregation( - np.array(list_pval), gamma=gamma, adaptive=adaptive_aggregation - ) - threshold_pval = fdr_threshold( - aggregated_pval, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - # 1-pvalue selection - aggregated_one_minus_pval = quantile_aggregation( - np.array(list_one_minus_pval), gamma=gamma, adaptive=adaptive_aggregation - ) - threshold_one_minus_pval = fdr_threshold( - aggregated_one_minus_pval, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - # group seelction - selected = np.zeros_like(beta_hat) - selected[np.where(aggregated_pval <= threshold_pval)] = 1 - selected[np.where(aggregated_one_minus_pval <= threshold_one_minus_pval)] = -1 - return beta_hat, selected + # Ensembling + # TODO check if selection_FDR is good + self.importances_ = np.mean( + [vi.importances_ for vi in self.list_variable_importances_], axis=0 + ) + # pvalue selection + self.pvalues_ = np.array( + [vi.pvalues_ for vi in self.list_variable_importances_] + ) + return self.importances_ + + def fit_importance(self, X, y, cv=None): + """ + Fits the model to the data and computes feature importance. + + A convenience method that combines fit() and importance() into a single call. + First fits the dCRT model to the data, then calculates importance scores. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data matrix. + y : array-like of shape (n_samples,) + Target values. + cv : None or int, optional (default=None) + Not used. Included for compatibility. A warning will be issued if provided. + + Returns + ------- + importance : ndarray of shape (n_features,) + Feature importance scores/test statistics. + For features not selected during screening, scores are set to 0. + + Notes + ----- + Also sets the importances\_ and pvalues\_ attributes on the instance. + See fit() and importance() for details on the underlying computations. 
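The bootstrap fan-out used by fit() and importance() above is a standard joblib pattern: clone the estimator, fit each clone on its own subsample, collect the fitted clones. A self-contained sketch with a stand-in estimator (fit_one is a hypothetical helper, not part of hidimstat):

import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import Lasso

def fit_one(estimator, X, y, seed):
    # each worker fits an independent clone on a row subsample
    rng = np.random.RandomState(seed)
    rows = rng.choice(len(X), size=int(0.7 * len(X)), replace=False)
    return clone(estimator).fit(X[rows], y[rows])

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 20))
y = X[:, 0] + 0.1 * rng.normal(size=100)
fitted = Parallel(n_jobs=2)(
    delayed(fit_one)(Lasso(alpha=0.1), X, y, seed) for seed in range(5)
)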
+ """ + if cv is not None: + warnings.warn("cv won't be used") + self.fit(X, y) + return self.importance(X, y) diff --git a/test/test_ensemble_clustered_inference.py b/test/test_ensemble_clustered_inference.py index 76e33bb7c..b894308dd 100644 --- a/test/test_ensemble_clustered_inference.py +++ b/test/test_ensemble_clustered_inference.py @@ -7,18 +7,28 @@ from sklearn.cluster import FeatureAgglomeration from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction import image +from sklearn.linear_model import MultiTaskLassoCV +from sklearn.model_selection import KFold -from hidimstat.ensemble_clustered_inference import ( - clustered_inference, - clustered_inference_pvalue, -) -from hidimstat.ensemble_clustered_inference import ( - ensemble_clustered_inference, - ensemble_clustered_inference_pvalue, -) +from hidimstat.ensemble_clustered_inference import ClusteredInference +from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference +from hidimstat.desparsified_lasso import DesparsifiedLasso from hidimstat._utils.scenario import multivariate_simulation +def set_desparsified_lasso_multi_time(): + multitasklassoCV = MultiTaskLassoCV( + eps=1e-2, + fit_intercept=False, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-4, + max_iter=5000, + random_state=1, + n_jobs=1, + ) + return DesparsifiedLasso(lasso_cv=multitasklassoCV) + + # Scenario 1: data with no temporal dimension def test_clustered_inference_no_temporal(): """ @@ -58,20 +68,22 @@ def test_clustered_inference_no_temporal(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - ward_, desparsified_lassos = clustered_inference( - X_init, y, ward, n_clusters, scaler_sampling=StandardScaler() - ) - - beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - clustered_inference_pvalue(n_samples, None, ward_, desparsified_lassos) - ) + clustered_inference = ClusteredInference( + ward, n_clusters, scaler_sampling=StandardScaler() + ).fit(X_init, y) + clustered_inference.importance(X_init, y) expected = 0.5 * np.ones(n_features) expected[:support_size] = 0.0 - assert_almost_equal(pval_corr[:interior_support], expected[:interior_support]) assert_almost_equal( - pval_corr[extended_support:200], expected[extended_support:200], decimal=1 + clustered_inference.pvalues_corr_[:interior_support], + expected[:interior_support], + ) + assert_almost_equal( + clustered_inference.pvalues_corr_[extended_support:200], + expected[extended_support:200], + decimal=1, ) @@ -113,22 +125,26 @@ def test_clustered_inference_temporal(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - ward_, desparsified_lassos = clustered_inference( - X, y, ward, n_clusters, scaler_sampling=StandardScaler() - ) - - beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - clustered_inference_pvalue(n_samples, True, ward_, desparsified_lassos) - ) + clustered_inference = ClusteredInference( + ward, + n_clusters, + variable_importance=set_desparsified_lasso_multi_time(), + scaler_sampling=StandardScaler(), + ).fit(X, y) + clustered_inference.importance(X, y) expected = 0.5 * np.ones(n_features) expected[:support_size] = 0.0 assert_almost_equal( - pval_corr[:interior_support], expected[:interior_support], decimal=3 + clustered_inference.pvalues_corr_[:interior_support], + expected[:interior_support], + decimal=3, ) assert_almost_equal( - pval_corr[extended_support:], expected[extended_support:], decimal=1 + clustered_inference.pvalues_corr_[extended_support:], + 
expected[extended_support:], + decimal=1, ) @@ -182,22 +198,25 @@ def test_clustered_inference_no_temporal_groups(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - ward_, desparsified_lassos = clustered_inference( - X_, y_, ward, n_clusters, groups=groups, scaler_sampling=StandardScaler() - ) - - beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - clustered_inference_pvalue( - n_groups * n_samples, False, ward_, desparsified_lassos - ) - ) + clustered_inference = ClusteredInference( + ward, + n_clusters, + scaler_sampling=StandardScaler(), + groups=groups, + ).fit(X_, y_) + clustered_inference.importance(X_, y_) expected = 0.5 * np.ones(n_features) expected[:support_size] = 0.0 - assert_almost_equal(pval_corr[:interior_support], expected[:interior_support]) assert_almost_equal( - pval_corr[extended_support:200], expected[extended_support:200], decimal=1 + clustered_inference.pvalues_corr_[:interior_support], + expected[:interior_support], + ) + assert_almost_equal( + clustered_inference.pvalues_corr_[extended_support:200], + expected[extended_support:200], + decimal=1, ) @@ -239,17 +258,14 @@ def test_ensemble_clustered_inference(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - list_ward, list_desparsified_lassos = ensemble_clustered_inference( - X_init, - y, - ward, - n_clusters, - scaler_sampling=StandardScaler(), - n_bootstraps=n_bootstraps, - ) - beta_hat, selected = ensemble_clustered_inference_pvalue( - n_samples, False, list_ward, list_desparsified_lassos + clustered_inference = ClusteredInference( + ward, n_clusters, scaler_sampling=StandardScaler() ) + EnCluDl = EnsembleClusteredInference( + variable_importance=clustered_inference, n_bootstraps=n_bootstraps + ).fit(X_init, y) + EnCluDl.importance(X_init, y) + selected = EnCluDl.selection_fdr(fdr=0.1) expected = np.zeros(n_features) expected[:support_size] = 1.0 @@ -297,21 +313,17 @@ def test_ensemble_clustered_inference_temporal_data(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - list_ward, list_desparsified_lassos = ensemble_clustered_inference( - X, - y, + clustered_inference = ClusteredInference( ward, n_clusters, + variable_importance=set_desparsified_lasso_multi_time(), scaler_sampling=StandardScaler(), - n_bootstraps=n_bootstraps, - ) - beta_hat, selected = ensemble_clustered_inference_pvalue( - n_samples, - True, - list_ward, - list_desparsified_lassos, - fdr_control="bhq", ) + EnCluDl = EnsembleClusteredInference( + variable_importance=clustered_inference, n_bootstraps=n_bootstraps + ).fit(X, y) + EnCluDl.importance(X, y) + selected = EnCluDl.selection_fdr(fdr=0.1, fdr_control="bhq") expected = np.zeros(n_features) expected[:support_size] = 1.0 @@ -324,13 +336,7 @@ def test_ensemble_clustered_inference_temporal_data(): ) # different aggregation method - beta_hat, selected = ensemble_clustered_inference_pvalue( - n_samples, - True, - list_ward, - list_desparsified_lassos, - fdr_control="bhy", - ) + selected = EnCluDl.selection_fdr(fdr=0.1, fdr_control="bhy") expected = np.zeros(n_features) expected[:support_size] = 1.0 From 37b9fa55e54d6db603c40cd9dcf9034ba1759326 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 17:52:43 +0200 Subject: [PATCH 15/93] fix merge --- src/hidimstat/ensemble_clustered_inference.py | 76 ------------------- 1 file changed, 76 deletions(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 0498cffe6..4bdf4aede 100644 --- 
a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -286,85 +286,9 @@ def fit(self, X_init, y): ) return self -<<<<<<< HEAD - # inference methods - if hasattr(kwargs, "lasso_cv") and kwargs["lasso_cv"] is not None: - pass - elif len(y.shape) > 1 and y.shape[1] > 1: - kwargs["lasso_cv"] = MultiTaskLassoCV( - eps=1e-2, - fit_intercept=False, - cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-4, - max_iter=5000, - random_state=1, - n_jobs=1, - ) - else: - kwargs["lasso_cv"] = LassoCV( - eps=1e-2, - fit_intercept=False, - cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-4, - max_iter=5000, - random_state=1, - n_jobs=1, - ) - - desparsified_lassos = memory.cache( - DesparsifiedLasso( - n_jobs=n_jobs, - memory=memory, - verbose=verbose, - **kwargs, - ).fit, - ignore=["n_jobs", "verbose", "memory"], - )( - X_reduced, - y, - ) - desparsified_lassos.importance(X_reduced, y) -||||||| parent of ec9ff4e (Add Encldel and Cluster) - # inference methods - multitasklassoCV = MultiTaskLassoCV( - eps=1e-2, - fit_intercept=False, - cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-4, - max_iter=5000, - random_state=1, - n_jobs=1, - ) - lasso_cv = LassoCV( - eps=1e-2, - fit_intercept=False, - cv=KFold(n_splits=5, shuffle=True, random_state=0), - tol=1e-4, - max_iter=5000, - random_state=1, - n_jobs=1, - ) - desparsified_lassos = memory.cache( - DesparsifiedLasso( - lasso_cv=( - multitasklassoCV if len(y.shape) > 1 and y.shape[1] > 1 else lasso_cv - ), - n_jobs=n_jobs, - memory=memory, - verbose=verbose, - **kwargs, - ).fit, - ignore=["n_jobs", "verbose", "memory"], - )( - X_reduced, - y, - ) - desparsified_lassos.importance(X_reduced, y) -======= def _check_fit(self): """ Check if the model has been fit before performing analysis. ->>>>>>> ec9ff4e (Add Encldel and Cluster) This private method verifies that all necessary attributes have been set during the fitting process. From e4d1c445df0be0423142af0c586ec1cf997641ee Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 17:56:10 +0200 Subject: [PATCH 16/93] update docstring --- src/hidimstat/desparsified_lasso.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index e31d14c68..d09b4ddb0 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -460,7 +460,8 @@ def importance(self, X, y): return self.importances_ def fit_importance(self, X, y, cv=None): - """Fit and compute variable importance in one step. + """ + Fit and compute variable importance in one step. 
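The conflict block dropped above encoded the default-solver rule used throughout the series: a MultiTaskLassoCV when y has several columns, a LassoCV otherwise. A standalone sketch of that rule (the helper name is illustrative):

from sklearn.linear_model import LassoCV, MultiTaskLassoCV
from sklearn.model_selection import KFold

def default_lasso_cv(y):
    # multi-task solver for multi-output y, plain LassoCV otherwise
    cls = MultiTaskLassoCV if (y.ndim > 1 and y.shape[1] > 1) else LassoCV
    return cls(
        eps=1e-2,
        fit_intercept=False,
        cv=KFold(n_splits=5, shuffle=True, random_state=0),
        tol=1e-4,
        max_iter=5000,
        random_state=1,
        n_jobs=1,
    )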
Parameters ---------- From 33973a53195ad42078534ca741e2273c0cc20345 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 19:34:42 +0200 Subject: [PATCH 17/93] merge cluster and EnCLuDL --- src/hidimstat/ensemble_clustered_inference.py | 654 +++++------------- 1 file changed, 183 insertions(+), 471 deletions(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 4bdf4aede..87b538c2c 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -3,9 +3,9 @@ import numpy as np from sklearn.base import clone from joblib import Parallel, delayed -from sklearn.utils.validation import check_memory from sklearn.exceptions import NotFittedError from sklearn.cluster import FeatureAgglomeration +from sklearn.preprocessing import StandardScaler from sklearn.base import check_is_fitted from sklearn.utils import check_random_state @@ -14,143 +14,16 @@ from hidimstat._utils.bootstrap import _subsampling -def _ungroup_beta(beta_hat, n_features, ward): - """ - Ungroup cluster-level beta coefficients to individual feature-level - coefficients. - - Parameters - ---------- - beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_times) - Beta coefficients at cluster level - n_features : int - Number of features in original space - ward : sklearn.cluster.FeatureAgglomeration - Fitted clustering object - - Returns - ------- - beta_hat_degrouped : ndarray, shape (n_features,) or (n_features, n_times) - Rescaled beta coefficients for individual features, weighted by - inverse cluster size - - Notes - ----- - Each coefficient is scaled by 1/cluster_size to maintain proper magnitude - when distributing cluster effects to individual features. - Handles both univariate (1D) and multivariate (2D) beta coefficients. - """ - labels = ward.labels_ - # compute the size of each cluster - clusters_size = np.zeros(labels.size) - for label in range(labels.max() + 1): - cluster_size = np.sum(labels == label) - clusters_size[labels == label] = cluster_size - # degroup beta_hat - if len(beta_hat.shape) == 1: - # weighting the weight of beta with the size of the cluster - beta_hat_degrouped = ward.inverse_transform(beta_hat) / clusters_size - elif len(beta_hat.shape) == 2: - n_times = beta_hat.shape[1] - beta_hat_degrouped = np.zeros((n_features, n_times)) - for i in range(n_times): - beta_hat_degrouped[:, i] = ( - ward.inverse_transform(beta_hat[:, i]) / clusters_size - ) - return beta_hat_degrouped - - -def _degrouping(ward, beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr): - """ - Degroup and rescale cluster-level statistics to individual features. - This function takes cluster-level statistics and assigns them back - to individual features, while appropriately rescaling the parameter - estimates based on cluster sizes. 
- - Parameters - ---------- - ward : sklearn.cluster.FeatureAgglomeration - Fitted clustering object containing the hierarchical structure - beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_times) - Estimated parameters at cluster level - pval : ndarray, shape (n_clusters,) - P-values at cluster level - pval_corr : ndarray, shape (n_clusters,) - Corrected p-values at cluster level - one_minus_pval : ndarray, shape (n_clusters,) - 1 - p-values at cluster level - one_minus_pval_corr : ndarray, shape (n_clusters,) - 1 - corrected p-values at cluster level - - Returns - ------- - beta_hat : ndarray, shape (n_features,) or (n_features, n_times) - Rescaled parameter estimates for individual features - pval : ndarray, shape (n_features,) - P-values for individual features - pval_corr : ndarray, shape (n_features,) - Corrected p-values for individual features - one_minus_pval : ndarray, shape (n_features,) - 1 - p-values for individual features - one_minus_pval_corr : ndarray, shape (n_features,) - 1 - corrected p-values for individual features - - Notes - ----- - The beta_hat values are rescaled by dividing by the cluster size - to maintain the proper scale of the estimates when moving from - cluster-level to feature-level. - The function handles both 1D and 2D beta_hat arrays for single and - multiple time points. - """ - # degroup variable other than beta_hat - pval, pval_corr, one_minus_pval, one_minus_pval_corr = map( - ward.inverse_transform, - [pval, pval_corr, one_minus_pval, one_minus_pval_corr], - ) - - beta_hat = _ungroup_beta(beta_hat, n_features=pval.shape[0], ward=ward) - - return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr - - -def _ward_clustering(X_init, ward, train_index): - """ - Performs Ward clustering on data using a training subset. - - This function applies Ward hierarchical clustering to a dataset, where the clustering - is computed based on a subset of samples but then applied to the full dataset. - - Parameters - ---------- - X_init : numpy.ndarray - Initial data matrix of shape (n_samples, n_features) to be clustered - ward : sklearn.cluster.FeatureAgglomeration - Ward clustering estimator instance - train_index : array-like - Indices of samples to use for computing the clustering - - Returns - ------- - tuple - - X_reduced : numpy.ndarray - Transformed data matrix after applying Ward clustering - - ward : sklearn.cluster.FeatureAgglomeration - Fitted Ward clustering estimator - """ - ward = ward.fit(X_init[train_index, :]) - X_reduced = ward.transform(X_init) - return X_reduced, ward - - -class ClusteredInference(BaseVariableImportance): +class EnsembleClusteredInference(BaseVariableImportance): """ - Clustered inference algorithm for statistical analysis of - high-dimensional data. + Ensemble clustered inference algorithm for high-dimensional + statistical inference. This algorithm implements the method described in :cite:`chevalier2022spatially` for - performing statistical inference on high-dimensional linear models - using feature clustering to reduce dimensionality. + + This algorithm combines multiple runs of clustered inference with + different random subsamples to provide more robust statistical estimates. + It uses the desparsified lasso method for inference. 
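A usage sketch of the merged estimator, mirroring how the updated tests later in the series exercise it (synthetic data; assumes the post-merge API of this patch):

import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.preprocessing import StandardScaler
from hidimstat import EnsembleClusteredInference

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 200))
y = X[:, :5].sum(axis=1) + rng.normal(size=60)

encludl = EnsembleClusteredInference(
    ward=FeatureAgglomeration(n_clusters=20),
    scaler_sampling=StandardScaler(),
    n_bootstraps=5,
).fit(X, y)
encludl.importance(X, y)
selected = encludl.selection_fdr(fdr=0.1)  # FDR-controlled feature selection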
Parameters ---------- @@ -223,66 +96,63 @@ class ClusteredInference(BaseVariableImportance): def __init__( self, - ward, - n_clusters, variable_importance=DesparsifiedLasso(), + ward=None, + n_bootstraps=25, scaler_sampling=None, train_size=1.0, groups=None, - seed=0, + random_state=None, n_jobs=1, - memory=None, verbose=1, ): - self.ward = ward - self.n_clusters = n_clusters + assert issubclass(DesparsifiedLasso, variable_importance) self.variable_importance = variable_importance + assert ward is None or issubclass( + FeatureAgglomeration, ward.__class__ + ), "Ward should a FeatureAgglomeration" + self.ward = ward + assert scaler_sampling is None or issubclass( + StandardScaler, scaler_sampling.__class__ + ) self.scaler_sampling = scaler_sampling + self.n_bootstraps = n_bootstraps self.train_size = train_size self.groups = groups - self.seed = seed + self.random_state = random_state self.n_jobs = n_jobs - self.memory = memory self.verbose = verbose # generalize to all the feature generated + self.list_ward_scaler_vi_ = None + self.list_importances_ = None + self.list_pvalues_ = None + self.list_pvalues_corr_ = None self.pvalues_corr_ = None - def fit(self, X_init, y): - memory = check_memory(memory=self.memory) - assert issubclass( - self.ward.__class__, FeatureAgglomeration - ), "ward need to an instance of sklearn.cluster.FeatureAgglomeration" - - n_samples, n_features = X_init.shape + def fit(self, X, y): + rng = check_random_state(self.random_state) + seed = rng.randint(1) if self.verbose > 0: print( - f"Clustered inference: n_clusters = {self.n_clusters}, " - + f"inference method desparsified lasso, seed = {self.seed}," + f"Clustered inference: n_clusters = {self.ward.n_clusters}, " + + f"inference method desparsified lasso, seed = {self.random_state}," + f"groups = {self.groups is not None} " ) - - ## This are the 3 step in first loop of the algorithm 2 of [1] - # sampling row of X - train_index = _subsampling( - n_samples, self.train_size, groups=self.groups, seed=self.seed - ) - - # transformation matrix - X_reduced, self.ward = memory.cache(_ward_clustering)( - X_init, clone(self.ward), train_index - ) - - # Preprocessing - if self.scaler_sampling is not None: - self.scaler_sampling = clone(self.scaler_sampling) - X_reduced = self.scaler_sampling.fit_transform(X_reduced) - - # inference methods - self.variable_importance = memory.cache(self.variable_importance.fit)( - X_reduced, - y, + parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose) + self.list_ward_scaler_vi_ = parallel( + delayed(_bootstrap_run_fit)( + X, + y, + self.train_size, + self.groups, + i, + self.ward, + self.scaler_sampling, + self.variable_importance, + ) + for i in np.arange(seed, seed + self.n_bootstraps) ) return self @@ -304,15 +174,24 @@ def _check_fit(self): If any of the required attributes are missing, indicating the model hasn't been fit. 
""" - self.variable_importance._check_fit() - try: - check_is_fitted(self.ward) - if self.scaler_sampling is not None: - check_is_fitted(self.scaler_sampling) - except NotFittedError: - raise ValueError( - "The ClusteredInference requires to be fit before any analysis" - ) + if self.list_ward_scaler_vi_ is None: + raise ValueError("The requires to be fit before any analysis") + for ward, scaler, vi in self.list_ward_scaler_vi_: + if ward is not None: + try: + check_is_fitted(self.ward) + except NotFittedError: + raise ValueError( + "The ClusteredInference requires to be fit before any analysis" + ) + if scaler is not None: + try: + check_is_fitted(self.scaler_sampling) + except NotFittedError: + raise ValueError( + "The ClusteredInference requires to be fit before any analysis" + ) + vi._check_fit() def importance(self, X, y): """ @@ -349,23 +228,30 @@ def importance(self, X, y): 4. Computes p-value assuming standard normal distribution """ self._check_fit() - X_reduced = self.ward.transform(X) - if self.scaler_sampling is not None: - X_reduced = self.scaler_sampling.transform(X_reduced) - - self.variable_importance.importance(X_reduced, y) - beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = _degrouping( - self.ward, - self.variable_importance.importances_, - self.variable_importance.pvalues_, - self.variable_importance.pvalues_corr_, - 1 - self.variable_importance.pvalues_, - 1 - self.variable_importance.pvalues_corr_, + + parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose) + results = parallel( + delayed(_bootstrap_run_importance)( + ward, + scaler, + vi, + X, + y, + ) + for ward, scaler, vi in self.list_ward_scaler_vi_ ) - self.importances_ = beta_hat - self.pvalues_ = pval - self.pvalues_corr_ = pval_corr + self.list_importances_ = [] + self.list_pvalues_ = [] + self.list_pvalues_corr_ = [] + for importances, pvalues, pvalues_corr in results: + self.list_importances_.append(importances) + self.list_pvalues_.append(pvalues) + self.list_pvalues_corr_.append(pvalues_corr) + + self.importances_ = np.mean(self.list_importances_) + self.pvalues_ = np.mean(self.list_pvalues_) + self.pvalues_corr_ = np.mean(self.list_pvalues_corr_) return self.importances_ def fit_importance(self, X, y, cv=None): @@ -401,289 +287,115 @@ def fit_importance(self, X, y, cv=None): return self.importance(X, y) -class EnsembleClusteredInference(BaseVariableImportance): - """ - Ensemble clustered inference algorithm for high-dimensional - statistical inference, as described in :cite:`chevalier2022spatially`. 
+def _bootstrap_run_fit( + X_init, + y, + train_size, + groups, + seed, + ward, + scaler_sampling, + variable_importance, +): + + n_samples, n_features = X_init.shape + + ## This are the 3 step in first loop of the algorithm 2 of `chevalier2022spatially` + # sampling row of X + train_index = _subsampling(n_samples, train_size, groups=groups, seed=seed) + + X = X_init[train_index, :] + # transformation matrix + if ward is not None: + ward_ = clone(ward) + X_reduced, ward = ward_.fit_transform(X) + else: + X_reduced = X_init + ward_ = None + + # Preprocessing + if scaler_sampling is not None: + scaler_sampling_ = clone(scaler_sampling) + X_reduced = scaler_sampling_.fit_transform(X_reduced) + else: + scaler_sampling_ = None + + # inference methods + variable_importance_ = clone(variable_importance).fit(X_reduced, y) + + return ward_, scaler_sampling_, variable_importance_ + + +def _bootstrap_run_importance(ward_, scaler_sampling_, variable_importance_, X, y): + # apply reduction + if ward_ is not None: + X_ = ward_.transform(X) + else: + X_ = X + + # apply Preprocessing + if scaler_sampling_ is not None: + X_ = scaler_sampling_.transform(X) + else: + X_ = X + + variable_importance_.importance(X_, y) + + if ward_ is not None: + pvalue = ward_.inverse_transform(variable_importance_.pvalue_) + pvalue_corr = ward_.inverse_transform(variable_importance_.pvalue_corr) + importance = _ungroup_beta( + variable_importance_.importances_, n_features=pvalue.shape[0], ward=ward_ + ) + else: + pvalue = variable_importance_.pvalue_ + pvalue_corr = variable_importance_.pvalue_corr_ + importance = variable_importance_.importances_ - This algorithm combines multiple runs of clustered inference with - different random subsamples to provide more robust statistical estimates. - It uses the desparsified lasso method for inference. + return importance, pvalue, pvalue_corr - Parameters - ---------- - X_init : ndarray, shape (n_samples, n_features) - Original high-dimensional input data matrix. - y : ndarray, shape (n_samples,) or (n_samples, n_times) - Target variable(s). Can be univariate or multivariate (temporal) data. +def _ungroup_beta(beta_hat, n_features, ward): + """ + Ungroup cluster-level beta coefficients to individual feature-level + coefficients. + Parameters + ---------- + beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_times) + Beta coefficients at cluster level + n_features : int + Number of features in original space ward : sklearn.cluster.FeatureAgglomeration - Feature agglomeration object implementing Ward hierarchical clustering. - - n_clusters : int - Number of clusters for dimensionality reduction. - - scaler_sampling : sklearn.preprocessing object, optional (default=None) - Scaler to standardize the clustered features. - - train_size : float, optional (default=0.3) - Fraction of samples used for clustering. Using train_size < 1 enables - random subsampling for better generalization. - - groups : ndarray, shape (n_samples,), optional (default=None) - Sample group labels for stratified subsampling. Ensures balanced - representation of groups in subsamples. - - inference_method : str, optional (default='desparsified-lasso') - Method used for inference. - Currently, the two available methods are 'desparsified-lasso' - and 'group-desparsified-lasso'. Use 'desparsified-lasso' for - non-temporal data and 'group-desparsified-lasso' for temporal data. - - seed: int, optional (default=0) - Seed used for generating the first random subsample of the data. 
- This seed controls the clustering randomness. - - ensembling_method : str, optional (default='quantiles') - Method used for ensembling. Currently, the two available methods - are 'quantiles' and 'median'. - - gamma_min : float, optional (default=0.2) - Lowest gamma-quantile considered to compute the adaptive - quantile aggregation formula. This parameter is used only if - `ensembling_method` is 'quantiles'. - - n_bootstraps : int, optional (default=25) - Number of bootstrap iterations for ensemble inference. - - n_jobs : int or None, optional (default=None) - Number of parallel jobs. None means using all processors. - - verbose: int, optional (default=1) - The verbosity level. If `verbose > 0`, a message is printed before - running the clustered inference. - - memory : joblib.Memory or str, optional (default=None) - Used to cache the output of the clustering and inference computation. - By default, no caching is done. If provided, it should be the path - to the caching directory or a joblib.Memory object. - - **kwargs : dict - Additional keyword arguments passed to statistical inference functions. + Fitted clustering object Returns ------- - list_ward : list of FeatureAgglomeration - List of fitted clustering objects from each bootstrap. - - list_beta_hat : list of ndarray - List of estimated coefficients from each bootstrap. - - pval : ndarray, shape (n_features,) - p-value, with numerically accurate values for - positive effects (i.e., for p-values close to zero). - - list_theta_hat : list of ndarray - List of estimated precision matrices. - - list_precision_diag : list of ndarray - List of diagonal elements of covariance matrices. - - one_minus_pval : ndarray, shape (n_features,) - One minus the p-value, with numerically accurate values - for negative effects (i.e., for p-values close to one). + beta_hat_degrouped : ndarray, shape (n_features,) or (n_features, n_times) + Rescaled beta coefficients for individual features, weighted by + inverse cluster size Notes ----- - The algorithm performs these steps for each bootstrap iteration: - 1. Subsample the data using stratified sampling if groups are provided - 2. Cluster features using Ward's hierarchical clustering - 3. Transform data to reduced cluster space - 4. Perform statistical inference using desparsified lasso - 5. Aggregate results across all iterations - - References - ---------- - .. footbibliography:: + Each coefficient is scaled by 1/cluster_size to maintain proper magnitude + when distributing cluster effects to individual features. + Handles both univariate (1D) and multivariate (2D) beta coefficients. """ - - def __init__( - self, - variable_importance, - n_bootstraps=25, - n_jobs=None, - verbose=1, - memory=None, - random_state=None, - ): - self.variable_importance = variable_importance - self.n_bootstraps = n_bootstraps - self.n_jobs = n_jobs - self.verbose = verbose - self.memory = memory - self.random_state = random_state - - self.list_variable_importances_ = None - - def fit(self, X, y): - """ - Fit the dCRT model. - - This method fits the Distilled Conditional Randomization Test (DCRT) model - as described in :footcite:t:`liu2022fast`. It performs optional feature - screening using Lasso, computes coefficients, and prepares the model for - importance and p-value computation. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Training data matrix. - y : array-like of shape (n_samples,) - Target values. - - Returns - ------- - self : object - Returns the fitted instance. 
- - Notes - ----- - Main steps: - 1. Optional data centering with StandardScaler - 2. Lasso screening of variables (if no estimated coefficients provided) - 3. Feature selection based on coefficient magnitudes - 4. Model refitting on selected features (if refit=True) - 5. Fit model for future distillation - - The screening threshold controls which features are kept based on their - Lasso coefficients. Features with coefficients below the threshold are - set to zero. - - References - ---------- - .. footbibliography:: - """ - rng = check_random_state(self.random_state) - seed = rng.randint(1) - - def run_fit(variable_importance, X, y, random_state): - return variable_importance(random_state=random_state, n_jobs=1).fit(X, y) - - self.list_variable_importances_ = Parallel( - n_jobs=self.n_jobs, verbose=self.verbose - )( - delayed(run_fit)(clone(self.variable_importance), X, y, i) - for i in np.arange(seed, seed + self.n_bootstraps) - ) - return self - - def _check_fit(self): - """ - Check if the model has been fit before performing analysis. - - This private method verifies that all necessary attributes have been set - during the fitting process. - These attributes include: - - clf_x_ - - clf_y_ - - coefficient_ - - non_selection_ - - Raises - ------ - ValueError - If any of the required attributes are missing, indicating the model - hasn't been fit. - """ - if self.list_variable_importances_ is None: - raise ValueError("The D0CRT requires to be fit before any analysis") - - def importance(self, X, y): - """ - Compute feature importance scores using distilled CRT. - - Calculates test statistics and p-values for each feature using residual - correlations after the distillation process. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Input data matrix. - y : array-like of shape (n_samples,) - Target values. - - Returns - ------- - importances_ : ndarray of shape (n_features,) - Test statistics/importance scores for each feature. For unselected features, - the score is set to 0. - - Attributes - ---------- - importances_ : same as return value - pvalues_ : ndarray of shape (n_features,) - Two-sided p-values for each feature under Gaussian null. - - Notes - ----- - For each selected feature j: - 1. Computes residuals from regressing X_j on other features - 2. Computes residuals from regressing y on other features - 3. Calculates test statistic from correlation of residuals - 4. Computes p-value assuming standard normal distribution - """ - self._check_fit() - - def run_importance(variable_importance, X, y): - variable_importance.importance(X, y) - return None - - parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose) - parallel( - delayed(run_importance)(variable_importance, X, y) - for variable_importance in self.list_variable_importances_ - ) - - # Ensembling - # TODO check if selection_FDR is good - self.importances_ = np.mean( - [vi.importances_ for vi in self.list_variable_importances_], axis=0 - ) - # pvalue selection - self.pvalues_ = np.array( - [vi.pvalues_ for vi in self.list_variable_importances_] - ) - return self.importances_ - - def fit_importance(self, X, y, cv=None): - """ - Fits the model to the data and computes feature importance. - - A convenience method that combines fit() and importance() into a single call. - First fits the dCRT model to the data, then calculates importance scores. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Training data matrix. 
- y : array-like of shape (n_samples,) - Target values. - cv : None or int, optional (default=None) - Not used. Included for compatibility. A warning will be issued if provided. - - Returns - ------- - importance : ndarray of shape (n_features,) - Feature importance scores/test statistics. - For features not selected during screening, scores are set to 0. - - Notes - ----- - Also sets the importances\_ and pvalues\_ attributes on the instance. - See fit() and importance() for details on the underlying computations. - """ - if cv is not None: - warnings.warn("cv won't be used") - self.fit(X, y) - return self.importance(X, y) + labels = ward.labels_ + # compute the size of each cluster + clusters_size = np.zeros(labels.size) + for label in range(labels.max() + 1): + cluster_size = np.sum(labels == label) + clusters_size[labels == label] = cluster_size + # degroup beta_hat + if len(beta_hat.shape) == 1: + # weighting the weight of beta with the size of the cluster + beta_hat_degrouped = ward.inverse_transform(beta_hat) / clusters_size + elif len(beta_hat.shape) == 2: + n_times = beta_hat.shape[1] + beta_hat_degrouped = np.zeros((n_features, n_times)) + for i in range(n_times): + beta_hat_degrouped[:, i] = ( + ward.inverse_transform(beta_hat[:, i]) / clusters_size + ) + return beta_hat_degrouped From 076148e20b6d21b342197c07350c54e5dd0d5238 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 16:43:32 +0200 Subject: [PATCH 18/93] change signal noise ratio --- test/test_desparsified_lasso.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 22e9fda87..4c8c2d7b3 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -66,7 +66,7 @@ def test_desparsified_group_lasso(): n_features = 100 n_target = 10 support_size = 2 - signal_noise_ratio = 5000 + signal_noise_ratio = 100 rho_serial = 0.9 corr = toeplitz(np.geomspace(1, rho_serial ** (n_target - 1), n_target)) multitasklassoCV = MultiTaskLassoCV( @@ -132,7 +132,7 @@ def test_exception(): n_features = 100 n_target = 10 support_size = 2 - signal_noise_ratio = 5000 + signal_noise_ratio = 50 rho_serial = 0.9 corr = toeplitz(np.geomspace(1, rho_serial ** (n_target - 1), n_target)) multitasklassoCV = MultiTaskLassoCV( From f28126c62c52683581b11424a323eb27879d2892 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 16:47:06 +0200 Subject: [PATCH 19/93] change name for model --- examples/plot_fmri_data_example.py | 2 +- src/hidimstat/desparsified_lasso.py | 54 ++++++++++++++--------------- test/test_desparsified_lasso.py | 20 +++++------ 3 files changed, 36 insertions(+), 40 deletions(-) diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index fa8055724..7f21fbd30 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -160,7 +160,7 @@ def preprocess_haxby(subject=2, memory=None): # of 5 G for memory. To handle this problem, the following methods use some # feature aggregation methods. 
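Stepping back to the _ungroup_beta helper re-added above: inverse_transform broadcasts each cluster value to its member features, and dividing by the cluster size splits the coefficient mass evenly. A worked sketch:

import numpy as np
from sklearn.cluster import FeatureAgglomeration

X = np.random.default_rng(0).normal(size=(30, 6))
ward = FeatureAgglomeration(n_clusters=2).fit(X)
labels = ward.labels_
sizes = np.bincount(labels)[labels]   # cluster size, per feature
beta_clusters = np.array([3.0, 1.0])  # one coefficient per cluster
beta_features = ward.inverse_transform(beta_clusters) / sizes
print(beta_features)  # each cluster's coefficient spread over its members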
try: - desparsified_lasso = DesparsifiedLasso(noise_method="median", lasso_cv=estimator) + desparsified_lasso = DesparsifiedLasso(noise_method="median", model_y=estimator) desparsified_lasso.fit_importance(X, y) pval_dl = desparsified_lasso.pvalues_ one_minus_pval_dl = 1 - pval_dl diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index d09b4ddb0..dd96db704 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -123,7 +123,7 @@ class DesparsifiedLasso(BaseVariableImportance): def __init__( self, - lasso_cv=LassoCV( + model_y=LassoCV( eps=1e-2, fit_intercept=False, cv=KFold(n_splits=5, shuffle=True, random_state=0), @@ -132,7 +132,7 @@ def __init__( random_state=1, n_jobs=1, ), - lasso=Lasso(max_iter=5000, tol=1e-3), + model_x=Lasso(max_iter=5000, tol=1e-3), centered=True, dof_ajdustement=False, alpha_max_fraction=0.01, @@ -152,16 +152,16 @@ def __init__( ): assert issubclass( - Lasso, lasso.__class__ + Lasso, model_x.__class__ ), "lasso needs to be a Lasso or a MultiTaskLasso" - self.lasso = lasso - if issubclass(LassoCV, lasso_cv.__class__): + self.model_x = model_x + if issubclass(LassoCV, model_y.__class__): self.n_times_ = 1 - elif issubclass(MultiTaskLassoCV, lasso_cv.__class__): + elif issubclass(MultiTaskLassoCV, model_y.__class__): self.n_times_ = -1 else: raise AssertionError("lasso_cv needs to be a LassoCV or a MultiTaskLassoCV") - self.lasso_cv = lasso_cv + self.model_y = model_y self.centered = centered self.dof_ajdustement = dof_ajdustement self.alpha_max_fraction = alpha_max_fraction @@ -229,26 +229,24 @@ def fit(self, X, y): _, n_features = X_.shape try: - check_is_fitted(self.lasso_cv) + check_is_fitted(self.model_y) except NotFittedError: # check if max_iter is large enough - if self.lasso_cv.max_iter // self.lasso_cv.cv.n_splits <= n_features: - self.lasso_cv.set_params( - max_iter=n_features * self.lasso_cv.cv.n_splits - ) + if self.model_y.max_iter // self.model_y.cv.n_splits <= n_features: + self.model_y.set_params(max_iter=n_features * self.model_y.cv.n_splits) warnings.warn( - f"'max_iter' has been increased to {self.lasso_cv.max_iter}" + f"'max_iter' has been increased to {self.model_y.max_iter}" ) # use the cross-validation for define the best alpha of Lasso - self.lasso_cv.set_params(n_jobs=self.n_jobs) - self.lasso_cv.fit(X_, y_) + self.model_y.set_params(n_jobs=self.n_jobs) + self.model_y.fit(X_, y_) # Estimate the support of the variable importance - residual = self.lasso_cv.predict(X_) - y_ + residual = self.model_y.predict(X_) - y_ # Lasso regression and noise standard deviation estimation self.sigma_hat_ = memory.cache(reid, ignore=["n_jobs"])( - self.lasso_cv.coef_, + self.model_y.coef_, residual, tolerance=self.tolerance_reid, # for group @@ -277,7 +275,7 @@ def _check_fit(self): "The Desparsified Lasso requires to be fit before any analysis" ) try: - check_is_fitted(self.lasso_cv) + check_is_fitted(self.model_y) except NotFittedError: raise ValueError( "The Desparsified Lasso requires to be fit before any analysis" @@ -334,7 +332,7 @@ def importance(self, X, y): X_ = X y_ = y n_samples, n_features = X_.shape - assert X_.shape[1] == self.lasso_cv.coef_.shape[-1] + assert X_.shape[1] == self.model_y.coef_.shape[-1] assert self.n_times_ == 1 or self.n_times_ == y.shape[1] if self.n_times_ > 1: if self.covariance is not None and self.covariance.shape != ( @@ -345,7 +343,7 @@ def importance(self, X, y): f'Shape of "cov" should be ({self.n_times_}, {self.n_times_}),' + f' the shape of 
"cov" was ({self.covariance.shape}) instead' ) - assert y_.shape[1] == self.lasso_cv.coef_.shape[0] + assert y_.shape[1] == self.model_y.coef_.shape[0] # define the alphas for the Nodewise Lasso list_alpha_max = _alpha_max(X_, X_, fill_diagonal=True, axis=0) @@ -363,7 +361,7 @@ def importance(self, X, y): delayed(_compute_residuals)( X=X_, id_column=i, - clf=clone(self.lasso).set_params( + clf=clone(self.model_x).set_params( alpha=alphas[i], precompute=np.delete(np.delete(gram, i, axis=0), i, axis=1), random_state=np.random.RandomState( @@ -380,8 +378,8 @@ def importance(self, X, y): # Computing the degrees of freedom adjustement if self.dof_ajdustement: - coefficient_max = np.max(np.abs(self.lasso_cv.coef_)) - support = np.sum(np.abs(self.lasso_cv.coef_) > 0.01 * coefficient_max) + coefficient_max = np.max(np.abs(self.model_y.coef_)) + support = np.sum(np.abs(self.model_y.coef_) > 0.01 * coefficient_max) support = min(support, n_samples - 1) dof_factor = n_samples / (n_samples - support) else: @@ -396,7 +394,7 @@ def importance(self, X, y): P_nodiagonal = P - np.diag(np.diag(P)) Id = np.identity(n_features) P_nodiagonal = dof_factor * P_nodiagonal + (dof_factor - 1) * Id - beta_hat = beta_bias.T - P_nodiagonal.dot(self.lasso_cv.coef_.T) + beta_hat = beta_bias.T - P_nodiagonal.dot(self.model_y.coef_.T) # confidence intervals precision_diagonal = precision_diagonal * dof_factor**2 @@ -546,7 +544,7 @@ def desparsified_lasso( X, y, cv=None, - lasso_cv=LassoCV( + model_y=LassoCV( eps=1e-2, fit_intercept=False, cv=KFold(n_splits=5, shuffle=True, random_state=0), @@ -554,7 +552,7 @@ def desparsified_lasso( max_iter=5000, random_state=0, ), - lasso=Lasso(max_iter=5000, tol=1e-3), + model_x=Lasso(max_iter=5000, tol=1e-3), centered=True, dof_ajdustement=False, alpha_max_fraction=0.01, @@ -577,8 +575,8 @@ def desparsified_lasso( threshold_pvalue=None, ): methods = DesparsifiedLasso( - lasso_cv=lasso_cv, - lasso=lasso, + model_y=model_y, + model_x=model_x, centered=centered, dof_ajdustement=dof_ajdustement, alpha_max_fraction=alpha_max_fraction, diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 4c8c2d7b3..b3d10e537 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -90,7 +90,7 @@ def test_desparsified_group_lasso(): ) desparsified_lasso = DesparsifiedLasso( - lasso_cv=multitasklassoCV, covariance=corr + model_y=multitasklassoCV, covariance=corr ).fit(X, y) importances = desparsified_lasso.importance(X, y) @@ -102,9 +102,7 @@ def test_desparsified_group_lasso(): assert_almost_equal(importances, beta, decimal=1) assert_almost_equal(desparsified_lasso.pvalues_corr_, expected_pval_corr, decimal=1) - desparsified_lasso = DesparsifiedLasso(lasso_cv=multitasklassoCV, test="F").fit( - X, y - ) + desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV, test="F").fit(X, y) importances = desparsified_lasso.importance(X, y) assert_almost_equal(importances, beta, decimal=1) @@ -116,13 +114,13 @@ def test_desparsified_group_lasso(): # ValueError, desparsified_lasso, X=X, y=y, multioutput=True, covariance=bad_cov # ) desparsified_lasso = DesparsifiedLasso( - lasso_cv=multitasklassoCV, covariance=bad_cov + model_y=multitasklassoCV, covariance=bad_cov ).fit(X, y) with pytest.raises(ValueError): desparsified_lasso.importance(X, y) with pytest.raises(AssertionError, match="Unknown test 'r2'"): - DesparsifiedLasso(lasso_cv=multitasklassoCV, covariance=bad_cov, test="r2").fit( + DesparsifiedLasso(model_y=multitasklassoCV, covariance=bad_cov, 
test="r2").fit( X, y ) @@ -158,14 +156,14 @@ def test_exception(): with pytest.raises( AssertionError, match="lasso needs to be a Lasso or a MultiTaskLasso" ): - DesparsifiedLasso(lasso=RandomForestClassifier()) + DesparsifiedLasso(model_x=RandomForestClassifier()) with pytest.raises( AssertionError, match="lasso_cv needs to be a LassoCV or a MultiTaskLassoCV" ): - DesparsifiedLasso(lasso_cv=RandomForestClassifier()) + DesparsifiedLasso(model_y=RandomForestClassifier()) with pytest.raises(AssertionError, match="Unknown test 'r2'"): DesparsifiedLasso(test="r2") - desparsified_lasso = DesparsifiedLasso(lasso_cv=multitasklassoCV) + desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV) with pytest.raises( ValueError, match="The Desparsified Lasso requires to be fit before any analysis", @@ -178,7 +176,7 @@ def test_exception(): ): desparsified_lasso.importance(X, y) - desparsified_lasso = DesparsifiedLasso(lasso_cv=multitasklassoCV).fit(X, y) + desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV).fit(X, y) with pytest.raises(ValueError, match="Unknown test 'r2'"): desparsified_lasso.test = "r2" desparsified_lasso.importance(X, y) @@ -200,7 +198,7 @@ def test_warning(): seed=10, ) desparsified_lasso = DesparsifiedLasso( - lasso_cv=LassoCV(cv=KFold(n_splits=2), max_iter=10) + model_y=LassoCV(cv=KFold(n_splits=2), max_iter=10) ) with pytest.warns(Warning, match="'max_iter' has been increased to"): with pytest.warns(Warning, match="cv won't be used"): From e8134d8a966ce74b43e549b4e5288b697756af50 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 18:53:29 +0200 Subject: [PATCH 20/93] remove function for knockoff --- src/hidimstat/base_variable_importance.py | 90 ----------------------- 1 file changed, 90 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index e2d699ead..fc5e947ab 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -239,93 +239,3 @@ def selection_fdr( ) selected = self.aggregated_eval_ >= self.threshold_fdr_ return selected - - -def _estimated_threshold(test_score, fdr=0.1): - """ - Calculate the threshold based on the procedure stated in the knockoff article. - Original code: - https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R - Parameters - ---------- - test_score : 1D ndarray, shape (n_features, ) - Vector of test statistic. - fdr : float - Desired controlled FDR (false discovery rate) level. - Returns - ------- - threshold : float or np.inf - Threshold level. - """ - offset = 1 # Offset equals 1 is the knockoff+ procedure. - - threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) - np.concatenate( - [[0], threshold_mesh, [np.inf]] - ) # if there is no solution, the threshold is inf - # find the right value of t for getting a good fdr - # Equation 1.8 of barber2015controlling and 3.10 in Candès 2018 - threshold = 0.0 - for threshold in threshold_mesh: - false_pos = np.sum(test_score <= -threshold) - selected = np.sum(test_score >= threshold) - if (offset + false_pos) / np.maximum(selected, 1) <= fdr: - break - return threshold - - -def _empirical_pval(test_score): - """ - Compute the empirical p-values from the test based on knockoff+. - Parameters - ---------- - test_score : 1D ndarray, shape (n_features, ) - Vector of test statistics. - Returns - ------- - pvals : 1D ndarray, shape (n_features, ) - Vector of empirical p-values. 
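Back on PATCH 19's rename: the cross-validated learner for y is now passed as model_y and the nodewise Lasso for the X columns as model_x. A minimal construction sketch:

from sklearn.linear_model import Lasso, LassoCV
from hidimstat import DesparsifiedLasso

dl = DesparsifiedLasso(
    model_y=LassoCV(eps=1e-2, fit_intercept=False, max_iter=5000),
    model_x=Lasso(max_iter=5000, tol=1e-3),
)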
- """ - pvals = [] - n_features = test_score.size - - offset = 1 # Offset equals 1 is the knockoff+ procedure. - - test_score_inv = -test_score - for i in range(n_features): - if test_score[i] <= 0: - pvals.append(1) - else: - pvals.append( - (offset + np.sum(test_score_inv >= test_score[i])) / n_features - ) - - return np.array(pvals) - - -def _empirical_eval(test_score, ko_threshold): - """ - Compute the empirical e-values from the test based on knockoff. - Parameters - ---------- - test_score : 1D ndarray, shape (n_features, ) - Vector of test statistics. - ko_threshold : float - Threshold level. - Returns - ------- - evals : 1D ndarray, shape (n_features, ) - Vector of empirical e-values. - """ - evals = [] - n_features = test_score.size - - offset = 1 # Offset equals 1 is the knockoff+ procedure. - - for i in range(n_features): - if test_score[i] < ko_threshold: - evals.append(0) - else: - evals.append(n_features / (offset + np.sum(test_score <= -ko_threshold))) - - return np.array(evals) From 51685e87738ce47740578b1214d58772b11bc20a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 19:14:01 +0200 Subject: [PATCH 21/93] update selection_fdr --- src/hidimstat/base_variable_importance.py | 87 +++++++---------------- 1 file changed, 24 insertions(+), 63 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index fc5e947ab..b56a47950 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -147,7 +147,6 @@ def selection_fdr( self, fdr, fdr_control="bhq", - evalues=False, reshaping_function=None, adaptive_aggregation=False, gamma=0.5, @@ -155,29 +154,24 @@ def selection_fdr( """ Performs feature selection based on False Discovery Rate (FDR) control. - This method selects features by controlling the FDR using either p-values or e-values - derived from test scores. It supports different FDR control methods and optional - adaptive aggregation of the statistical values. + This method selects features by controlling the FDR using either p-values. + It supports different FDR control methods and optional adaptive aggregation + of the statistical values. Parameters ---------- - fdr : float, default=None + fdr : float The target false discovery rate level (between 0 and 1) - fdr_control: string, default="bhq" + fdr_control: str, default="bhq" The FDR control method to use. Options are: - "bhq": Benjamini-Hochberg procedure - 'bhy': Benjamini-Hochberg-Yekutieli procedure - - "ebh": e-BH procedure (only for e-values) - evalues: boolean, default=False - If True, uses e-values for selection. If False, uses p-values. reshaping_function: callable, default=None Reshaping function for BHY method, default uses sum of reciprocals - adaptive_aggregation: boolean, default=False - If True, uses adaptive weights for p-value aggregation. - Only applicable when evalues=False. - gamma: boolean, default=0.5 - The gamma parameter for quantile aggregation of p-values. - Only used when evalues=False. 
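The rewritten selection_fdr below reduces selection to p-value aggregation plus a Benjamini-Hochberg cut ("bhq"). A standalone BH sketch of that thresholding step (hidimstat's fdr_threshold provides the same rule among others):

import numpy as np

def bh_threshold(pvals, fdr=0.1):
    # largest sorted p-value with p_(k) <= fdr * k / m; -inf if none pass
    m = len(pvals)
    sorted_p = np.sort(pvals)
    passed = sorted_p <= fdr * np.arange(1, m + 1) / m
    return sorted_p[passed].max() if passed.any() else -np.inf

pvals = np.random.default_rng(2).uniform(size=100) ** 3
selected = pvals <= bh_threshold(pvals, fdr=0.1)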
+ adaptive_aggregation: bool, default=False + If True, uses adaptive weights for p-value aggregation + gamma: float, default=0.5 + The gamma parameter for quantile aggregation of p-values (between 0 and 1) Returns ------- @@ -187,55 +181,22 @@ def selection_fdr( Raises ------ AssertionError - If test_scores\_ is None or if incompatible combinations of parameters are provided + If list_pvalues_ attribute is missing or fdr_control is invalid """ - self._check_importance() - assert ( - self.test_scores_ is not None + assert hasattr( + self, "list_pvalues_" ), "this method doesn't support selection base on FDR" + self._check_importance() + assert fdr_control == "bhq" and fdr_control == "bhy" - if self.test_scores_ is None: - if self.pvalues_ is None: - raise ValueError( - "For using a selection with FDR, it require a method which compute at least FDR." - ) - else: - self.threshold_fdr_ = fdr_threshold( - self.pvalues_, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - elif self.test_scores_.shape[0] == 1: - self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr) - selected = self.test_scores_[0] >= self.threshold_fdr_ - elif not evalues: - assert fdr_control != "ebh", "for p-value, the fdr control can't be 'ebh'" - pvalues = np.array( - [_empirical_pval(test_score) for test_score in self.test_scores_] - ) - self.aggregated_pval_ = quantile_aggregation( - pvalues, gamma=gamma, adaptive=adaptive_aggregation - ) - self.threshold_fdr_ = fdr_threshold( - self.aggregated_pval_, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - selected = self.aggregated_pval_ <= self.threshold_fdr_ - else: - assert fdr_control == "ebh", "for e-value, the fdr control need to be 'ebh'" - evalues = [] - for test_score in self.test_scores_: - ko_threshold = _estimated_threshold(test_score, fdr=fdr) - evalues.append(_empirical_eval(test_score, ko_threshold)) - self.aggregated_eval_ = np.mean(evalues, axis=0) - self.threshold_fdr_ = fdr_threshold( - self.aggregated_eval_, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - selected = self.aggregated_eval_ >= self.threshold_fdr_ + aggregated_pval = quantile_aggregation( + np.array(self.list_pvalues_), gamma=gamma, adaptive=adaptive_aggregation + ) + threshold_pval = fdr_threshold( + aggregated_pval, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) + selected = aggregated_pval <= threshold_pval return selected From 77f6391c6dd452a2cfa80435acd2d0d5123c9575 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 19:32:39 +0200 Subject: [PATCH 22/93] ifix some bugs --- src/hidimstat/__init__.py | 2 -- src/hidimstat/ensemble_clustered_inference.py | 27 +++++++-------- test/test_ensemble_clustered_inference.py | 34 ++++++++----------- 3 files changed, 28 insertions(+), 35 deletions(-) diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index 201fcb89e..70a95710a 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -1,6 +1,5 @@ from .base_variable_importance import BaseVariableImportance from .base_perturbation import BasePerturbation -from .ensemble_clustered_inference import ClusteredInference from .ensemble_clustered_inference import EnsembleClusteredInference from .desparsified_lasso import desparsified_lasso, DesparsifiedLasso from .distilled_conditional_randomization_test import d0crt, D0CRT @@ -24,7 +23,6 @@ __all__ = [ "quantile_aggregation", - "ClusteredInference", "EnsembleClusteredInference", 
"d0crt", "D0CRT", diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 87b538c2c..a1505656d 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -106,7 +106,7 @@ def __init__( n_jobs=1, verbose=1, ): - assert issubclass(DesparsifiedLasso, variable_importance) + assert issubclass(DesparsifiedLasso, variable_importance.__class__) self.variable_importance = variable_importance assert ward is None or issubclass( FeatureAgglomeration, ward.__class__ @@ -179,17 +179,17 @@ def _check_fit(self): for ward, scaler, vi in self.list_ward_scaler_vi_: if ward is not None: try: - check_is_fitted(self.ward) + check_is_fitted(ward) except NotFittedError: raise ValueError( - "The ClusteredInference requires to be fit before any analysis" + "The EnsembleClusteredInference requires to be fit before any analysis" ) if scaler is not None: try: - check_is_fitted(self.scaler_sampling) + check_is_fitted(scaler) except NotFittedError: raise ValueError( - "The ClusteredInference requires to be fit before any analysis" + "The EnsembleClusteredInference requires to be fit before any analysis" ) vi._check_fit() @@ -249,9 +249,9 @@ def importance(self, X, y): self.list_pvalues_.append(pvalues) self.list_pvalues_corr_.append(pvalues_corr) - self.importances_ = np.mean(self.list_importances_) - self.pvalues_ = np.mean(self.list_pvalues_) - self.pvalues_corr_ = np.mean(self.list_pvalues_corr_) + self.importances_ = np.mean(self.list_importances_, axis=0) + self.pvalues_ = np.mean(self.list_pvalues_, axis=0) + self.pvalues_corr_ = np.mean(self.list_pvalues_corr_, axis=0) return self.importances_ def fit_importance(self, X, y, cv=None): @@ -304,11 +304,10 @@ def _bootstrap_run_fit( # sampling row of X train_index = _subsampling(n_samples, train_size, groups=groups, seed=seed) - X = X_init[train_index, :] # transformation matrix if ward is not None: - ward_ = clone(ward) - X_reduced, ward = ward_.fit_transform(X) + ward_ = clone(ward).fit(X_init[train_index, :]) + X_reduced = ward_.transform(X_init) else: X_reduced = X_init ward_ = None @@ -335,15 +334,15 @@ def _bootstrap_run_importance(ward_, scaler_sampling_, variable_importance_, X, # apply Preprocessing if scaler_sampling_ is not None: - X_ = scaler_sampling_.transform(X) + X_ = scaler_sampling_.transform(X_) else: X_ = X variable_importance_.importance(X_, y) if ward_ is not None: - pvalue = ward_.inverse_transform(variable_importance_.pvalue_) - pvalue_corr = ward_.inverse_transform(variable_importance_.pvalue_corr) + pvalue = ward_.inverse_transform(variable_importance_.pvalues_) + pvalue_corr = ward_.inverse_transform(variable_importance_.pvalues_corr_) importance = _ungroup_beta( variable_importance_.importances_, n_features=pvalue.shape[0], ward=ward_ ) diff --git a/test/test_ensemble_clustered_inference.py b/test/test_ensemble_clustered_inference.py index b894308dd..b989a1035 100644 --- a/test/test_ensemble_clustered_inference.py +++ b/test/test_ensemble_clustered_inference.py @@ -10,7 +10,6 @@ from sklearn.linear_model import MultiTaskLassoCV from sklearn.model_selection import KFold -from hidimstat.ensemble_clustered_inference import ClusteredInference from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference from hidimstat.desparsified_lasso import DesparsifiedLasso from hidimstat._utils.scenario import multivariate_simulation @@ -68,8 +67,8 @@ def test_clustered_inference_no_temporal(): n_clusters=n_clusters, 
connectivity=connectivity, linkage="ward" ) - clustered_inference = ClusteredInference( - ward, n_clusters, scaler_sampling=StandardScaler() + clustered_inference = EnsembleClusteredInference( + ward=ward, scaler_sampling=StandardScaler(), n_bootstraps=1 ).fit(X_init, y) clustered_inference.importance(X_init, y) @@ -125,11 +124,11 @@ def test_clustered_inference_temporal(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - clustered_inference = ClusteredInference( - ward, - n_clusters, + clustered_inference = EnsembleClusteredInference( + ward=ward, variable_importance=set_desparsified_lasso_multi_time(), scaler_sampling=StandardScaler(), + n_bootstraps=1, ).fit(X, y) clustered_inference.importance(X, y) @@ -198,11 +197,11 @@ def test_clustered_inference_no_temporal_groups(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - clustered_inference = ClusteredInference( - ward, - n_clusters, + clustered_inference = EnsembleClusteredInference( + ward=ward, scaler_sampling=StandardScaler(), groups=groups, + n_bootstraps=1, ).fit(X_, y_) clustered_inference.importance(X_, y_) @@ -258,11 +257,10 @@ def test_ensemble_clustered_inference(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - clustered_inference = ClusteredInference( - ward, n_clusters, scaler_sampling=StandardScaler() - ) EnCluDl = EnsembleClusteredInference( - variable_importance=clustered_inference, n_bootstraps=n_bootstraps + ward=ward, + scaler_sampling=StandardScaler(), + n_bootstraps=n_bootstraps, ).fit(X_init, y) EnCluDl.importance(X_init, y) selected = EnCluDl.selection_fdr(fdr=0.1) @@ -313,14 +311,12 @@ def test_ensemble_clustered_inference_temporal_data(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - clustered_inference = ClusteredInference( - ward, - n_clusters, + EnCluDl = EnsembleClusteredInference( variable_importance=set_desparsified_lasso_multi_time(), + ward=ward, + n_clusters=n_clusters, scaler_sampling=StandardScaler(), - ) - EnCluDl = EnsembleClusteredInference( - variable_importance=clustered_inference, n_bootstraps=n_bootstraps + n_bootstraps=n_bootstraps, ).fit(X, y) EnCluDl.importance(X, y) selected = EnCluDl.selection_fdr(fdr=0.1, fdr_control="bhq") From 3dd668dc8e764a85c550fe23eae1d19261fea5ce Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 19:37:33 +0200 Subject: [PATCH 23/93] fix test --- src/hidimstat/base_variable_importance.py | 2 +- test/test_ensemble_clustered_inference.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index b56a47950..3b7882fc3 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -187,7 +187,7 @@ def selection_fdr( self, "list_pvalues_" ), "this method doesn't support selection base on FDR" self._check_importance() - assert fdr_control == "bhq" and fdr_control == "bhy" + assert fdr_control == "bhq" or fdr_control == "bhy" aggregated_pval = quantile_aggregation( np.array(self.list_pvalues_), gamma=gamma, adaptive=adaptive_aggregation diff --git a/test/test_ensemble_clustered_inference.py b/test/test_ensemble_clustered_inference.py index b989a1035..7857e9c23 100644 --- a/test/test_ensemble_clustered_inference.py +++ b/test/test_ensemble_clustered_inference.py @@ -314,7 +314,6 @@ def test_ensemble_clustered_inference_temporal_data(): EnCluDl = EnsembleClusteredInference( 
variable_importance=set_desparsified_lasso_multi_time(), ward=ward, - n_clusters=n_clusters, scaler_sampling=StandardScaler(), n_bootstraps=n_bootstraps, ).fit(X, y) @@ -325,10 +324,10 @@ def test_ensemble_clustered_inference_temporal_data(): expected[:support_size] = 1.0 assert_almost_equal( - selected[:interior_support, 0], expected[:interior_support], decimal=3 + selected[:interior_support], expected[:interior_support], decimal=3 ) assert_almost_equal( - selected[extended_support:, 0], expected[extended_support:], decimal=1 + selected[extended_support:], expected[extended_support:], decimal=1 ) # different aggregation method @@ -338,8 +337,8 @@ def test_ensemble_clustered_inference_temporal_data(): expected[:support_size] = 1.0 assert_almost_equal( - selected[:interior_support, 0], expected[:interior_support], decimal=3 + selected[:interior_support], expected[:interior_support], decimal=3 ) assert_almost_equal( - selected[extended_support:, 0], expected[extended_support:], decimal=1 + selected[extended_support:], expected[extended_support:], decimal=1 ) From 3b1f945d4f1d0426d5bcef56daa44d92026f5a9c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 19:41:01 +0200 Subject: [PATCH 24/93] fix api --- docs/src/api.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/src/api.rst b/docs/src/api.rst index 24c85e9bf..6aeee0d4a 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -34,5 +34,4 @@ Classes PFI D0CRT DesparsifiedLasso - ClusteredInference EnsembleClusteredInference From 6f9449578a6f93d4e910f089d3ece890046d5297 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 19:47:49 +0200 Subject: [PATCH 25/93] fix example --- examples/plot_2D_simulation_example.py | 41 ++++++++------------------ 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index fc0b437e2..cba6a4a99 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -58,14 +58,7 @@ from sklearn.linear_model import MultiTaskLassoCV from hidimstat import DesparsifiedLasso -from hidimstat.ensemble_clustered_inference import ( - clustered_inference, - clustered_inference_pvalue, -) -from hidimstat.ensemble_clustered_inference import ( - ensemble_clustered_inference, - ensemble_clustered_inference_pvalue, -) +from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference from hidimstat.statistical_tools.p_values import zscore_from_pval from hidimstat._utils.scenario import multivariate_simulation_spatial @@ -267,20 +260,21 @@ def plot(maps, titles): ) # clustered desparsified lasso (CluDL) -ward_, desparsified_lasso = clustered_inference( - X_init, y, ward, n_clusters, scaler_sampling=StandardScaler() -) -beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - clustered_inference_pvalue(n_samples, False, ward_, desparsified_lasso) +clustered_inference = EnsembleClusteredInference( + ward=ward, scaler_sampling=StandardScaler(), n_bootstraps=1 ) +clustered_inference.fit_importance(X_init, y) # compute estimated support (first method) -zscore = zscore_from_pval(pval, one_minus_pval) +zscore = zscore_from_pval( + clustered_inference.pvalues_, 1 - clustered_inference.pvalues_ +) selected_cdl = zscore > thr_c # use the "clustering threshold" # compute estimated support (second method) selected_cdl = np.logical_or( - pval_corr < fwer_target / 2, one_minus_pval_corr < fwer_target / 2 + clustered_inference.pvalues_corr_ < fwer_target / 2, + 1 - 
clustered_inference.pvalues_corr_ < fwer_target / 2,
 )
 
 #############################################################################
@@ -291,20 +285,11 @@ def plot(maps, titles):
 # solutions are then aggregated into one.
 
 # ensemble of clustered desparsified lasso (EnCluDL)
-list_ward, list_desparsified_lasso = ensemble_clustered_inference(
-    X_init,
-    y,
-    ward,
-    n_clusters,
-    scaler_sampling=StandardScaler(),
-)
-beta_hat, selected_ecdl = ensemble_clustered_inference_pvalue(
-    n_samples,
-    False,
-    list_ward,
-    list_desparsified_lasso,
-    fdr=fwer_target,
+ensemble_clustered_inference = EnsembleClusteredInference(
+    ward=ward, scaler_sampling=StandardScaler(), n_bootstraps=1
 )
+ensemble_clustered_inference.fit_importance(X_init, y)
+selected_ecdl = ensemble_clustered_inference.selection_fdr(fdr=fwer_target)
 
 #############################################################################
 # Results

From d8c0f803d8891babe33ae0d773004dcc93799d94 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Wed, 10 Sep 2025 11:30:29 +0200
Subject: [PATCH 26/93] small modification

---
 src/hidimstat/desparsified_lasso.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py
index dd96db704..8e9752bb6 100644
--- a/src/hidimstat/desparsified_lasso.py
+++ b/src/hidimstat/desparsified_lasso.py
@@ -241,13 +241,10 @@ def fit(self, X, y):
             self.model_y.set_params(n_jobs=self.n_jobs)
             self.model_y.fit(X_, y_)
 
-        # Estimate the support of the variable importance
-        residual = self.model_y.predict(X_) - y_
-
         # Lasso regression and noise standard deviation estimation
         self.sigma_hat_ = memory.cache(reid, ignore=["n_jobs"])(
-            self.model_y.coef_,
-            residual,
+            self.model_y.coef_,  # estimated support of the variable importance
+            self.model_y.predict(X_) - y_,  # compute the residual
             tolerance=self.tolerance_reid,
             # for group
             multioutput=self.n_times_ > 1,

From 631cf877b0aeed66da7c777e2b58019e0e98e8cb Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Wed, 10 Sep 2025 11:42:29 +0200
Subject: [PATCH 27/93] fix variable name

---
 src/hidimstat/desparsified_lasso.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py
index 8e9752bb6..d11b5a8cc 100644
--- a/src/hidimstat/desparsified_lasso.py
+++ b/src/hidimstat/desparsified_lasso.py
@@ -387,11 +387,12 @@ def importance(self, X, y):
         beta_bias = dof_factor * np.dot(y_.T, Z) / np.sum(X_ * Z, axis=0)
 
         # beta hat
-        P = (np.dot(X_.T, Z) / np.sum(X_ * Z, axis=0)).T
-        P_nodiagonal = P - np.diag(np.diag(P))
-        Id = np.identity(n_features)
-        P_nodiagonal = dof_factor * P_nodiagonal + (dof_factor - 1) * Id
-        beta_hat = beta_bias.T - P_nodiagonal.dot(self.model_y.coef_.T)
+        p = (np.dot(X_.T, Z) / np.sum(X_ * Z, axis=0)).T
+        p_nodiagonal = p - np.diag(np.diag(p))
+        p_nodiagonal = dof_factor * p_nodiagonal + (dof_factor - 1) * np.identity(
+            n_features
+        )
+        beta_hat = beta_bias.T - p_nodiagonal.dot(self.model_y.coef_.T)
 
         # confidence intervals
         precision_diagonal = precision_diagonal * dof_factor**2

From b54fe16a57398e5c4120ee8738f8a2240267958b Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Wed, 10 Sep 2025 11:44:00 +0200
Subject: [PATCH 28/93] fix docstring

---
 src/hidimstat/desparsified_lasso.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py
index d11b5a8cc..c30845a2b 100644
--- 
a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -33,10 +33,10 @@ class DesparsifiedLasso(BaseVariableImportance): Parameters ---------- - lasso_cv : LassoCV or MultiTaskLassoCV instance, default=LassoCV() + model_y : LassoCV or MultiTaskLassoCV instance, default=LassoCV() CV object used for initial Lasso fit. - lasso : Lasso instance, default=Lasso() + model_x : Lasso instance, default=Lasso() Base Lasso estimator used for nodewise regressions. centered : bool, default=True From 413574076dbb1fa0c20b03d3345d13f01b3f5719 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 12:01:17 +0200 Subject: [PATCH 29/93] fix example and test --- examples/plot_fmri_data_example.py | 6 ++---- src/hidimstat/ensemble_clustered_inference.py | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 7f21fbd30..f62df2ad7 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -178,7 +178,7 @@ def preprocess_haxby(subject=2, memory=None): ward, n_clusters, scaler_sampling=StandardScaler(), - lasso_cv=estimator, # , tolerance=1e-2 + model_y=estimator, ) beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue( X.shape[0], None, ward_, cl_desparsified_lasso @@ -200,9 +200,7 @@ def preprocess_haxby(subject=2, memory=None): groups=groups, scaler_sampling=StandardScaler(), n_bootstraps=5, - lasso_cv=estimator, - # max_iteration=6000, - # tolerance=1e-2, + model_y=estimator, n_jobs=2, ) beta_hat, selected = ensemble_clustered_inference_pvalue( diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 25ba91688..bb684c118 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -260,7 +260,7 @@ def clustered_inference( if hasattr(kwargs, "lasso_cv") and kwargs["lasso_cv"] is not None: pass elif len(y.shape) > 1 and y.shape[1] > 1: - kwargs["lasso_cv"] = MultiTaskLassoCV( + kwargs["model_y"] = MultiTaskLassoCV( eps=1e-2, fit_intercept=False, cv=KFold(n_splits=5, shuffle=True, random_state=0), @@ -270,7 +270,7 @@ def clustered_inference( n_jobs=1, ) else: - kwargs["lasso_cv"] = LassoCV( + kwargs["model_y"] = LassoCV( eps=1e-2, fit_intercept=False, cv=KFold(n_splits=5, shuffle=True, random_state=0), From 70c74a4fc8357b4511150e5f3659bb7bba7d3e31 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 18:18:06 +0200 Subject: [PATCH 30/93] put _subsampling in encludl --- src/hidimstat/_utils/bootstrap.py | 35 ------------------- src/hidimstat/ensemble_clustered_inference.py | 35 ++++++++++++++++++- 2 files changed, 34 insertions(+), 36 deletions(-) delete mode 100644 src/hidimstat/_utils/bootstrap.py diff --git a/src/hidimstat/_utils/bootstrap.py b/src/hidimstat/_utils/bootstrap.py deleted file mode 100644 index dd4bd1b97..000000000 --- a/src/hidimstat/_utils/bootstrap.py +++ /dev/null @@ -1,35 +0,0 @@ -import numpy as np -from sklearn.utils import resample - - -def _subsampling(n_samples, train_size, groups=None, seed=0): - """ - Random subsampling for statistical inference. - - Parameters - ---------- - n_samples : int - Total number of samples in the dataset. - train_size : float - Fraction of samples to include in the training set (between 0 and 1). - groups : ndarray, shape (n_samples,), optional (default=None) - Group labels for samples. - If not None, a subset of groups is selected. 
- seed : int, optional (default=0) - Random seed for reproducibility. - - Returns - ------- - train_index : ndarray - Indices of selected samples for training. - """ - index_row = np.arange(n_samples) if groups is None else np.unique(groups) - train_index = resample( - index_row, - n_samples=int(len(index_row) * train_size), - replace=False, - random_state=seed, - ) - if groups is not None: - train_index = np.arange(n_samples)[np.isin(groups, train_index)] - return train_index diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index a1505656d..8c332a16d 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -11,7 +11,7 @@ from hidimstat.desparsified_lasso import DesparsifiedLasso from hidimstat.base_variable_importance import BaseVariableImportance -from hidimstat._utils.bootstrap import _subsampling +from sklearn.utils import resample class EnsembleClusteredInference(BaseVariableImportance): @@ -354,6 +354,39 @@ def _bootstrap_run_importance(ward_, scaler_sampling_, variable_importance_, X, return importance, pvalue, pvalue_corr +def _subsampling(n_samples, train_size, groups=None, seed=0): + """ + Random subsampling for statistical inference. + + Parameters + ---------- + n_samples : int + Total number of samples in the dataset. + train_size : float + Fraction of samples to include in the training set (between 0 and 1). + groups : ndarray, shape (n_samples,), optional (default=None) + Group labels for samples. + If not None, a subset of groups is selected. + seed : int, optional (default=0) + Random seed for reproducibility. + + Returns + ------- + train_index : ndarray + Indices of selected samples for training. + """ + index_row = np.arange(n_samples) if groups is None else np.unique(groups) + train_index = resample( + index_row, + n_samples=int(len(index_row) * train_size), + replace=False, + random_state=seed, + ) + if groups is not None: + train_index = np.arange(n_samples)[np.isin(groups, train_index)] + return train_index + + def _ungroup_beta(beta_hat, n_features, ward): """ Ungroup cluster-level beta coefficients to individual feature-level From 571e306c8000bd147733614d39d993f64dda50b0 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 15:59:08 +0200 Subject: [PATCH 31/93] fix commit --- test/test_ensemble_clustered_inference.py | 8 +++----- test/test_noise_std.py | 3 --- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/test/test_ensemble_clustered_inference.py b/test/test_ensemble_clustered_inference.py index 4bf0374a8..9c552508e 100644 --- a/test/test_ensemble_clustered_inference.py +++ b/test/test_ensemble_clustered_inference.py @@ -57,7 +57,7 @@ def test_clustered_inference_no_temporal(): ) ward_, desparsified_lassos = clustered_inference( - X_init, y, ward, n_clusters, scaler_sampling=StandardScaler() + X_init, y, ward, scaler_sampling=StandardScaler() ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( @@ -112,7 +112,7 @@ def test_clustered_inference_temporal(): ) ward_, desparsified_lassos = clustered_inference( - X, y, ward, n_clusters, scaler_sampling=StandardScaler() + X, y, ward, scaler_sampling=StandardScaler() ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( @@ -181,7 +181,7 @@ def test_clustered_inference_no_temporal_groups(): ) ward_, desparsified_lassos = clustered_inference( - X_, y_, ward, n_clusters, groups=groups, scaler_sampling=StandardScaler() + X_, y_, ward, groups=groups, 
scaler_sampling=StandardScaler() ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( @@ -241,7 +241,6 @@ def test_ensemble_clustered_inference(): X_init, y, ward, - n_clusters, scaler_sampling=StandardScaler(), n_bootstraps=n_bootstraps, ) @@ -299,7 +298,6 @@ def test_ensemble_clustered_inference_temporal_data(): X, y, ward, - n_clusters, scaler_sampling=StandardScaler(), n_bootstraps=n_bootstraps, ) diff --git a/test/test_noise_std.py b/test/test_noise_std.py index d1070dbdd..2a08f942b 100644 --- a/test/test_noise_std.py +++ b/test/test_noise_std.py @@ -100,7 +100,6 @@ def test_group_reid(): residual, multioutput=True, tolerance=1e-3, - random_state=random_state, ) error_relative = np.abs(cov_hat - cov) / cov assert np.max(error_relative) < 0.3 @@ -110,7 +109,6 @@ def test_group_reid(): residual, multioutput=True, method="AR", - random_state=random_state, ) error_relative = np.abs(cov_hat - cov) / cov assert np.max(error_relative) < 0.3 @@ -120,7 +118,6 @@ def test_group_reid(): residual, multioutput=True, stationary=False, - random_state=random_state, ) error_relative = np.abs(cov_hat - cov) / cov assert np.max(error_relative) > 0.3 From af12949b432ea2c845350ecc73dae79287620458 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 15:59:20 +0200 Subject: [PATCH 32/93] rename variable --- src/hidimstat/noise_std.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hidimstat/noise_std.py b/src/hidimstat/noise_std.py index de213440c..34cb0a8c5 100644 --- a/src/hidimstat/noise_std.py +++ b/src/hidimstat/noise_std.py @@ -114,15 +114,15 @@ def reid( sigma_hat = np.median(sigma_hat_raw) * np.ones(n_times) # compute rho from the empirical correlation matrix # (section 2.5 of `chevalier2020statistical`) - correlation_emperical = np.corrcoef(residual.T) + correlation_empirical = np.corrcoef(residual.T) else: sigma_hat = sigma_hat_raw residual_rescaled = residual / sigma_hat - correlation_emperical = np.corrcoef(residual_rescaled.T) + correlation_empirical = np.corrcoef(residual_rescaled.T) # Median method if not stationary or method == "median": - rho_hat = np.median(np.diag(correlation_emperical, 1)) + rho_hat = np.median(np.diag(correlation_empirical, 1)) # estimate M (section 2.5 of `chevalier2020statistical`) correlation_hat = toeplitz( np.geomspace(1, rho_hat ** (n_times - 1), n_times) @@ -136,7 +136,7 @@ def reid( rho_ar[0] = 1 for i in range(1, order + 1): - rho_ar[i] = np.median(np.diag(correlation_emperical, i)) + rho_ar[i] = np.median(np.diag(correlation_empirical, i)) # solve the Yule-Walker equations (see eq.2 in `eshel2003yule`) R = toeplitz(rho_ar[:-1]) From fae39eeddb8012ff3e1735654398e82887eea357 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 16:06:29 +0200 Subject: [PATCH 33/93] fix format --- examples/plot_fmri_data_example.py | 15 +++++++++++---- src/hidimstat/__init__.py | 4 ++-- src/hidimstat/desparsified_lasso.py | 17 ++++++++--------- src/hidimstat/ensemble_clustered_inference.py | 2 +- test/test_desparsified_lasso.py | 4 ++-- test/test_noise_std.py | 5 ++--- 6 files changed, 26 insertions(+), 21 deletions(-) diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 0aa5e7c7a..76a7d1041 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -37,12 +37,12 @@ from matplotlib.pyplot import get_cmap from nilearn import datasets from nilearn.image import mean_img -from sklearn.linear_model import LassoCV from nilearn.maskers import 
NiftiMasker -from sklearn.model_selection import KFold from nilearn.plotting import plot_stat_map, show from sklearn.cluster import FeatureAgglomeration from sklearn.feature_extraction import image +from sklearn.linear_model import LassoCV +from sklearn.model_selection import KFold from sklearn.preprocessing import StandardScaler from sklearn.utils import Bunch @@ -161,10 +161,17 @@ def preprocess_haxby(subject=2, memory=None): # feature aggregation methods. # try: - desparsified_lasso = DesparsifiedLasso(noise_method="median", model_y=estimator, max_iteration=1000, random_state=0, n_jobs=n_jobs) + desparsified_lasso = DesparsifiedLasso( + noise_method="median", + model_y=estimator, + max_iteration=1000, + random_state=0, + n_jobs=n_jobs, + ) desparsified_lasso.fit_importance(X, y) pval_dl = desparsified_lasso.pvalues_ one_minus_pval_dl = 1 - pval_dl +except MemoryError as err: pval_dl = None one_minus_pval_dl = None print("As expected, Desparsified Lasso uses too much memory.") @@ -181,7 +188,7 @@ def preprocess_haxby(subject=2, memory=None): model_y=estimator, tolerance=1e-2, random_state=1, - n_jobs=n_jobs + n_jobs=n_jobs, ) beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue( X.shape[0], None, ward_, cl_desparsified_lasso diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index 8dd37e185..4ea86bb73 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -1,6 +1,6 @@ from .conditional_feature_importance import CFI -from .desparsified_lasso import desparsified_lasso, DesparsifiedLasso -from .distilled_conditional_randomization_test import d0crt, D0CRT +from .desparsified_lasso import DesparsifiedLasso, desparsified_lasso +from .distilled_conditional_randomization_test import D0CRT, d0crt from .ensemble_clustered_inference import ( clustered_inference, clustered_inference_pvalue, diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index f36e015d7..6873ffcb8 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -5,23 +5,22 @@ from numpy.linalg import multi_dot from scipy import stats from scipy.linalg import inv -from sklearn.base import clone, check_is_fitted -from sklearn.linear_model import Lasso -from sklearn.utils.validation import check_memory -from sklearn.linear_model import LassoCV, MultiTaskLassoCV -from sklearn.model_selection import KFold +from sklearn.base import check_is_fitted, clone from sklearn.exceptions import NotFittedError +from sklearn.linear_model import Lasso, LassoCV, MultiTaskLassoCV +from sklearn.model_selection import KFold from sklearn.preprocessing import StandardScaler +from sklearn.utils.validation import check_memory +from hidimstat._utils.docstring import _aggregate_docstring +from hidimstat._utils.regression import _alpha_max +from hidimstat._utils.utils import check_random_state, seed_estimator from hidimstat.base_variable_importance import BaseVariableImportance from hidimstat.noise_std import reid from hidimstat.statistical_tools.p_values import ( pval_from_cb, pval_from_two_sided_pval_and_sign, ) -from hidimstat._utils.docstring import _aggregate_docstring -from hidimstat._utils.regression import _alpha_max -from hidimstat._utils.utils import check_random_state, seed_estimator class DesparsifiedLasso(BaseVariableImportance): @@ -282,7 +281,7 @@ def fit(self, X, y): precision_diagonal = np.stack(results[:, 1]) self.clf_ = [clf for clf in results[:, 2]] - # Computing the degrees of freedom adjustement + # Computing the 
degrees of freedom adjustment if self.dof_ajdustement: coefficient_max = np.max(np.abs(self.model_y.coef_)) support = np.sum(np.abs(self.model_y.coef_) > 0.01 * coefficient_max) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 496070865..322786c38 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -7,9 +7,9 @@ from sklearn.utils.validation import check_memory from tqdm import tqdm -from hidimstat.desparsified_lasso import DesparsifiedLasso from hidimstat._utils.bootstrap import _subsampling from hidimstat._utils.utils import check_random_state +from hidimstat.desparsified_lasso import DesparsifiedLasso from hidimstat.statistical_tools.aggregation import quantile_aggregation from hidimstat.statistical_tools.multiple_testing import fdr_threshold diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index d4553336d..2f630cb60 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -6,12 +6,12 @@ import pytest from numpy.testing import assert_almost_equal from scipy.linalg import toeplitz +from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import MultiTaskLassoCV from sklearn.model_selection import KFold -from sklearn.ensemble import RandomForestClassifier -from hidimstat.desparsified_lasso import DesparsifiedLasso, desparsified_lasso from hidimstat._utils.scenario import multivariate_simulation +from hidimstat.desparsified_lasso import DesparsifiedLasso, desparsified_lasso def test_desparsified_lasso(): diff --git a/test/test_noise_std.py b/test/test_noise_std.py index 2a08f942b..bc8569abf 100644 --- a/test/test_noise_std.py +++ b/test/test_noise_std.py @@ -6,13 +6,12 @@ import pytest from numpy.testing import assert_almost_equal from scipy.linalg import toeplitz +from sklearn.linear_model import LassoCV, MultiTaskLassoCV +from sklearn.model_selection import KFold from hidimstat._utils.scenario import multivariate_simulation from hidimstat.noise_std import empirical_snr, reid -from sklearn.linear_model import LassoCV, MultiTaskLassoCV -from sklearn.model_selection import KFold - def test_reid(): """Estimating noise standard deviation in two scenarios. 
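At this point in the series the class-based API is in place: DesparsifiedLasso handles the debiased
estimation and p-values, and EnsembleClusteredInference wraps clustering, subsampling, and p-value
aggregation. A minimal usage sketch of that API (illustrative only; the toy data and the n_clusters
and n_bootstraps values below are assumptions, not taken from the patches):

    import numpy as np
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.preprocessing import StandardScaler

    from hidimstat import DesparsifiedLasso, EnsembleClusteredInference

    # toy data: 100 samples, 50 features, the first 5 informative
    rng = np.random.default_rng(0)
    X = rng.standard_normal((100, 50))
    y = X[:, :5].sum(axis=1) + rng.standard_normal(100)

    # debiased Lasso: coefficient estimates plus corrected p-values
    desparsified_lasso = DesparsifiedLasso(random_state=0)
    desparsified_lasso.fit_importance(X, y)
    print(desparsified_lasso.pvalues_corr_[:10])

    # ensemble clustered inference: cluster features with Ward,
    # subsample rows over bootstraps, then aggregate the
    # per-bootstrap p-values and select features under FDR control
    encludl = EnsembleClusteredInference(
        ward=FeatureAgglomeration(n_clusters=10),
        scaler_sampling=StandardScaler(),
        n_bootstraps=5,
    ).fit(X, y)
    encludl.importance(X, y)
    selected = encludl.selection_fdr(fdr=0.1)  # boolean mask (BH procedure)

selection_fdr aggregates the bootstrap p-values with quantile_aggregation and thresholds them with
fdr_threshold, so the returned mask controls the FDR at the requested level.
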
From cc1a85aaf022de581413b66d4965fb4a2dbb6b27 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 18:32:10 +0200 Subject: [PATCH 34/93] fix example --- examples/plot_2D_simulation_example.py | 1 - examples/plot_fmri_data_example.py | 20 ++++++++------------ src/hidimstat/desparsified_lasso.py | 12 +++++++++--- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index fe77cca7b..5e423daca 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -238,7 +238,6 @@ def weight_map_2D_extended(shape, roi_size, delta): X_init, y, ward, - n_clusters, scaler_sampling=StandardScaler(), random_state=0, ) diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 76a7d1041..29a025349 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -40,6 +40,7 @@ from nilearn.maskers import NiftiMasker from nilearn.plotting import plot_stat_map, show from sklearn.cluster import FeatureAgglomeration +from sklearn.base import clone from sklearn.feature_extraction import image from sklearn.linear_model import LassoCV from sklearn.model_selection import KFold @@ -66,7 +67,7 @@ new_soft_limit = limit_5G if soft < 0 else min(limit_5G, soft) new_hard_limit = limit_5G if hard < 0 else min(limit_5G, hard) resource.setrlimit(resource.RLIMIT_AS, (new_soft_limit, new_hard_limit)) -n_jobs = 2 +n_jobs = 1 # %% @@ -148,7 +149,7 @@ def preprocess_haxby(subject=2, memory=None): fit_intercept=False, cv=KFold(n_splits=5, shuffle=True, random_state=0), tol=1e-2, - max_iter=4000, + max_iter=6000, random_state=1, n_jobs=1, ) @@ -163,8 +164,7 @@ def preprocess_haxby(subject=2, memory=None): try: desparsified_lasso = DesparsifiedLasso( noise_method="median", - model_y=estimator, - max_iteration=1000, + model_y=clone(estimator), random_state=0, n_jobs=n_jobs, ) @@ -183,10 +183,9 @@ def preprocess_haxby(subject=2, memory=None): X, y, ward, - n_clusters, scaler_sampling=StandardScaler(), - model_y=estimator, - tolerance=1e-2, + model_y=clone(estimator), + tolerance_reid=1e-2, random_state=1, n_jobs=n_jobs, ) @@ -206,14 +205,11 @@ def preprocess_haxby(subject=2, memory=None): X, y, ward, - n_clusters, groups=groups, scaler_sampling=StandardScaler(), n_bootstraps=5, - model_y=estimator, - n_jobs=2, - max_iteration=6000, - tolerance=1e-2, + model_y=clone(estimator), + tolerance_reid=1e-2, random_state=2, n_jobs=n_jobs, ) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 6873ffcb8..49e4263fe 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -145,6 +145,7 @@ def __init__( distribution="norm", epsilon_pvalue=1e-14, test="chi2", + save_model_x=False, n_jobs=1, memory=None, verbose=0, @@ -174,6 +175,7 @@ def __init__( self.epsilon_pvalue = epsilon_pvalue assert test == "chi2" or test == "F", f"Unknown test '{test}'" self.test = test + self.save_model_x = save_model_x self.n_jobs = n_jobs self.random_state = random_state self.memory = memory @@ -185,6 +187,7 @@ def __init__( self.pvalues_corr_ = None self.precision_diagonal_ = None self.clf_ = None + self.n_samples_ = None def fit(self, X, y): """ @@ -272,6 +275,7 @@ def fit(self, X, y): ), random_state=rng_spwan, ), + return_clf=self.save_model_x, ) for i, rng_spwan in enumerate(rng.spawn(n_features)) ) @@ -452,7 +456,7 @@ def fit_importance(self, X, y): return self.importance(X, y) -def 
_compute_residuals(X, id_column, clf): +def _compute_residuals(X, id_column, clf, return_clf): """ Compute nodewise Lasso regression for desparsified Lasso estimation @@ -503,13 +507,15 @@ def _compute_residuals(X, id_column, clf): # which is used as an estimation of the noise covariance. precision_diagonal_i = n_samples * np.sum(z**2) / np.dot(X_i, z) ** 2 - return z, precision_diagonal_i, clf + if return_clf: + return z, precision_diagonal_i, clf + else: + return z, precision_diagonal_i, None def desparsified_lasso( X, y, - cv=None, model_y=LassoCV( eps=1e-2, fit_intercept=False, From 3666fdf90dd65f6ff372104c547b3a16c0b6dce4 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 18:57:06 +0200 Subject: [PATCH 35/93] update docstring --- src/hidimstat/desparsified_lasso.py | 155 ++++++++++++++-------------- 1 file changed, 80 insertions(+), 75 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 49e4263fe..071bb702e 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -25,36 +25,36 @@ class DesparsifiedLasso(BaseVariableImportance): """ - Desparsified Lasso + Desparsified Lasso Estimator + Statistical inference in high-dimensional regression using the desparsified Lasso. + Provides debiased coefficient estimates, confidence intervals and p-values. Algorithm based on Algorithm 1 of d-Lasso and d-MTLasso in :footcite:t:`chevalier2020statisticalthesis`. Parameters ---------- model_y : LassoCV or MultiTaskLassoCV instance, default=LassoCV() - CV object used for initial Lasso fit. + Estimator used for initial Lasso fit. Must implement fit and predict. + It should be Lasso or MultiTaskLasso. - model_x : Lasso instance, default=Lasso() + model_x : sklearn estimator, default=Lasso() Base Lasso estimator used for nodewise regressions. centered : bool, default=True - Whether to center X and y. + Whether to center X and y before fitting. dof_ajdustement : bool, default=False If True, applies degrees of freedom adjustment from :footcite:t:`bellec2022biasing`. alpha_max_fraction : float, default=0.01 - Fraction of max alpha used for nodewise Lasso regularization. + Fraction of maximum alpha for nodewise Lasso regularization. tolerance_reid : float, default=1e-4 - Tolerance for Reid variance estimation method. - - random_state : int, RandomState instance or None, default=None - Controls randomization in CV splitter and Lasso fits. + Tolerance for Reid variance estimation. covariance : ndarray of shape (n_times, n_times) or None, default=None - Temporal noise covariance matrix. If None, estimated from data. + Pre-specified temporal noise covariance matrix. If None, estimated from data. noise_method : {'AR', 'median'}, default='AR' Method to estimate noise covariance: @@ -65,13 +65,13 @@ class DesparsifiedLasso(BaseVariableImportance): Order of AR model when noise_method='AR'. Must be < n_times. stationary : bool, default=True - Whether to assume stationary noise in estimation. + Whether to assume stationary noise. confidence : float, default=0.95 - Confidence level for intervals, must be in [0, 1]. + Confidence level for intervals, between 0 and 1. distribution : str, default='norm' - Distribution for p-value calculation. Only 'norm' supported. + Distribution for p-values. Only 'norm' supported. epsilon_pvalue : float, default=1e-14 Small value to avoid numerical issues in p-values. 
@@ -81,24 +81,30 @@ class DesparsifiedLasso(BaseVariableImportance): - 'chi2': Chi-squared test (large samples) - 'F': F-test (small samples) + save_model_x : bool, default=False + Whether to save nodewise regression models. + + random_state : int, RandomState or None, default=None + Controls random number generation. + n_jobs : int, default=1 - Number of parallel jobs. -1 means all CPUs. + Number of parallel jobs. - memory : str or Memory object, default=None - Used to cache nodewise Lasso computations. + memory : str or Memory, default=None + Cache for computations. verbose : int, default=0 Verbosity level. Attributes ---------- - importances_ : ndarray of shape (n_features,) or (n_features, n_times) - Desparsified Lasso coefficient estimates. + importances_ : ndarray + Desparsified coefficient estimates. - pvalues_ : ndarray of shape (n_features,) + pvalues_ : ndarray Two-sided p-values. - pvalues_corr_ : ndarray of shape (n_features,) + pvalues_corr_ : ndarray Multiple testing corrected p-values. sigma_hat_ : float or ndarray of shape (n_times, n_times) @@ -112,7 +118,6 @@ class DesparsifiedLasso(BaseVariableImportance): Notes ----- - X and y are always centered. Consider pre-scaling X if not already scaled. Chi-squared test assumes asymptotic normality, F-test preferred for small samples. References @@ -136,7 +141,6 @@ def __init__( dof_ajdustement=False, alpha_max_fraction=0.01, tolerance_reid=1e-4, - random_state=None, covariance=None, noise_method="AR", order=1, @@ -146,6 +150,7 @@ def __init__( epsilon_pvalue=1e-14, test="chi2", save_model_x=False, + random_state=None, n_jobs=1, memory=None, verbose=0, @@ -193,31 +198,34 @@ def fit(self, X, y): """ Fit the Desparsified Lasso model. - This method fits the Desparsified Lasso model, which provides debiased estimates - and statistical inference for high-dimensional linear models through a two-step - procedure involving initial Lasso estimation followed by bias correction. + This method fits the Desparsified Lasso model to provide debiased coefficient estimates + and statistical inference for high-dimensional regression. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data matrix. y : array-like of shape (n_samples,) or (n_samples, n_times) - Target values. For single task, y should be 1D or (n_samples, 1). - For multi-task, y should be (n_samples, n_times). + Target values. For single task, y should be 1D. + For multi-task, y should be 2D with shape (n_samples, n_times). Returns ------- self : object - Returns the fitted instance. + Returns the instance with fitted attributes: + - importances_ : Desparsified coefficient estimates + - sigma_hat_ : Estimated noise level + - precision_diagonal_ : Diagonal of precision matrix + - clf_ : Fitted nodewise regression models (if save_model_x=True) Notes ----- - Main steps: - 1. Optional data centering - 2. Initial Lasso fit using cross-validation - 3. Computation of residuals - 4. Estimation of noise standard deviation - 5. Preparation for subsequent importance score calculation + The fitting process: + 1. Centers X and y if self.centered=True + 2. Fits initial Lasso using cross-validation + 3. Estimates noise variance using Reid method + 4. Computes nodewise Lasso regressions in parallel + 5. Calculates debiased coefficients and precision matrix """ memory = check_memory(self.memory) rng = check_random_state(self.random_state) @@ -322,7 +330,12 @@ def _check_fit(self): ValueError If model hasn't been fit or required attributes are missing. 
""" - if self.sigma_hat_ is None: + if ( + self.clf_ is None + or self.importances_ is None + or self.precision_diagonal_ is None + or self.sigma_hat_ is None + ): raise ValueError( "The Desparsified Lasso requires to be fit before any analysis" ) @@ -335,10 +348,12 @@ def _check_fit(self): def importance(self, X, y): """ - Compute desparsified lasso estimates and confidence intervals. + Compute desparsified lasso estimates, confidence intervals and p-values. - Calculates debiased coefficients, confidence intervals and p-values - using the desparsified lasso method. + Uses fitted model to calculate debiased coefficients along with confidence + intervals and p-values. For single task regression, provides confidence + intervals based on Gaussian approximation. For multi-task case, + computes chi-squared or F test p-values. Parameters ---------- @@ -346,32 +361,24 @@ def importance(self, X, y): Input data matrix. y : array-like of shape (n_samples,) or (n_samples, n_times) Target values. For single task, y should be 1D or (n_samples, 1). - For multi-task, y should be (n_samples, n_times). + For multi-task, y should be 2D with shape (n_samples, n_times). Returns ------- importances_ : ndarray of shape (n_features,) or (n_features, n_times) Desparsified lasso coefficient estimates. - Attributes - ---------- - importances_ : same as return value - pvalues_ : ndarray of shape (n_features,) - Two-sided p-values for each feature. - pvalues_corr_ : ndarray of shape (n_features,) - Multiple testing corrected p-values. - confidence_bound_min_ : ndarray of shape (n_features,) - Lower confidence bounds (only for single task). - confidence_bound_max_ : ndarray of shape (n_features,) - Upper confidence bounds (only for single task). - Notes ----- - The method: - 1. Performs nodewise lasso regressions to estimate precision matrix - 2. Debiases initial lasso estimates - 3. Computes confidence intervals and p-values - 4. For multi-task case, uses chi-squared or F test + Updates several instance attributes: + - importances_: Desparsified coefficient estimates + - pvalues_: Two-sided p-values + - pvalues_corr_: Multiple testing corrected p-values + - confidence_bound_min_: Lower confidence bounds (single task only) + - confidence_bound_max_: Upper confidence bounds (single task only) + + For multi-task case, p-values are based on chi-squared or F tests, + configured by the test parameter ('chi2' or 'F'). """ self._check_fit() beta_hat = self.importances_ @@ -458,35 +465,31 @@ def fit_importance(self, X, y): def _compute_residuals(X, id_column, clf, return_clf): """ - Compute nodewise Lasso regression for desparsified Lasso estimation + Compute nodewise Lasso regression for desparsified Lasso estimation. For feature i, regresses X[:,i] against all other features to obtain residuals and precision matrix diagonal entry needed for debiasing. Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Centered input data matrix + X : ndarray of shape (n_samples, n_features) + Input data matrix. id_column : int - Index i of feature to regress - alpha : float - Lasso regularization parameter - gram : ndarray, shape (n_features, n_features) - Precomputed X.T @ X matrix - max_iteration : int, default=5000 - Maximum Lasso iterations - tolerance : float, default=1e-3 - Optimization tolerance - random_state : Generator, default=None - Random state for reproducibility + Index i of feature to regress. + clf : sklearn estimator + Pre-configured estimator. 
+ return_clf : bool + Whether to return fitted sklearn estimator model. Returns ------- - z : ndarray, shape (n_samples,) - Residuals from regression + z : ndarray of shape (n_samples,) + Residuals from regression. precision_diagonal_i : float Diagonal entry i of precision matrix estimate, - computed as n * ||z||^2 / ^2 + computed as n * ||z||^2 / ^2. + clf : sklearn estimator or None + Fitted Lasso model if return_clf=True, else None. Notes ----- @@ -529,7 +532,6 @@ def desparsified_lasso( dof_ajdustement=False, alpha_max_fraction=0.01, tolerance_reid=1e-4, - random_state=None, covariance=None, noise_method="AR", order=1, @@ -538,6 +540,8 @@ def desparsified_lasso( distribution="norm", epsilon_pvalue=1e-14, test="chi2", + save_model_x=False, + random_state=None, n_jobs=1, memory=None, verbose=0, @@ -553,7 +557,6 @@ def desparsified_lasso( dof_ajdustement=dof_ajdustement, alpha_max_fraction=alpha_max_fraction, tolerance_reid=tolerance_reid, - random_state=random_state, covariance=covariance, noise_method=noise_method, order=order, @@ -562,6 +565,8 @@ def desparsified_lasso( distribution=distribution, epsilon_pvalue=epsilon_pvalue, test=test, + save_model_x=save_model_x, + random_state=random_state, n_jobs=n_jobs, memory=memory, verbose=verbose, From 7034a3bf88398443b2a934b7a512620b4338f7a1 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 19:07:39 +0200 Subject: [PATCH 36/93] add warning --- examples/plot_fmri_data_example.py | 2 +- src/hidimstat/desparsified_lasso.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 29a025349..9bd81059d 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -39,8 +39,8 @@ from nilearn.image import mean_img from nilearn.maskers import NiftiMasker from nilearn.plotting import plot_stat_map, show -from sklearn.cluster import FeatureAgglomeration from sklearn.base import clone +from sklearn.cluster import FeatureAgglomeration from sklearn.feature_extraction import image from sklearn.linear_model import LassoCV from sklearn.model_selection import KFold diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 071bb702e..a686a42ac 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -346,7 +346,7 @@ def _check_fit(self): "The Desparsified Lasso requires to be fit before any analysis" ) - def importance(self, X, y): + def importance(self, X=None, y=None): """ Compute desparsified lasso estimates, confidence intervals and p-values. @@ -380,6 +380,10 @@ def importance(self, X, y): For multi-task case, p-values are based on chi-squared or F tests, configured by the test parameter ('chi2' or 'F'). 
""" + if X is not None: + warnings.warn("X won't be used.") + if y is not None: + warnings.warn("y won't be used.") self._check_fit() beta_hat = self.importances_ From 283e7608f9d5ed40ce4c936d519540adeb175d77 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 19:15:54 +0200 Subject: [PATCH 37/93] add warning --- src/hidimstat/desparsified_lasso.py | 2 +- test/test_desparsified_lasso.py | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index a686a42ac..ff3fdd419 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -38,7 +38,7 @@ class DesparsifiedLasso(BaseVariableImportance): Estimator used for initial Lasso fit. Must implement fit and predict. It should be Lasso or MultiTaskLasso. - model_x : sklearn estimator, default=Lasso() + model_x : LassoCV or MultiTaskLassoCV instance, default=Lasso() Base Lasso estimator used for nodewise regressions. centered : bool, default=True diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 2f630cb60..9ae072991 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -43,7 +43,7 @@ def test_desparsified_lasso(): ) desparsified_lasso = DesparsifiedLasso(confidence=confidence).fit(X, y) - _ = desparsified_lasso.importance(X, y) + _ = desparsified_lasso.importance() # Check that beta is within the confidence intervals correct_interval = np.sum( (beta >= desparsified_lasso.confidence_bound_min_) @@ -62,7 +62,7 @@ def test_desparsified_lasso(): desparsified_lasso = DesparsifiedLasso( dof_ajdustement=True, confidence=confidence ).fit(X, y) - _ = desparsified_lasso.importance(X, y) + _ = desparsified_lasso.importance() # Check that beta is within the confidence intervals correct_interval = np.sum( @@ -118,7 +118,7 @@ def test_desparsified_group_lasso(): desparsified_lasso = DesparsifiedLasso( model_y=multitasklassoCV, covariance=corr ).fit(X, y) - importances = desparsified_lasso.importance(X, y) + importances = desparsified_lasso.importance() assert_almost_equal(importances, beta, decimal=1) @@ -132,7 +132,7 @@ def test_desparsified_group_lasso(): assert tp / np.sum(important) >= 0.8 desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV, test="F").fit(X, y) - importances = desparsified_lasso.importance(X, y) + importances = desparsified_lasso.importance() assert_almost_equal(importances, beta, decimal=1) tp = np.sum(desparsified_lasso.pvalues_corr_[important] < alpha) @@ -149,7 +149,7 @@ def test_desparsified_group_lasso(): model_y=multitasklassoCV, covariance=bad_cov ).fit(X, y) with pytest.raises(ValueError): - desparsified_lasso.importance(X, y) + desparsified_lasso.importance() with pytest.raises(AssertionError, match="Unknown test 'r2'"): DesparsifiedLasso(model_y=multitasklassoCV, covariance=bad_cov, test="r2").fit( @@ -200,18 +200,24 @@ def test_exception(): ValueError, match="The Desparsified Lasso requires to be fit before any analysis", ): - desparsified_lasso.importance(X, y) + desparsified_lasso.importance() desparsified_lasso.sigma_hat_ = [] with pytest.raises( ValueError, match="The Desparsified Lasso requires to be fit before any analysis", ): - desparsified_lasso.importance(X, y) + desparsified_lasso.importance() desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV).fit(X, y) with pytest.raises(ValueError, match="Unknown test 'r2'"): desparsified_lasso.test = "r2" - 
desparsified_lasso.importance(X, y) + desparsified_lasso.importance() + + desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV).fit(X, y) + with pytest.warns(Warning, match="X won't be used."): + desparsified_lasso.importance(X=X) + with pytest.warns(Warning, match="y won't be used."): + desparsified_lasso.importance(y=y) def test_function_not_center(): From 7ab50a73829b60304899191f0b018f11c258ebde Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 20:03:07 +0200 Subject: [PATCH 38/93] fix docstring --- src/hidimstat/desparsified_lasso.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index ff3fdd419..31ce9a8f3 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -213,10 +213,10 @@ def fit(self, X, y): ------- self : object Returns the instance with fitted attributes: - - importances_ : Desparsified coefficient estimates - - sigma_hat_ : Estimated noise level - - precision_diagonal_ : Diagonal of precision matrix - - clf_ : Fitted nodewise regression models (if save_model_x=True) + - 'importances_' : Desparsified coefficient estimates + - 'sigma_hat_' : Estimated noise level + - 'precision_diagonal_' : Diagonal of precision matrix + - 'clf_' : Fitted nodewise regression models (if save_model_x=True) Notes ----- @@ -371,11 +371,11 @@ def importance(self, X=None, y=None): Notes ----- Updates several instance attributes: - - importances_: Desparsified coefficient estimates - - pvalues_: Two-sided p-values - - pvalues_corr_: Multiple testing corrected p-values - - confidence_bound_min_: Lower confidence bounds (single task only) - - confidence_bound_max_: Upper confidence bounds (single task only) + - 'importances_': Desparsified coefficient estimates + - 'pvalues_': Two-sided p-values + - 'pvalues_corr_': Multiple testing corrected p-values + - 'confidence_bound_min_': Lower confidence bounds (single task only) + - 'confidence_bound_max_': Upper confidence bounds (single task only) For multi-task case, p-values are based on chi-squared or F tests, configured by the test parameter ('chi2' or 'F'). 
From a008b8926236245f970bc8e90ada6cc3e81356e4 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 14 Oct 2025 09:59:26 +0200 Subject: [PATCH 39/93] fix docstring --- src/hidimstat/desparsified_lasso.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 31ce9a8f3..8b418c207 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -213,10 +213,10 @@ def fit(self, X, y): ------- self : object Returns the instance with fitted attributes: - - 'importances_' : Desparsified coefficient estimates - - 'sigma_hat_' : Estimated noise level - - 'precision_diagonal_' : Diagonal of precision matrix - - 'clf_' : Fitted nodewise regression models (if save_model_x=True) + - `importances_` : Desparsified coefficient estimates + - `sigma_hat_` : Estimated noise level + - `precision_diagonal_` : Diagonal of precision matrix + - `clf_` : Fitted nodewise regression models (if save_model_x=True) Notes ----- @@ -371,11 +371,11 @@ def importance(self, X=None, y=None): Notes ----- Updates several instance attributes: - - 'importances_': Desparsified coefficient estimates - - 'pvalues_': Two-sided p-values - - 'pvalues_corr_': Multiple testing corrected p-values - - 'confidence_bound_min_': Lower confidence bounds (single task only) - - 'confidence_bound_max_': Upper confidence bounds (single task only) + - `importances_`: Desparsified coefficient estimates + - `pvalues_`: Two-sided p-values + - `pvalues_corr_`: Multiple testing corrected p-values + - `confidence_bound_min_`: Lower confidence bounds (single task only) + - `confidence_bound_max_`: Upper confidence bounds (single task only) For multi-task case, p-values are based on chi-squared or F tests, configured by the test parameter ('chi2' or 'F'). From 6c471873fbe40dbccd223ce02161547761505ba2 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 14 Oct 2025 17:37:33 +0200 Subject: [PATCH 40/93] add options alphas --- src/hidimstat/desparsified_lasso.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 8b418c207..377b48c36 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -45,16 +45,19 @@ class DesparsifiedLasso(BaseVariableImportance): Whether to center X and y before fitting. dof_ajdustement : bool, default=False - If True, applies degrees of freedom adjustment from :footcite:t:`bellec2022biasing`. + If True, applies degrees of freedom adjustment. + + alphas : array-like or None, default=None + Regularization parameters for each variable. If None, computed from alpha_max_fraction. alpha_max_fraction : float, default=0.01 - Fraction of maximum alpha for nodewise Lasso regularization. + Fraction of maximum alpha for nodewise Lasso regularization. Used only if alphas=None. tolerance_reid : float, default=1e-4 Tolerance for Reid variance estimation. covariance : ndarray of shape (n_times, n_times) or None, default=None - Pre-specified temporal noise covariance matrix. If None, estimated from data. + Pre-specified noise covariance matrix across tasks. If None, estimated from data. 
     noise_method : {'AR', 'median'}, default='AR'
         Method to estimate noise covariance:
@@ -139,6 +142,7 @@ def __init__(
         model_x=Lasso(max_iter=5000, tol=1e-3),
         centered=True,
         dof_ajdustement=False,
+        alphas=None,
         alpha_max_fraction=0.01,
         tolerance_reid=1e-4,
         covariance=None,
@@ -169,6 +173,7 @@ def __init__(
         self.model_y = model_y
         self.centered = centered
         self.dof_ajdustement = dof_ajdustement
+        self.alphas = alphas
         self.alpha_max_fraction = alpha_max_fraction
         self.tolerance_reid = tolerance_reid
         self.covariance = covariance
@@ -240,6 +245,7 @@ def fit(self, X, y):
         X_ = X
         y_ = y
         self.n_samples_, n_features = X_.shape
+        assert self.alphas is None or len(self.alphas) == n_features
 
         try:
             check_is_fitted(self.model_y)
@@ -267,8 +273,11 @@ def fit(self, X, y):
         )
 
         # define the alphas for the Nodewise Lasso
-        list_alpha_max = _alpha_max(X_, X_, fill_diagonal=True, axis=0)
-        alphas = self.alpha_max_fraction * list_alpha_max
+        if self.alphas is None:
+            list_alpha_max = _alpha_max(X_, X_, fill_diagonal=True, axis=0)
+            alphas = self.alpha_max_fraction * list_alpha_max
+        else:
+            alphas = self.alphas
         gram = np.dot(X_.T, X_)  # Gram matrix
 
         # Calculating precision matrix (Nodewise Lasso)

From d58cad51be13a605419b302b7e4c887432605409 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Wed, 15 Oct 2025 15:26:03 +0200
Subject: [PATCH 41/93] update comments

---
 src/hidimstat/desparsified_lasso.py | 140 +++++++++++++++-------------
 1 file changed, 75 insertions(+), 65 deletions(-)

diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py
index 377b48c36..fc64bc140 100644
--- a/src/hidimstat/desparsified_lasso.py
+++ b/src/hidimstat/desparsified_lasso.py
@@ -7,7 +7,7 @@
 from scipy.linalg import inv
 from sklearn.base import check_is_fitted, clone
 from sklearn.exceptions import NotFittedError
-from sklearn.linear_model import Lasso, LassoCV, MultiTaskLassoCV
+from sklearn.linear_model import Lasso, LassoCV, MultiTaskLasso, MultiTaskLassoCV
 from sklearn.model_selection import KFold
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import check_memory
@@ -25,7 +25,7 @@
 
 class DesparsifiedLasso(BaseVariableImportance):
     """
-    Desparsified Lasso Estimator
+    Desparsified Lasso Estimator (also known as Debiased Lasso)
 
     Statistical inference in high-dimensional regression using the desparsified Lasso.
     Provides debiased coefficient estimates, confidence intervals and p-values.
     Algorithm based on Algorithm 1 of d-Lasso and d-MTLasso in
     :footcite:t:`chevalier2020statisticalthesis`.
 
     Parameters
     ----------
     model_y : LassoCV or MultiTaskLassoCV instance, default=LassoCV()
-        Estimator used for initial Lasso fit. Must implement fit and predict.
-        It should be Lasso or MultiTaskLasso.
+        Initial model for selecting relevant features. Must implement fit and predict.
+        For single task use LassoCV, for multi-task use MultiTaskLassoCV.
 
     centered : bool, default=True
         Whether to center X and y before fitting.
 
     dof_ajdustement : bool, default=False
-        If True, applies degrees of freedom adjustment.
+        Whether to apply degrees of freedom adjustment for small samples.
+
+    model_x : Lasso or MultiTaskLasso instance, default=Lasso()
+        Base model for nodewise regressions.
 
     alphas : array-like or None, default=None
-        Regularization parameters for each variable. If None, computed from alpha_max_fraction.
+        Regularization strengths for nodewise regressions. If None, computed from alpha_max_fraction.
alpha_max_fraction : float, default=0.01 - Fraction of maximum alpha for nodewise Lasso regularization. Used only if alphas=None. + Fraction of maximum alpha to use when alphas=None. - tolerance_reid : float, default=1e-4 - Tolerance for Reid variance estimation. + random_state : int or RandomState, default=None + Controls randomization. - covariance : ndarray of shape (n_times, n_times) or None, default=None - Pre-specified noise covariance matrix across tasks. If None, estimated from data. + save_model_x : bool, default=False + Whether to save fitted nodewise regression models. + + tolerance_reid : float, default=1e-4 + Convergence tolerance for noise estimation. noise_method : {'AR', 'median'}, default='AR' - Method to estimate noise covariance: + Method for noise covariance estimation: - 'AR': Autoregressive model - - 'median': Median correlation between consecutive timepoints + - 'median': Median correlation order : int, default=1 - Order of AR model when noise_method='AR'. Must be < n_times. + Order of AR model if noise_method='AR'. stationary : bool, default=True Whether to assume stationary noise. confidence : float, default=0.95 - Confidence level for intervals, between 0 and 1. + Confidence level for intervals. distribution : str, default='norm' - Distribution for p-values. Only 'norm' supported. + Distribution for p-values, only 'norm' supported. epsilon_pvalue : float, default=1e-14 - Small value to avoid numerical issues in p-values. + Small constant to avoid numerical issues. test : {'chi2', 'F'}, default='chi2' - Test for p-values: + Test statistic for p-values: - 'chi2': Chi-squared test (large samples) - 'F': F-test (small samples) - save_model_x : bool, default=False - Whether to save nodewise regression models. - - random_state : int, RandomState or None, default=None - Controls random number generation. + covariance : ndarray or None, default=None + Pre-specified noise covariance matrix. n_jobs : int, default=1 Number of parallel jobs. memory : str or Memory, default=None - Cache for computations. + Cache for intermediate results. verbose : int, default=0 Verbosity level. @@ -102,7 +102,7 @@ class DesparsifiedLasso(BaseVariableImportance): Attributes ---------- importances_ : ndarray - Desparsified coefficient estimates. + Debiased coefficient estimates. pvalues_ : ndarray Two-sided p-values. @@ -110,22 +110,17 @@ class DesparsifiedLasso(BaseVariableImportance): pvalues_corr_ : ndarray Multiple testing corrected p-values. - sigma_hat_ : float or ndarray of shape (n_times, n_times) - Estimated noise level or precision matrix. + sigma_hat_ : float or ndarray + Estimated noise level. + + precision_diagonal_ : ndarray + Diagonal entries of precision matrix. - confidence_bound_min_ : ndarray of shape (n_features,) + confidence_bound_min_ : ndarray Lower confidence bounds. - confidence_bound_max_ : ndarray of shape (n_features,) + confidence_bound_max_ : ndarray Upper confidence bounds. - - Notes - ----- - Chi-squared test assumes asymptotic normality, F-test preferred for small samples. - - References - ---------- - .. 
footbibliography:: """ def __init__( @@ -139,31 +134,31 @@ def __init__( random_state=1, n_jobs=1, ), - model_x=Lasso(max_iter=5000, tol=1e-3), centered=True, dof_ajdustement=False, + # parameters for model_x + model_x=Lasso(max_iter=5000, tol=1e-3), alphas=None, alpha_max_fraction=0.01, + random_state=None, + save_model_x=False, + # parameters for reid tolerance_reid=1e-4, - covariance=None, noise_method="AR", order=1, stationary=True, + # parameters for tests confidence=0.95, distribution="norm", epsilon_pvalue=1e-14, test="chi2", - save_model_x=False, - random_state=None, + covariance=None, + # parameters for optimization n_jobs=1, memory=None, verbose=0, ): super().__init__() - assert issubclass( - Lasso, model_x.__class__ - ), "lasso needs to be a Lasso or a MultiTaskLasso" - self.model_x = model_x if issubclass(LassoCV, model_y.__class__): self.n_times_ = 1 elif issubclass(MultiTaskLassoCV, model_y.__class__): @@ -173,31 +168,39 @@ def __init__( self.model_y = model_y self.centered = centered self.dof_ajdustement = dof_ajdustement + # model x + assert issubclass(Lasso, model_x.__class__) or issubclass( + MultiTaskLasso, model_x.__class__ + ), "lasso needs to be a Lasso or a MultiTaskLasso" + self.model_x = model_x self.alphas = alphas self.alpha_max_fraction = alpha_max_fraction + self.save_model_x = save_model_x + self.random_state = random_state + # parameter for reid self.tolerance_reid = tolerance_reid - self.covariance = covariance self.noise_method = noise_method self.order = order self.stationary = stationary + # parameter for test self.confidence = confidence self.distribution = distribution self.epsilon_pvalue = epsilon_pvalue + self.covariance = covariance assert test == "chi2" or test == "F", f"Unknown test '{test}'" self.test = test - self.save_model_x = save_model_x + # parameters for optimization self.n_jobs = n_jobs - self.random_state = random_state self.memory = memory self.verbose = verbose + self.n_samples_ = None + self.clf_ = None self.sigma_hat_ = None + self.precision_diagonal_ = None self.confidence_bound_min_ = None self.confidence_bound_max_ = None self.pvalues_corr_ = None - self.precision_diagonal_ = None - self.clf_ = None - self.n_samples_ = None def fit(self, X, y): """ @@ -409,7 +412,7 @@ def importance(self, X=None, y=None): self.confidence_bound_max_ = beta_hat + confint_radius self.confidence_bound_min_ = beta_hat - confint_radius - pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb( + pval, pval_corr, _, _ = pval_from_cb( self.confidence_bound_min_, self.confidence_bound_max_, confidence=self.confidence, @@ -445,8 +448,8 @@ def importance(self, X=None, y=None): # Compute the p-values sign_beta = np.sign(np.sum(beta_hat, axis=1)) - pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( - pval_from_two_sided_pval_and_sign(two_sided_pval, sign_beta) + pval, pval_corr, _, _ = pval_from_two_sided_pval_and_sign( + two_sided_pval, sign_beta, eps=self.epsilon_pvalue ) self.pvalues_ = pval @@ -538,24 +541,30 @@ def desparsified_lasso( max_iter=5000, random_state=0, ), - model_x=Lasso(max_iter=5000, tol=1e-3), centered=True, dof_ajdustement=False, + # parameter for model_x + model_x=Lasso(max_iter=5000, tol=1e-3), + alphas=None, alpha_max_fraction=0.01, + save_model_x=False, + random_state=None, + # parameter for reid tolerance_reid=1e-4, - covariance=None, noise_method="AR", order=1, stationary=True, + # paramter for tests confidence=0.95, distribution="norm", epsilon_pvalue=1e-14, test="chi2", - save_model_x=False, - random_state=None, + 
covariance=None, + # parameter for optimization n_jobs=1, memory=None, verbose=0, + # parameter for selections k_lowest=None, percentile=None, threshold_min=None, @@ -563,12 +572,14 @@ def desparsified_lasso( ): methods = DesparsifiedLasso( model_y=model_y, - model_x=model_x, centered=centered, dof_ajdustement=dof_ajdustement, + model_x=model_x, + alphas=alphas, alpha_max_fraction=alpha_max_fraction, + save_model_x=save_model_x, + random_state=random_state, tolerance_reid=tolerance_reid, - covariance=covariance, noise_method=noise_method, order=order, stationary=stationary, @@ -576,8 +587,7 @@ def desparsified_lasso( distribution=distribution, epsilon_pvalue=epsilon_pvalue, test=test, - save_model_x=save_model_x, - random_state=random_state, + covariance=covariance, n_jobs=n_jobs, memory=memory, verbose=verbose, From 3a53abd49680a274c47d01c21b2b68cc1f516974 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Oct 2025 16:20:07 +0200 Subject: [PATCH 42/93] remove shuffle in cv --- src/hidimstat/desparsified_lasso.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index fc64bc140..2b9e07ae3 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -128,7 +128,7 @@ def __init__( model_y=LassoCV( eps=1e-2, fit_intercept=False, - cv=KFold(n_splits=5, shuffle=True, random_state=0), + cv=KFold(n_splits=5), tol=1e-4, max_iter=5000, random_state=1, @@ -283,7 +283,7 @@ def fit(self, X, y): # Calculating precision matrix (Nodewise Lasso) results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( - delayed(_compute_residuals)( + delayed(_joblib_compute_residuals)( X=X_, id_column=i, clf=seed_estimator( @@ -477,7 +477,7 @@ def fit_importance(self, X, y): return self.importance(X, y) -def _compute_residuals(X, id_column, clf, return_clf): +def _joblib_compute_residuals(X, id_column, clf, return_clf): """ Compute nodewise Lasso regression for desparsified Lasso estimation. 
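
For intuition about what each parallel job computes: one nodewise regression, fitting column j of X against the remaining columns and keeping the residual. A rough standalone sketch with a hypothetical `nodewise_residual` helper (a simplification under the same idea, not the library's exact code):

    import numpy as np
    from sklearn.linear_model import Lasso

    def nodewise_residual(X, j, alpha=0.1):
        # regress the j-th column of X on all the other columns
        X_minus_j = np.delete(X, j, axis=1)
        clf = Lasso(alpha=alpha, max_iter=5000, tol=1e-3).fit(X_minus_j, X[:, j])
        # the residual z_j later debiases the j-th coefficient through
        # the scalar products <z_j, X_j> and <z_j, y>
        return X[:, j] - clf.predict(X_minus_j)
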
From 7d0c2caca6c41e867287340c91903eab591ba317 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Oct 2025 17:10:16 +0200 Subject: [PATCH 43/93] Fix comment --- src/hidimstat/desparsified_lasso.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 2b9e07ae3..7aebe80db 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -177,12 +177,12 @@ def __init__( self.alpha_max_fraction = alpha_max_fraction self.save_model_x = save_model_x self.random_state = random_state - # parameter for reid + # parameters for reid self.tolerance_reid = tolerance_reid self.noise_method = noise_method self.order = order self.stationary = stationary - # parameter for test + # parameters for test self.confidence = confidence self.distribution = distribution self.epsilon_pvalue = epsilon_pvalue @@ -543,18 +543,18 @@ def desparsified_lasso( ), centered=True, dof_ajdustement=False, - # parameter for model_x + # parameters for model_x model_x=Lasso(max_iter=5000, tol=1e-3), alphas=None, alpha_max_fraction=0.01, save_model_x=False, random_state=None, - # parameter for reid + # parameters for reid tolerance_reid=1e-4, noise_method="AR", order=1, stationary=True, - # paramter for tests + # parameters for tests confidence=0.95, distribution="norm", epsilon_pvalue=1e-14, From 33ef1cc5cff60ebf6f0549bcd01a3cc97da5b956 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Oct 2025 17:10:36 +0200 Subject: [PATCH 44/93] change cv in EnClDL --- src/hidimstat/ensemble_clustered_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 322786c38..3fb6b312c 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -255,7 +255,7 @@ def clustered_inference( kwargs["model_y"] = MultiTaskLassoCV( eps=1e-2, fit_intercept=False, - cv=KFold(n_splits=5, shuffle=True, random_state=0), + cv=KFold(n_splits=5), tol=1e-4, max_iter=5000, random_state=1, @@ -265,7 +265,7 @@ def clustered_inference( kwargs["model_y"] = LassoCV( eps=1e-2, fit_intercept=False, - cv=KFold(n_splits=5, shuffle=True, random_state=0), + cv=KFold(n_splits=5), tol=1e-4, max_iter=5000, random_state=1, From 365b56e5283725de49d3abe9a2f19e54646030cd Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Oct 2025 17:52:54 +0200 Subject: [PATCH 45/93] improve coverage --- src/hidimstat/desparsified_lasso.py | 6 ------ test/test_desparsified_lasso.py | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 7aebe80db..a79d69fda 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -349,12 +349,6 @@ def _check_fit(self): raise ValueError( "The Desparsified Lasso requires to be fit before any analysis" ) - try: - check_is_fitted(self.model_y) - except NotFittedError: - raise ValueError( - "The Desparsified Lasso requires to be fit before any analysis" - ) def importance(self, X=None, y=None): """ diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 9ae072991..3d942e045 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -101,7 +101,7 @@ def test_desparsified_group_lasso(): fit_intercept=False, cv=KFold(n_splits=5, shuffle=True, 
random_state=0), tol=1e-4, - max_iter=5000, + max_iter=50, random_state=1, n_jobs=1, ) @@ -115,9 +115,10 @@ def test_desparsified_group_lasso(): signal_noise_ratio=signal_noise_ratio, ) - desparsified_lasso = DesparsifiedLasso( - model_y=multitasklassoCV, covariance=corr - ).fit(X, y) + with pytest.warns(Warning, match="'max_iter' has been increased to "): + desparsified_lasso = DesparsifiedLasso( + model_y=multitasklassoCV, covariance=corr, save_model_x=True + ).fit(X, y) importances = desparsified_lasso.importance() assert_almost_equal(importances, beta, decimal=1) @@ -130,6 +131,10 @@ def test_desparsified_group_lasso(): fp = np.sum(desparsified_lasso.pvalues_corr_[non_important] < alpha) assert fp / np.sum(non_important) <= alpha assert tp / np.sum(important) >= 0.8 + assert ( + desparsified_lasso.clf_ is not None + and len(desparsified_lasso.clf_) == n_features + ) desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV, test="F").fit(X, y) importances = desparsified_lasso.importance() @@ -139,6 +144,7 @@ def test_desparsified_group_lasso(): fp = np.sum(desparsified_lasso.pvalues_corr_[non_important] < alpha) assert fp / np.sum(non_important) <= alpha assert tp / np.sum(important) >= 0.8 + assert desparsified_lasso # Testing error is raised when the covariance matrix has wrong shape bad_cov = np.delete(corr, 0, axis=1) @@ -196,12 +202,6 @@ def test_exception(): with pytest.raises(AssertionError, match="Unknown test 'r2'"): DesparsifiedLasso(test="r2") desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV) - with pytest.raises( - ValueError, - match="The Desparsified Lasso requires to be fit before any analysis", - ): - desparsified_lasso.importance() - desparsified_lasso.sigma_hat_ = [] with pytest.raises( ValueError, match="The Desparsified Lasso requires to be fit before any analysis", @@ -221,7 +221,7 @@ def test_exception(): def test_function_not_center(): - "Test function and not centered" + "Test function when the data don't need to be centered" n_samples, n_features = 52, 50 support_size = 1 signal_noise_ratio = 50 From 4ab75f34c4b6c12398234d33b01a5fd0cd5c3456 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 16 Oct 2025 11:11:05 +0200 Subject: [PATCH 46/93] Update src/hidimstat/noise_std.py Co-authored-by: bthirion --- src/hidimstat/noise_std.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hidimstat/noise_std.py b/src/hidimstat/noise_std.py index 34cb0a8c5..c1a779ef5 100644 --- a/src/hidimstat/noise_std.py +++ b/src/hidimstat/noise_std.py @@ -68,6 +68,7 @@ def reid( n_times = beta_hat.shape[0] else: n_times = None + n_samples = residual.shape[0] # get the number of non-zero coefficients From e4c209ade0a791ea63dc51fb88a5442db3b5586e Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 16 Oct 2025 11:11:16 +0200 Subject: [PATCH 47/93] Update test/test_desparsified_lasso.py Co-authored-by: bthirion --- test/test_desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 3d942e045..289025496 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -221,7 +221,7 @@ def test_exception(): def test_function_not_center(): - "Test function when the data don't need to be centered" + """Test function when the data don't need to be centered""" n_samples, n_features = 52, 50 support_size = 1 signal_noise_ratio = 50 From 845834113cc241ec5230826d4a55a8fcf4a07338 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 
11:23:37 +0200 Subject: [PATCH 48/93] replace n_time by n_task --- src/hidimstat/desparsified_lasso.py | 34 +++++++++---------- src/hidimstat/ensemble_clustered_inference.py | 22 ++++++------ test/test_desparsified_lasso.py | 1 + 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index a79d69fda..c25985b76 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -160,9 +160,9 @@ def __init__( ): super().__init__() if issubclass(LassoCV, model_y.__class__): - self.n_times_ = 1 + self.n_task_ = 1 elif issubclass(MultiTaskLassoCV, model_y.__class__): - self.n_times_ = -1 + self.n_task_ = -1 else: raise AssertionError("lasso_cv needs to be a LassoCV or a MultiTaskLassoCV") self.model_y = model_y @@ -213,9 +213,9 @@ def fit(self, X, y): ---------- X : array-like of shape (n_samples, n_features) Training data matrix. - y : array-like of shape (n_samples,) or (n_samples, n_times) + y : array-like of shape (n_samples,) or (n_samples, n_task) Target values. For single task, y should be 1D. - For multi-task, y should be 2D with shape (n_samples, n_times). + For multi-task, y should be 2D with shape (n_samples, n_task). Returns ------- @@ -237,8 +237,8 @@ def fit(self, X, y): """ memory = check_memory(self.memory) rng = check_random_state(self.random_state) - if self.n_times_ == -1: - self.n_times_ = y.shape[1] + if self.n_task_ == -1: + self.n_task_ = y.shape[1] # centering the data and the target variable if self.centered: @@ -269,7 +269,7 @@ def fit(self, X, y): self.model_y.predict(X_) - y_, # compute the residual, tolerance=self.tolerance_reid, # for group - multioutput=self.n_times_ > 1, + multioutput=self.n_task_ > 1, method=self.noise_method, order=self.order, stationary=self.stationary, @@ -363,13 +363,13 @@ def importance(self, X=None, y=None): ---------- X : array-like of shape (n_samples, n_features) Input data matrix. - y : array-like of shape (n_samples,) or (n_samples, n_times) + y : array-like of shape (n_samples,) or (n_samples, n_task) Target values. For single task, y should be 1D or (n_samples, 1). - For multi-task, y should be 2D with shape (n_samples, n_times). + For multi-task, y should be 2D with shape (n_samples, n_task). Returns ------- - importances_ : ndarray of shape (n_features,) or (n_features, n_times) + importances_ : ndarray of shape (n_features,) or (n_features, n_task) Desparsified lasso coefficient estimates. Notes @@ -391,7 +391,7 @@ def importance(self, X=None, y=None): self._check_fit() beta_hat = self.importances_ - if self.n_times_ == 1: + if self.n_task_ == 1: # define the quantile for the confidence intervals quantile = stats.norm.ppf(1 - (1 - self.confidence) / 2) # see definition of lower and upper bound in algorithm 1 @@ -425,16 +425,16 @@ def importance(self, X=None, y=None): / self.precision_diagonal_ ) two_sided_pval = np.minimum( - 2 * stats.chi2.sf(chi2_scores, df=self.n_times_), 1.0 + 2 * stats.chi2.sf(chi2_scores, df=self.n_task_), 1.0 ) elif self.test == "F": f_scores = ( np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / self.precision_diagonal_ - / self.n_times_ + / self.n_task_ ) two_sided_pval = np.minimum( - 2 * stats.f.sf(f_scores, dfd=self.n_samples_, dfn=self.n_times_), + 2 * stats.f.sf(f_scores, dfd=self.n_samples_, dfn=self.n_task_), 1.0, ) else: @@ -458,13 +458,13 @@ def fit_importance(self, X, y): ---------- X : array-like of shape (n_samples, n_features) Training data matrix. 
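
To make the renamed shapes concrete, a hedged sketch of single-task versus multi-task fits; the model_y choice mirrors the constructor check above, and the random data is for illustration only:

    import numpy as np
    from sklearn.linear_model import LassoCV, MultiTaskLassoCV
    from hidimstat import DesparsifiedLasso

    rng = np.random.default_rng(0)
    X = rng.normal(size=(60, 20))

    # single task: y has shape (n_samples,)
    y_single = X[:, 0] + rng.normal(size=60)
    DesparsifiedLasso(model_y=LassoCV()).fit(X, y_single)

    # multi-task: y has shape (n_samples, n_task)
    y_multi = np.outer(X[:, 0], np.ones(5)) + rng.normal(size=(60, 5))
    DesparsifiedLasso(model_y=MultiTaskLassoCV()).fit(X, y_multi)
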
- y : array-like of shape (n_samples,) or (n_samples, n_times) + y : array-like of shape (n_samples,) or (n_samples, n_task) Target values. For single task, y should be 1D or (n_samples, 1). - For multi-task, y should be (n_samples, n_times). + For multi-task, y should be (n_samples, n_task). Returns ------- - importances_ : ndarray of shape (n_features,) or (n_features, n_times) + importances_ : ndarray of shape (n_features,) or (n_features, n_task) Desparsified lasso coefficient estimates. """ self.fit(X, y) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 3fb6b312c..802bf2401 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -21,7 +21,7 @@ def _ungroup_beta(beta_hat, n_features, ward): Parameters ---------- - beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_times) + beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_task) Beta coefficients at cluster level n_features : int Number of features in original space @@ -30,7 +30,7 @@ def _ungroup_beta(beta_hat, n_features, ward): Returns ------- - beta_hat_degrouped : ndarray, shape (n_features,) or (n_features, n_times) + beta_hat_degrouped : ndarray, shape (n_features,) or (n_features, n_task) Rescaled beta coefficients for individual features, weighted by inverse cluster size @@ -51,9 +51,9 @@ def _ungroup_beta(beta_hat, n_features, ward): # weighting the weight of beta with the size of the cluster beta_hat_degrouped = ward.inverse_transform(beta_hat) / clusters_size elif len(beta_hat.shape) == 2: - n_times = beta_hat.shape[1] - beta_hat_degrouped = np.zeros((n_features, n_times)) - for i in range(n_times): + n_task = beta_hat.shape[1] + beta_hat_degrouped = np.zeros((n_features, n_task)) + for i in range(n_task): beta_hat_degrouped[:, i] = ( ward.inverse_transform(beta_hat[:, i]) / clusters_size ) @@ -71,7 +71,7 @@ def _degrouping(ward, beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_ ---------- ward : sklearn.cluster.FeatureAgglomeration Fitted clustering object containing the hierarchical structure - beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_times) + beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_task) Estimated parameters at cluster level pval : ndarray, shape (n_clusters,) P-values at cluster level @@ -84,7 +84,7 @@ def _degrouping(ward, beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_ Returns ------- - beta_hat : ndarray, shape (n_features,) or (n_features, n_times) + beta_hat : ndarray, shape (n_features,) or (n_features, n_task) Rescaled parameter estimates for individual features pval : ndarray, shape (n_features,) P-values for individual features @@ -169,7 +169,7 @@ def clustered_inference( X_init : ndarray, shape (n_samples, n_features) Original high-dimensional input data matrix. - y : ndarray, shape (n_samples,) or (n_samples, n_times) + y : ndarray, shape (n_samples,) or (n_samples, n_task) Target variable(s). Can be univariate or multivariate (temporal) data. ward : sklearn.cluster.FeatureAgglomeration @@ -208,7 +208,7 @@ def clustered_inference( ward_ : FeatureAgglomeration Fitted clustering object. - beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_times) + beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_task) Estimated coefficients at cluster level. theta_hat : ndarray @@ -366,7 +366,7 @@ def ensemble_clustered_inference( X_init : ndarray, shape (n_samples, n_features) Original high-dimensional input data matrix. 
- y : ndarray, shape (n_samples,) or (n_samples, n_times) + y : ndarray, shape (n_samples,) or (n_samples, n_task) Target variable(s). Can be univariate or multivariate (temporal) data. ward : sklearn.cluster.FeatureAgglomeration @@ -545,7 +545,7 @@ def ensemble_clustered_inference_pvalue( Returns ------- - beta_hat : ndarray, shape (n_features,) or (n_features, n_times) + beta_hat : ndarray, shape (n_features,) or (n_features, n_task) Averaged coefficients across bootstraps selected : ndarray, shape (n_features,) Selected features: 1 for positive effects, -1 for negative effects, diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 289025496..e33967f95 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -164,6 +164,7 @@ def test_desparsified_group_lasso(): def test_exception(): + """Test exception of Desparsified Lasso""" n_samples = 50 n_features = 100 n_target = 10 From 74a163666888b9073993175fb7286d8c018dea19 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 11:58:14 +0200 Subject: [PATCH 49/93] replace n_time by n_task --- src/hidimstat/noise_std.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/hidimstat/noise_std.py b/src/hidimstat/noise_std.py index c1a779ef5..1bc7b982e 100644 --- a/src/hidimstat/noise_std.py +++ b/src/hidimstat/noise_std.py @@ -26,10 +26,10 @@ def reid( Parameters ---------- - beta_hat : ndarray, shape (n_features,) or (n_times, n_features) + beta_hat : ndarray, shape (n_features,) or (n_task, n_features) Estimated sparse coefficient vector from regression. - residual : ndarray, shape (n_samples,) or (n_samples, n_times) + residual : ndarray, shape (n_samples,) or (n_samples, n_task) Residuals from the regression model. tolerance : float, default=1e-4 @@ -47,13 +47,13 @@ def reid( - 'AR': Uses Yule-Walker method with specified order order : int, default=1 - Order of AR model when method='AR'. Must be < n_times. + Order of AR model when method='AR'. Must be < n_task. Returns ------- sigma_hat_raw or covariance_hat : float or ndarray For single output: estimated noise standard deviation - For multiple outputs: estimated (n_times, n_times) covariance matrix + For multiple outputs: estimated (n_task, n_task) covariance matrix Notes ----- @@ -65,9 +65,9 @@ def reid( .. footbibliography:: """ if multioutput: - n_times = beta_hat.shape[0] + n_task = beta_hat.shape[0] else: - n_times = None + n_task = None n_samples = residual.shape[0] @@ -96,7 +96,7 @@ def reid( print("Group reid: simple cov estimation") elif method == "AR": print(f"Group reid: {method}{order} cov estimation") - if order > n_times - 1: + if order > n_task - 1: raise ValueError( "The requested AR order is to high with " + "respect to the number of time steps." 
@@ -112,7 +112,7 @@ def reid( if stationary: # consideration of stationary noise # (section 2.5 of `chevalier2020statistical`) - sigma_hat = np.median(sigma_hat_raw) * np.ones(n_times) + sigma_hat = np.median(sigma_hat_raw) * np.ones(n_task) # compute rho from the empirical correlation matrix # (section 2.5 of `chevalier2020statistical`) correlation_empirical = np.corrcoef(residual.T) @@ -125,9 +125,7 @@ def reid( if not stationary or method == "median": rho_hat = np.median(np.diag(correlation_empirical, 1)) # estimate M (section 2.5 of `chevalier2020statistical`) - correlation_hat = toeplitz( - np.geomspace(1, rho_hat ** (n_times - 1), n_times) - ) + correlation_hat = toeplitz(np.geomspace(1, rho_hat ** (n_task - 1), n_task)) covariance_hat = np.outer(sigma_hat, sigma_hat) * correlation_hat # Yule-Walker method (algorithm in section 3 of `eshel2003yule`) @@ -144,7 +142,7 @@ def reid( coefficients_ar = solve(R, rho_ar[1:]) # estimate the variance of the noise from the AR model - residual_estimate = np.zeros((n_samples, n_times - order)) + residual_estimate = np.zeros((n_samples, n_task - order)) for i in range(order): # time window used to estimate the residual from AR model start = order - i - 1 @@ -156,9 +154,9 @@ def reid( ) # estimation of the autocorrelation matrices - rho_ar_full = np.zeros(n_times) + rho_ar_full = np.zeros(n_task) rho_ar_full[: rho_ar.size] = rho_ar - for i in range(order + 1, n_times): + for i in range(order + 1, n_task): start = i - order end = i rho_ar_full[i] = np.dot(coefficients_ar[::-1], rho_ar_full[start:end]) From 88cf0478c94c70bdc67bd86658e0c56bbfe1a1f4 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 11:58:51 +0200 Subject: [PATCH 50/93] Move reid in desparsified lasso --- src/hidimstat/_utils/scenario.py | 43 +++++ src/hidimstat/desparsified_lasso.py | 174 +++++++++++++++++++- src/hidimstat/noise_std.py | 217 ------------------------- test/_utils/test_scenario.py | 45 ++++++ test/test_desparsified_lasso.py | 187 +++++++++++++++++++++- test/test_noise_std.py | 240 ---------------------------- 6 files changed, 446 insertions(+), 460 deletions(-) delete mode 100644 src/hidimstat/noise_std.py delete mode 100644 test/test_noise_std.py diff --git a/src/hidimstat/_utils/scenario.py b/src/hidimstat/_utils/scenario.py index 85b7ce88d..bd445143e 100644 --- a/src/hidimstat/_utils/scenario.py +++ b/src/hidimstat/_utils/scenario.py @@ -302,3 +302,46 @@ def multivariate_simulation( y = prod_temp + noise return X, y, beta_true, noise + + +def empirical_snr(X, y, beta, noise=None): + """ + Compute the empirical signal-to-noise ratio (SNR) for + the linear model y = X @ beta + noise. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Design matrix. + + y : ndarray, shape (n_samples,) + Target vector. + + beta : ndarray, shape (n_features,) + Parameter vector. + + noise : ndarray, shape (n_samples,), optional + Noise vector. If None, computed as y - X @ beta. + + Returns + ------- + signal_noise_ratio_hat : float + Empirical SNR computed as var(signal) / var(noise). + + Notes + ----- + SNR measures the ratio of signal power to noise power, + indicating model estimation quality. + Higher values suggest better signal recovery. 
+    """
+    X = np.asarray(X)
+
+    signal = np.dot(X, beta)
+
+    if noise is None:
+        noise = y - signal
+
+    # compute signal-to-noise ratio
+    signal_noise_ratio_ = (np.linalg.norm(signal) / np.linalg.norm(noise)) ** 2
+
+    return signal_noise_ratio_
diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py
index c25985b76..b8f0a3508 100644
--- a/src/hidimstat/desparsified_lasso.py
+++ b/src/hidimstat/desparsified_lasso.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 from joblib import Parallel, delayed
-from numpy.linalg import multi_dot
+from numpy.linalg import multi_dot, norm
 from scipy import stats
 from scipy.linalg import inv
 from sklearn.base import check_is_fitted, clone
@@ -11,6 +11,7 @@
 from sklearn.model_selection import KFold
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils.validation import check_memory
+from scipy.linalg import solve, toeplitz
 
 from hidimstat._utils.docstring import _aggregate_docstring
 from hidimstat._utils.regression import _alpha_max
@@ -617,3 +618,174 @@
     For features not selected during screening, p-values are set to 1.
     """,
 )
+
+
+def reid(
+    beta_hat,
+    residual,
+    tolerance=1e-4,
+    multioutput=False,
+    stationary=True,
+    method="median",
+    order=1,
+):
+    """
+    Residual sum of squares based estimators for noise standard deviation
+    estimation.
+
+    This implementation follows the procedure described in
+    :footcite:t:`fan2012variance` and :footcite:t:`reid2016study`.
+    The beta_hat should correspond to the coefficient of Lasso with
+    cross-validation, and the residual is based on this model.
+
+    For group, the implementation is based on the procedure
+    from :footcite:t:`chevalier2020statistical`.
+
+    Parameters
+    ----------
+    beta_hat : ndarray, shape (n_features,) or (n_task, n_features)
+        Estimated sparse coefficient vector from regression.
+
+    residual : ndarray, shape (n_samples,) or (n_samples, n_task)
+        Residuals from the regression model.
+
+    tolerance : float, default=1e-4
+        Threshold for considering coefficients as non-zero.
+
+    multioutput : bool, default=False
+        If True, handles multiple outputs (group case).
+
+    stationary : bool, default=True
+        Whether noise has constant magnitude across time steps.
+
+    method : {'median', 'AR'}, default='median'
+        Covariance estimation method:
+        - 'median': Uses median correlation between consecutive time steps
+        - 'AR': Uses Yule-Walker method with specified order
+
+    order : int, default=1
+        Order of AR model when method='AR'. Must be < n_task.
+
+    Returns
+    -------
+    sigma_hat_raw or covariance_hat : float or ndarray
+        For single output: estimated noise standard deviation
+        For multiple outputs: estimated (n_task, n_task) covariance matrix
+
+    Notes
+    -----
+    Implementation based on :footcite:t:`reid2016study` for single output
+    and :footcite:t:`chevalier2020statistical` for multiple outputs.
+
+    References
+    ----------
+    .. footbibliography::
+    """
+    if multioutput:
+        n_task = beta_hat.shape[0]
+    else:
+        n_task = None
+
+    n_samples = residual.shape[0]
+
+    # get the number of non-zero coefficients
+    # we consider that the coefficient with a value under
+    # tolerance * coefficients_.max() is null
+    coefficients_ = (
+        np.sum(np.abs(beta_hat), axis=0)
+        if len(beta_hat.shape) > 1
+        else np.abs(beta_hat)
+    )
+    size_support = np.sum(coefficients_ > tolerance * coefficients_.max())
+
+    # avoid dividing by 0
+    size_support = min(size_support, n_samples - 1)
+
+    # estimate the noise standard deviation (eq. 3 in `reid2016study`)
+    sigma_hat_raw = norm(residual, axis=0) / np.sqrt(n_samples - size_support)
+
+    if not multioutput:
+        return sigma_hat_raw
+
+    ## Computation of the covariance matrix for group
+    else:
+        if method == "median":
+            print("Group reid: simple cov estimation")
+        elif method == "AR":
+            print(f"Group reid: {method}{order} cov estimation")
+            if order > n_task - 1:
+                raise ValueError(
+                    "The requested AR order is too high with "
+                    + "respect to the number of time steps."
+                )
+            elif not stationary:
+                raise ValueError(
+                    "The AR method is not compatible with the non-stationary"
+                    + " noise assumption."
+                )
+        else:
+            raise ValueError("Unknown method for estimating the covariance matrix")
+        ## compute empirical correlation of the residual
+        if stationary:
+            # consideration of stationary noise
+            # (section 2.5 of `chevalier2020statistical`)
+            sigma_hat = np.median(sigma_hat_raw) * np.ones(n_task)
+            # compute rho from the empirical correlation matrix
+            # (section 2.5 of `chevalier2020statistical`)
+            correlation_empirical = np.corrcoef(residual.T)
+        else:
+            sigma_hat = sigma_hat_raw
+            residual_rescaled = residual / sigma_hat
+            correlation_empirical = np.corrcoef(residual_rescaled.T)
+
+        # Median method
+        if not stationary or method == "median":
+            rho_hat = np.median(np.diag(correlation_empirical, 1))
+            # estimate M (section 2.5 of `chevalier2020statistical`)
+            correlation_hat = toeplitz(np.geomspace(1, rho_hat ** (n_task - 1), n_task))
+            covariance_hat = np.outer(sigma_hat, sigma_hat) * correlation_hat
+
+        # Yule-Walker method (algorithm in section 3 of `eshel2003yule`)
+        elif stationary and method == "AR":
+            # compute the autocorrelation coefficients of the AR model
+            rho_ar = np.zeros(order + 1)
+            rho_ar[0] = 1
+
+            for i in range(1, order + 1):
+                rho_ar[i] = np.median(np.diag(correlation_empirical, i))
+
+            # solve the Yule-Walker equations (see eq.2 in `eshel2003yule`)
+            R = toeplitz(rho_ar[:-1])
+            coefficients_ar = solve(R, rho_ar[1:])
+
+            # estimate the variance of the noise from the AR model
+            residual_estimate = np.zeros((n_samples, n_task - order))
+            for i in range(order):
+                # time window used to estimate the residual from AR model
+                start = order - i - 1
+                end = -i - 1
+                residual_estimate += coefficients_ar[i] * residual[:, start:end]
+            residual_difference = residual[:, order:] - residual_estimate
+            sigma_epsilon = np.median(
+                norm(residual_difference, axis=0) / np.sqrt(n_samples)
+            )
+
+            # estimation of the autocorrelation matrices
+            rho_ar_full = np.zeros(n_task)
+            rho_ar_full[: rho_ar.size] = rho_ar
+            for i in range(order + 1, n_task):
+                start = i - order
+                end = i
+                rho_ar_full[i] = np.dot(coefficients_ar[::-1], rho_ar_full[start:end])
+            correlation_hat = toeplitz(rho_ar_full)
+
+            # estimation of the variance of an AR process
+            sigma_hat[:] = sigma_epsilon / np.sqrt(
+                (1 - np.dot(coefficients_ar, rho_ar[1:]))
+            )
+            # estimation of the covariance based on the
+            # correlation matrix and sigma
+            # COV(X_t, X_t) = COR(X_t, X_t) * \sigma^2
+            covariance_hat = np.outer(sigma_hat, sigma_hat) * correlation_hat
+
+    return covariance_hat
diff --git a/src/hidimstat/noise_std.py b/src/hidimstat/noise_std.py
deleted file mode 100644
index 1bc7b982e..000000000
--- a/src/hidimstat/noise_std.py
+++ /dev/null
@@ -1,217 +0,0 @@
-import numpy as np
-from numpy.linalg import norm
-from scipy.linalg import solve, toeplitz
-
-
-def reid(
-    beta_hat,
-    residual,
-    tolerance=1e-4,
-    multioutput=False,
-    stationary=True,
-    method="median",
-    order=1,
-):
-    """
-    Residual sum of squares based
estimators for noise standard deviation - estimation. - - This implementation follows the procedure described in - :footcite:t:`fan2012variance` and :footcite:t:`reid2016study`. - The beta_hat should correspond to the coefficient of Lasso with - cross-validation, and the residual is based on this model. - - For group, the implementation is based on the procedure - from :footcite:t:`chevalier2020statistical`. - - Parameters - ---------- - beta_hat : ndarray, shape (n_features,) or (n_task, n_features) - Estimated sparse coefficient vector from regression. - - residual : ndarray, shape (n_samples,) or (n_samples, n_task) - Residuals from the regression model. - - tolerance : float, default=1e-4 - Threshold for considering coefficients as non-zero. - - multioutput : bool, default=False - If True, handles multiple outputs (group case). - - stationary : bool, default=True - Whether noise has constant magnitude across time steps. - - method : {'median', 'AR'}, (default='simple') - Covariance estimation method: - - 'median': Uses median correlation between consecutive time steps - - 'AR': Uses Yule-Walker method with specified order - - order : int, default=1 - Order of AR model when method='AR'. Must be < n_task. - - Returns - ------- - sigma_hat_raw or covariance_hat : float or ndarray - For single output: estimated noise standard deviation - For multiple outputs: estimated (n_task, n_task) covariance matrix - - Notes - ----- - Implementation based on :footcite:t:`reid2016study` for single output - and :footcite:t:`chevalier2020statistical` for multiple outputs. - - References - ---------- - .. footbibliography:: - """ - if multioutput: - n_task = beta_hat.shape[0] - else: - n_task = None - - n_samples = residual.shape[0] - - # get the number of non-zero coefficients - # we consider that the coefficient with a value under - # tolerance * coefficients_.max() is null - coefficients_ = ( - np.sum(np.abs(beta_hat), axis=0) - if len(beta_hat.shape) > 1 - else np.abs(beta_hat) - ) - size_support = np.sum(coefficients_ > tolerance * coefficients_.max()) - - # avoid dividing by 0 - size_support = min(size_support, n_samples - 1) - - # estimate the noise standard deviation (eq. 3 in `reid2016study`) - sigma_hat_raw = norm(residual, axis=0) / np.sqrt(n_samples - size_support) - - if not multioutput: - return sigma_hat_raw - - ## Computation of the covariance matrix for group - else: - if method == "median": - print("Group reid: simple cov estimation") - elif method == "AR": - print(f"Group reid: {method}{order} cov estimation") - if order > n_task - 1: - raise ValueError( - "The requested AR order is to high with " - + "respect to the number of time steps." - ) - elif not stationary: - raise ValueError( - "The AR method is not compatible with the non-stationary" - + " noise assumption." 
- ) - else: - raise ValueError("Unknown method for estimating the covariance matrix") - ## compute empirical correlation of the residual - if stationary: - # consideration of stationary noise - # (section 2.5 of `chevalier2020statistical`) - sigma_hat = np.median(sigma_hat_raw) * np.ones(n_task) - # compute rho from the empirical correlation matrix - # (section 2.5 of `chevalier2020statistical`) - correlation_empirical = np.corrcoef(residual.T) - else: - sigma_hat = sigma_hat_raw - residual_rescaled = residual / sigma_hat - correlation_empirical = np.corrcoef(residual_rescaled.T) - - # Median method - if not stationary or method == "median": - rho_hat = np.median(np.diag(correlation_empirical, 1)) - # estimate M (section 2.5 of `chevalier2020statistical`) - correlation_hat = toeplitz(np.geomspace(1, rho_hat ** (n_task - 1), n_task)) - covariance_hat = np.outer(sigma_hat, sigma_hat) * correlation_hat - - # Yule-Walker method (algorithm in section 3 of `eshel2003yule`) - elif stationary and method == "AR": - # compute the autocorrelation coefficients of the AR model - rho_ar = np.zeros(order + 1) - rho_ar[0] = 1 - - for i in range(1, order + 1): - rho_ar[i] = np.median(np.diag(correlation_empirical, i)) - - # solve the Yule-Walker equations (see eq.2 in `eshel2003yule`) - R = toeplitz(rho_ar[:-1]) - coefficients_ar = solve(R, rho_ar[1:]) - - # estimate the variance of the noise from the AR model - residual_estimate = np.zeros((n_samples, n_task - order)) - for i in range(order): - # time window used to estimate the residual from AR model - start = order - i - 1 - end = -i - 1 - residual_estimate += coefficients_ar[i] * residual[:, start:end] - residual_difference = residual[:, order:] - residual_estimate - sigma_epsilon = np.median( - norm(residual_difference, axis=0) / np.sqrt(n_samples) - ) - - # estimation of the autocorrelation matrices - rho_ar_full = np.zeros(n_task) - rho_ar_full[: rho_ar.size] = rho_ar - for i in range(order + 1, n_task): - start = i - order - end = i - rho_ar_full[i] = np.dot(coefficients_ar[::-1], rho_ar_full[start:end]) - correlation_hat = toeplitz(rho_ar_full) - - # estimation of the variance of an AR process - sigma_hat[:] = sigma_epsilon / np.sqrt( - (1 - np.dot(coefficients_ar, rho_ar[1:])) - ) - # estimation of the covariance based on the - # correlation matrix and sigma - # COV(X_t, X_t) = COR(X_t, X_t) * \sigma^2 - covariance_hat = np.outer(sigma_hat, sigma_hat) * correlation_hat - - return covariance_hat - - -def empirical_snr(X, y, beta, noise=None): - """ - Compute the empirical signal-to-noise ratio (SNR) for - the linear model y = X @ beta + noise. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Design matrix. - - y : ndarray, shape (n_samples,) - Target vector. - - beta : ndarray, shape (n_features,) - Parameter vector. - - noise : ndarray, shape (n_samples,), optional - Noise vector. If None, computed as y - X @ beta. - - Returns - ------- - signal_noise_ratio_hat : float - Empirical SNR computed as var(signal) / var(noise). - - Notes - ----- - SNR measures the ratio of signal power to noise power, - indicating model estimation quality. - Higher values suggest better signal recovery. 
- """ - X = np.asarray(X) - - signal = np.dot(X, beta) - - if noise is None: - noise = y - signal - - # compute signal-to-noise ratio - signal_noise_ratio_ = (np.linalg.norm(signal) / np.linalg.norm(noise)) ** 2 - - return signal_noise_ratio_ diff --git a/test/_utils/test_scenario.py b/test/_utils/test_scenario.py index da7ae1c59..3c3bf612b 100644 --- a/test/_utils/test_scenario.py +++ b/test/_utils/test_scenario.py @@ -11,6 +11,7 @@ _generate_3D_weight, multivariate_simulation, multivariate_simulation_spatial, + empirical_snr, ) @@ -390,3 +391,47 @@ def test_multivariate_simulation_ar_n_target(): """Test n_target validation.""" with pytest.raises(AssertionError, match="n_target must be positive"): multivariate_simulation(n_samples=10, n_features=20, n_targets=0, seed=42) + + +def test_empirical_snr(): + """Computing empirical signal to noise ratio in presence of high level of + noise from the target `y`, the data `X` and the true parameter vector `beta` + in a simple scenario with a 1D data structure.""" + + n_samples, n_features = 100, 20 + support_size = 10 + signal_noise_ratio_expected = 0.5 + + X, y, beta, noise = multivariate_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + signal_noise_ratio=signal_noise_ratio_expected, + seed=0, + ) + + signal_noise_ratio = empirical_snr(X, y, beta) + + assert_almost_equal(signal_noise_ratio, signal_noise_ratio_expected, decimal=2) + + +def test_empirical_snr_2(): + """Computing empirical signal to noise ratio from the target `y`, + the data `X` and the true parameter vector `beta` in a simple + scenario with a 1D data structure.""" + + n_samples, n_features = 100, 20 + support_size = 10 + signal_noise_ratio_expected = 10.0 + + X, y, beta, noise = multivariate_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + signal_noise_ratio=signal_noise_ratio_expected, + seed=0, + ) + + signal_noise_ratio = empirical_snr(X, y, beta) + + assert_almost_equal(signal_noise_ratio, signal_noise_ratio_expected, decimal=0) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index e33967f95..d6b6b6317 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -7,11 +7,11 @@ from numpy.testing import assert_almost_equal from scipy.linalg import toeplitz from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import MultiTaskLassoCV +from sklearn.linear_model import MultiTaskLassoCV, LassoCV from sklearn.model_selection import KFold from hidimstat._utils.scenario import multivariate_simulation -from hidimstat.desparsified_lasso import DesparsifiedLasso, desparsified_lasso +from hidimstat.desparsified_lasso import DesparsifiedLasso, desparsified_lasso, reid def test_desparsified_lasso(): @@ -238,3 +238,186 @@ def test_function_not_center(): seed=10, ) selection, importances, pvalues = desparsified_lasso(X, y, centered=False) + + +def test_reid(): + """Estimating noise standard deviation in two scenarios. + First scenario: no structure and a support of size 2. 
+ Second scenario: no structure and an empty support.""" + + n_samples, n_features = 100, 20 + signal_noise_ratio = 2.0 + + # First expe + # ########## + support_size = 2 + + X, y, beta, noise = multivariate_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + rho=0.25, + signal_noise_ratio=signal_noise_ratio, + seed=0, + ) + lasso_cv = LassoCV(n_jobs=1).fit(X, y) + residual = lasso_cv.predict(X) - y + + # max_iter=1 to get a better coverage + sigma_hat = reid(lasso_cv.coef_, residual, tolerance=1e-3) + expected_sigma = support_size / signal_noise_ratio + error_relative = np.abs(sigma_hat - expected_sigma) / expected_sigma + assert error_relative < 0.3 + + # Second expe + # ########### + support_size = 0 + + X, y, beta, noise = multivariate_simulation( + n_samples=n_samples, + n_features=n_features, + support_size=support_size, + signal_noise_ratio=signal_noise_ratio, + seed=2, + ) + lasso_cv = LassoCV(n_jobs=1).fit(X, y) + residual = lasso_cv.predict(X) - y + + sigma_hat = reid(lasso_cv.coef_, residual) + expected_sigma = 1.0 # when there is no signal, the variance is 1.0 + error_relative = np.abs(sigma_hat - expected_sigma) / expected_sigma + assert error_relative < 0.2 + + +def test_group_reid(): + """Estimating (temporal) noise covariance matrix in two scenarios. + First scenario: no data structure and a support of size 2. + Second scenario: no data structure and an empty support.""" + + n_samples = 100 + n_features = 20 + n_target = 50 + signal_noise_ratio = 3.0 + rho_serial = 0.9 + random_state = np.random.default_rng(1) + + # First expe + # ########## + support_size = 2 + X, y, beta, noise = multivariate_simulation( + n_samples=n_samples, + n_features=n_features, + n_targets=n_target, + support_size=support_size, + signal_noise_ratio=signal_noise_ratio, + rho_serial=rho_serial, + rho=0.0, + seed=0, + ) + corr = toeplitz(np.geomspace(1, rho_serial ** (n_target - 1), n_target)) + cov = support_size / signal_noise_ratio * corr + + lasso_cv = MultiTaskLassoCV(n_jobs=1).fit(X, y) + residual = lasso_cv.predict(X) - y + + # max_iter=1 to get a better coverage + cov_hat = reid( + lasso_cv.coef_, + residual, + multioutput=True, + tolerance=1e-3, + ) + error_relative = np.abs(cov_hat - cov) / cov + assert np.max(error_relative) < 0.3 + + cov_hat = reid( + lasso_cv.coef_, + residual, + multioutput=True, + method="AR", + ) + error_relative = np.abs(cov_hat - cov) / cov + assert np.max(error_relative) < 0.3 + + cov_hat = reid( + lasso_cv.coef_, + residual, + multioutput=True, + stationary=False, + ) + error_relative = np.abs(cov_hat - cov) / cov + assert np.max(error_relative) > 0.3 + + +def test_group_reid_2(): + """Estimating (temporal) noise covariance matrix in two scenarios. + First scenario: no data structure and a support of size 2. 
+    Second scenario: no data structure and an empty support."""
+
+    n_samples = 100
+    n_features = 20
+    n_target = 50
+    signal_noise_ratio = 1.0
+    rho_serial = 0.9
+
+    # Second expe
+    # ###########
+    support_size = 0
+    X, y, beta, noise = multivariate_simulation(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_targets=n_target,
+        rho=0.25,
+        support_size=support_size,
+        signal_noise_ratio=signal_noise_ratio,
+        rho_serial=rho_serial,
+        seed=4,
+    )
+    corr = toeplitz(rho_serial ** np.arange(0, n_target))  # covariance matrix of time
+    cov = 1.0 * corr
+
+    lasso_cv = MultiTaskLassoCV(n_jobs=1).fit(X, y)
+    residual = lasso_cv.predict(X) - y
+
+    cov_hat = reid(lasso_cv.coef_, residual, multioutput=True)
+    error_relative = np.abs(cov_hat - cov) / cov
+    assert np.max(error_relative) < 0.3
+
+    cov_hat = reid(lasso_cv.coef_, residual, multioutput=True, method="AR")
+    error_relative = np.abs(cov_hat - cov) / cov
+    assert np.max(error_relative) < 0.3
+
+    cov_hat = reid(lasso_cv.coef_, residual, multioutput=True, stationary=False)
+    error_relative = np.abs(cov_hat - cov) / cov
+    assert np.max(error_relative) > 0.3
+
+
+def test_reid_exception():
+    """Test the exceptions raised by reid for invalid arguments."""
+    n_samples, n_features = 100, 20
+    n_target = 50
+    signal_noise_ratio = 1.0
+    rho_serial = 0.9
+
+    # First expe
+    # ##########
+    support_size = 2
+
+    X, y, beta, noise = multivariate_simulation(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_targets=n_target,
+        support_size=support_size,
+        signal_noise_ratio=signal_noise_ratio,
+        rho_serial=rho_serial,
+    )
+    with pytest.raises(
+        ValueError, match="Unknown method for estimating the covariance matrix"
+    ):
+        _, _ = reid(X, y, method="test", multioutput=True)
+    with pytest.raises(
+        ValueError, match="The AR method is not compatible with the non-stationary"
+    ):
+        _, _ = reid(X, y, method="AR", stationary=False, multioutput=True)
+    with pytest.raises(ValueError, match="The requested AR order is too high with"):
+        _, _ = reid(X, y, method="AR", order=1e4, multioutput=True)
diff --git a/test/test_noise_std.py b/test/test_noise_std.py
deleted file mode 100644
index bc8569abf..000000000
--- a/test/test_noise_std.py
+++ /dev/null
@@ -1,240 +0,0 @@
-"""
-Test the noise_std module
-"""
-
-import numpy as np
-import pytest
-from numpy.testing import assert_almost_equal
-from scipy.linalg import toeplitz
-from sklearn.linear_model import LassoCV, MultiTaskLassoCV
-from sklearn.model_selection import KFold
-
-from hidimstat._utils.scenario import multivariate_simulation
-from hidimstat.noise_std import empirical_snr, reid
-
-
-def test_reid():
-    """Estimating noise standard deviation in two scenarios.
-    First scenario: no structure and a support of size 2.
- Second scenario: no structure and an empty support.""" - - n_samples, n_features = 100, 20 - signal_noise_ratio = 2.0 - - # First expe - # ########## - support_size = 2 - - X, y, beta, noise = multivariate_simulation( - n_samples=n_samples, - n_features=n_features, - support_size=support_size, - rho=0.25, - signal_noise_ratio=signal_noise_ratio, - seed=0, - ) - lasso_cv = LassoCV(n_jobs=1).fit(X, y) - residual = lasso_cv.predict(X) - y - - # max_iter=1 to get a better coverage - sigma_hat = reid(lasso_cv.coef_, residual, tolerance=1e-3) - expected_sigma = support_size / signal_noise_ratio - error_relative = np.abs(sigma_hat - expected_sigma) / expected_sigma - assert error_relative < 0.3 - - # Second expe - # ########### - support_size = 0 - - X, y, beta, noise = multivariate_simulation( - n_samples=n_samples, - n_features=n_features, - support_size=support_size, - signal_noise_ratio=signal_noise_ratio, - seed=2, - ) - lasso_cv = LassoCV(n_jobs=1).fit(X, y) - residual = lasso_cv.predict(X) - y - - sigma_hat = reid(lasso_cv.coef_, residual) - expected_sigma = 1.0 # when there is no signal, the variance is 1.0 - error_relative = np.abs(sigma_hat - expected_sigma) / expected_sigma - assert error_relative < 0.2 - - -def test_group_reid(): - """Estimating (temporal) noise covariance matrix in two scenarios. - First scenario: no data structure and a support of size 2. - Second scenario: no data structure and an empty support.""" - - n_samples = 100 - n_features = 20 - n_target = 50 - signal_noise_ratio = 3.0 - rho_serial = 0.9 - random_state = np.random.default_rng(1) - - # First expe - # ########## - support_size = 2 - X, y, beta, noise = multivariate_simulation( - n_samples=n_samples, - n_features=n_features, - n_targets=n_target, - support_size=support_size, - signal_noise_ratio=signal_noise_ratio, - rho_serial=rho_serial, - rho=0.0, - seed=0, - ) - corr = toeplitz(np.geomspace(1, rho_serial ** (n_target - 1), n_target)) - cov = support_size / signal_noise_ratio * corr - - lasso_cv = MultiTaskLassoCV(n_jobs=1).fit(X, y) - residual = lasso_cv.predict(X) - y - - # max_iter=1 to get a better coverage - cov_hat = reid( - lasso_cv.coef_, - residual, - multioutput=True, - tolerance=1e-3, - ) - error_relative = np.abs(cov_hat - cov) / cov - assert np.max(error_relative) < 0.3 - - cov_hat = reid( - lasso_cv.coef_, - residual, - multioutput=True, - method="AR", - ) - error_relative = np.abs(cov_hat - cov) / cov - assert np.max(error_relative) < 0.3 - - cov_hat = reid( - lasso_cv.coef_, - residual, - multioutput=True, - stationary=False, - ) - error_relative = np.abs(cov_hat - cov) / cov - assert np.max(error_relative) > 0.3 - - -def test_group_reid_2(): - """Estimating (temporal) noise covariance matrix in two scenarios. - First scenario: no data structure and a support of size 2. 
- Second scenario: no data structure and an empty support.""" - - n_samples = 100 - n_features = 20 - n_target = 50 - signal_noise_ratio = 1.0 - rho_serial = 0.9 - - # Second expe - # ########### - support_size = 0 - X, y, beta, noise = multivariate_simulation( - n_samples=n_samples, - n_features=n_features, - n_targets=n_target, - rho=0.25, - support_size=support_size, - signal_noise_ratio=signal_noise_ratio, - rho_serial=rho_serial, - seed=4, - ) - corr = toeplitz(rho_serial ** np.arange(0, n_target)) # covariance matrix of time - cov = 1.0 * corr - - lasso_cv = MultiTaskLassoCV(n_jobs=1).fit(X, y) - residual = lasso_cv.predict(X) - y - - cov_hat = reid(lasso_cv.coef_, residual, multioutput=True) - error_relative = np.abs(cov_hat - cov) / cov - assert np.max(error_relative) < 0.3 - - cov_hat = reid(lasso_cv.coef_, residual, multioutput=True, method="AR") - error_relative = np.abs(cov_hat - cov) / cov - assert np.max(error_relative) < 0.3 - - cov_hat = reid(lasso_cv.coef_, residual, multioutput=True, stationary=False) - error_relative = np.abs(cov_hat - cov) / cov - assert np.max(error_relative) > 0.3 - - -def test_reid_exception(): - "Test for testing the exceptions on the arguments of reid function" - n_samples, n_features = 100, 20 - n_target = 50 - signal_noise_ratio = 1.0 - rho_serial = 0.9 - - # First expe - # ########## - support_size = 2 - - X, y, beta, noise = multivariate_simulation( - n_samples=n_samples, - n_features=n_features, - n_targets=n_target, - support_size=support_size, - signal_noise_ratio=signal_noise_ratio, - rho_serial=rho_serial, - ) - with pytest.raises( - ValueError, match="Unknown method for estimating the covariance matrix" - ): - _, _ = reid(X, y, method="test", multioutput=True) - with pytest.raises( - ValueError, match="The AR method is not compatible with the non-stationary" - ): - _, _ = reid(X, y, method="AR", stationary=False, multioutput=True) - with pytest.raises(ValueError, match="The requested AR order is to high with"): - _, _ = reid(X, y, method="AR", order=1e4, multioutput=True) - - -def test_empirical_snr(): - """Computing empirical signal to noise ratio in presence of high level of - noise from the target `y`, the data `X` and the true parameter vector `beta` - in a simple scenario with a 1D data structure.""" - - n_samples, n_features = 100, 20 - support_size = 10 - signal_noise_ratio_expected = 0.5 - - X, y, beta, noise = multivariate_simulation( - n_samples=n_samples, - n_features=n_features, - support_size=support_size, - signal_noise_ratio=signal_noise_ratio_expected, - seed=0, - ) - - signal_noise_ratio = empirical_snr(X, y, beta) - - assert_almost_equal(signal_noise_ratio, signal_noise_ratio_expected, decimal=2) - - -def test_empirical_snr_2(): - """Computing empirical signal to noise ratio from the target `y`, - the data `X` and the true parameter vector `beta` in a simple - scenario with a 1D data structure.""" - - n_samples, n_features = 100, 20 - support_size = 10 - signal_noise_ratio_expected = 10.0 - - X, y, beta, noise = multivariate_simulation( - n_samples=n_samples, - n_features=n_features, - support_size=support_size, - signal_noise_ratio=signal_noise_ratio_expected, - seed=0, - ) - - signal_noise_ratio = empirical_snr(X, y, beta) - - assert_almost_equal(signal_noise_ratio, signal_noise_ratio_expected, decimal=0) From 90534e2774dc5105200d876352ee6c47cb7c22d1 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 11:35:05 +0200 Subject: [PATCH 51/93] fix import --- src/hidimstat/__init__.py | 3 +-- 1 file changed, 1 
insertion(+), 2 deletions(-) diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py index 4ea86bb73..9e2644721 100644 --- a/src/hidimstat/__init__.py +++ b/src/hidimstat/__init__.py @@ -1,5 +1,5 @@ from .conditional_feature_importance import CFI -from .desparsified_lasso import DesparsifiedLasso, desparsified_lasso +from .desparsified_lasso import DesparsifiedLasso, desparsified_lasso, reid from .distilled_conditional_randomization_test import D0CRT, d0crt from .ensemble_clustered_inference import ( clustered_inference, @@ -14,7 +14,6 @@ model_x_knockoff_pvalue, ) from .leave_one_covariate_out import LOCO -from .noise_std import reid from .permutation_feature_importance import PFI from .statistical_tools.aggregation import quantile_aggregation From aa6a7bd21a72b9d48b20b1dbd33bb69f101ccfec Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 11:36:04 +0200 Subject: [PATCH 52/93] fix import --- src/hidimstat/desparsified_lasso.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index b8f0a3508..2841489c1 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -17,7 +17,6 @@ from hidimstat._utils.regression import _alpha_max from hidimstat._utils.utils import check_random_state, seed_estimator from hidimstat.base_variable_importance import BaseVariableImportance -from hidimstat.noise_std import reid from hidimstat.statistical_tools.p_values import ( pval_from_cb, pval_from_two_sided_pval_and_sign, From 9791c10dc7a6c93c4040811223b4f693d9930883 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 11:37:39 +0200 Subject: [PATCH 53/93] fix definition of the covariance --- src/hidimstat/desparsified_lasso.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 2841489c1..fa1e479de 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -737,6 +737,7 @@ def reid( residual_rescaled = residual / sigma_hat correlation_empirical = np.corrcoef(residual_rescaled.T) + covariance_hat = None # Median method if not stationary or method == "median": rho_hat = np.median(np.diag(correlation_empirical, 1)) From 19bdf14c0b0deb01c0ae8fce14f04f01ecf2ed2c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 11:56:45 +0200 Subject: [PATCH 54/93] add an exception --- src/hidimstat/desparsified_lasso.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index fa1e479de..cd4ecbe72 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -787,5 +787,9 @@ def reid( # correlation matrix and sigma # COV(X_t, X_t) = COR(X_t, X_t) * \sigma^2 covariance_hat = np.outer(sigma_hat, sigma_hat) * correlation_hat + else: + raise ValueError( + f"Not support combinaison of stationnary {stationary} and method {method}." 
+ ) return covariance_hat From 15912da3bdbfe2e2497aee1ca03aa06533eeeaf7 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 12:01:27 +0200 Subject: [PATCH 55/93] fix import order --- src/hidimstat/desparsified_lasso.py | 3 +-- test/_utils/test_scenario.py | 2 +- test/test_desparsified_lasso.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index cd4ecbe72..cb88c9cb4 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -4,14 +4,13 @@ from joblib import Parallel, delayed from numpy.linalg import multi_dot, norm from scipy import stats -from scipy.linalg import inv +from scipy.linalg import inv, solve, toeplitz from sklearn.base import check_is_fitted, clone from sklearn.exceptions import NotFittedError from sklearn.linear_model import Lasso, LassoCV, MultiTaskLasso, MultiTaskLassoCV from sklearn.model_selection import KFold from sklearn.preprocessing import StandardScaler from sklearn.utils.validation import check_memory -from scipy.linalg import solve, toeplitz from hidimstat._utils.docstring import _aggregate_docstring from hidimstat._utils.regression import _alpha_max diff --git a/test/_utils/test_scenario.py b/test/_utils/test_scenario.py index 3c3bf612b..bd19711d8 100644 --- a/test/_utils/test_scenario.py +++ b/test/_utils/test_scenario.py @@ -9,9 +9,9 @@ from hidimstat._utils.scenario import ( _generate_2D_weight, _generate_3D_weight, + empirical_snr, multivariate_simulation, multivariate_simulation_spatial, - empirical_snr, ) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index d6b6b6317..46a70be30 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -7,7 +7,7 @@ from numpy.testing import assert_almost_equal from scipy.linalg import toeplitz from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import MultiTaskLassoCV, LassoCV +from sklearn.linear_model import LassoCV, MultiTaskLassoCV from sklearn.model_selection import KFold from hidimstat._utils.scenario import multivariate_simulation From ddb0b1b318d6bbc35fa8cf1f63ebd6ba4752139b Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 12:04:33 +0200 Subject: [PATCH 56/93] format --- src/hidimstat/desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index cb88c9cb4..02c4bbb51 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -788,7 +788,7 @@ def reid( covariance_hat = np.outer(sigma_hat, sigma_hat) * correlation_hat else: raise ValueError( - f"Not support combinaison of stationnary {stationary} and method {method}." + f"Unsupported combination of stationary {stationary} and method {method}."
) return covariance_hat From ff709aa54db1679fc5536f85e38ce33f09d793d6 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 18:39:53 +0200 Subject: [PATCH 57/93] change default value of the n_job --- src/hidimstat/desparsified_lasso.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 02c4bbb51..a8d9218a8 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -131,12 +131,12 @@ def __init__( tol=1e-4, max_iter=5000, random_state=1, - n_jobs=1, + n_jobs=None, ), centered=True, dof_ajdustement=False, # parameters for model_x - model_x=Lasso(max_iter=5000, tol=1e-3), + model_x=Lasso(max_iter=5000, tol=1e-3, n_jobs=None), alphas=None, alpha_max_fraction=0.01, random_state=None, From eda35b32494e793a2f6f575bfe9887d754a5aaf5 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 16 Oct 2025 18:42:16 +0200 Subject: [PATCH 58/93] fix bg --- src/hidimstat/desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index a8d9218a8..4ae31d95d 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -136,7 +136,7 @@ def __init__( centered=True, dof_ajdustement=False, # parameters for model_x - model_x=Lasso(max_iter=5000, tol=1e-3, n_jobs=None), + model_x=Lasso(max_iter=5000, tol=1e-3), alphas=None, alpha_max_fraction=0.01, random_state=None, From 0ed3f9c0b3eaa057a6d77f05b788c992e46f264b Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:29:09 +0200 Subject: [PATCH 59/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 802bf2401..e33788c7f 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -21,7 +21,7 @@ def _ungroup_beta(beta_hat, n_features, ward): Parameters ---------- - beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_task) + beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_tasks) Beta coefficients at cluster level n_features : int Number of features in original space From db8614123923722fcbf7229f3f83daac6ace96a3 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:29:20 +0200 Subject: [PATCH 60/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index e33788c7f..3823c616d 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -30,7 +30,7 @@ def _ungroup_beta(beta_hat, n_features, ward): Returns ------- - beta_hat_degrouped : ndarray, shape (n_features,) or (n_features, n_task) + beta_hat_degrouped : ndarray, shape (n_features,) or (n_features, n_tasks) Rescaled beta coefficients for individual features, weighted by inverse cluster size From 56073e895542fabf4fd69e1ffe6499ee0d7529e8 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:29:31 +0200 Subject: [PATCH 61/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion 
--- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 3823c616d..8007a0ab9 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -51,7 +51,7 @@ def _ungroup_beta(beta_hat, n_features, ward): # weighting the weight of beta with the size of the cluster beta_hat_degrouped = ward.inverse_transform(beta_hat) / clusters_size elif len(beta_hat.shape) == 2: - n_task = beta_hat.shape[1] + n_tasks = beta_hat.shape[1] beta_hat_degrouped = np.zeros((n_features, n_task)) for i in range(n_task): beta_hat_degrouped[:, i] = ( From a4182a7619a38f0f568ae66d92dcec93fa928414 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:29:50 +0200 Subject: [PATCH 62/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 8007a0ab9..b4f374034 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -52,7 +52,7 @@ def _ungroup_beta(beta_hat, n_features, ward): beta_hat_degrouped = ward.inverse_transform(beta_hat) / clusters_size elif len(beta_hat.shape) == 2: n_tasks = beta_hat.shape[1] - beta_hat_degrouped = np.zeros((n_features, n_task)) + beta_hat_degrouped = np.zeros((n_features, n_tasks)) for i in range(n_task): beta_hat_degrouped[:, i] = ( ward.inverse_transform(beta_hat[:, i]) / clusters_size From 8231b351a1a2894d974d03ae653276fad44b3c1a Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:30:00 +0200 Subject: [PATCH 63/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index b4f374034..4ddf958c6 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -53,7 +53,7 @@ def _ungroup_beta(beta_hat, n_features, ward): elif len(beta_hat.shape) == 2: n_tasks = beta_hat.shape[1] beta_hat_degrouped = np.zeros((n_features, n_tasks)) - for i in range(n_task): + for i in range(n_tasks): beta_hat_degrouped[:, i] = ( ward.inverse_transform(beta_hat[:, i]) / clusters_size ) From fbad0713a5a37e83855385fd0ca593bd54c69259 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:30:17 +0200 Subject: [PATCH 64/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 4ddf958c6..5d81be00b 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -71,7 +71,7 @@ def _degrouping(ward, beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_ ---------- ward : sklearn.cluster.FeatureAgglomeration Fitted clustering object containing the hierarchical structure - beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_task) + beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_tasks) 
Estimated parameters at cluster level pval : ndarray, shape (n_clusters,) P-values at cluster level From 0429b90fef570dc6be9fe913265f761ccd5bc10a Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:30:31 +0200 Subject: [PATCH 65/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 5d81be00b..ececfa47c 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -84,7 +84,7 @@ def _degrouping(ward, beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_ Returns ------- - beta_hat : ndarray, shape (n_features,) or (n_features, n_task) + beta_hat : ndarray, shape (n_features,) or (n_features, n_tasks) Rescaled parameter estimates for individual features pval : ndarray, shape (n_features,) P-values for individual features From 3298f2863233b26d5b0f5bf58c8192bbe34e5310 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:30:46 +0200 Subject: [PATCH 66/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index ececfa47c..e6741e1c3 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -169,7 +169,7 @@ def clustered_inference( X_init : ndarray, shape (n_samples, n_features) Original high-dimensional input data matrix. - y : ndarray, shape (n_samples,) or (n_samples, n_task) + y : ndarray, shape (n_samples,) or (n_samples, n_tasks) Target variable(s). Can be univariate or multivariate (temporal) data. ward : sklearn.cluster.FeatureAgglomeration From dd98062d173e8755bb872bbf11442c01eb7279ef Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:31:01 +0200 Subject: [PATCH 67/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index e6741e1c3..b5695a666 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -208,7 +208,7 @@ def clustered_inference( ward_ : FeatureAgglomeration Fitted clustering object. - beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_task) + beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_tasks) Estimated coefficients at cluster level. 
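Editor's note — the n_task/n_tasks renames in patches 59-64 all touch _ungroup_beta, which maps cluster-level coefficients back to feature space by broadcasting each cluster value to its member features and dividing by the cluster size. A self-contained sketch of that de-grouping step; the data and cluster count are illustrative:

import numpy as np
from sklearn.cluster import FeatureAgglomeration

X = np.random.randn(30, 100)
ward = FeatureAgglomeration(n_clusters=10).fit(X)

beta_hat = np.random.randn(10)  # one coefficient per cluster
# size of the cluster each feature belongs to
clusters_size = np.bincount(ward.labels_)[ward.labels_]
# broadcast cluster values to features, then share the mass across members
beta_degrouped = ward.inverse_transform(beta_hat) / clusters_size
assert beta_degrouped.shape == (100,)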
theta_hat : ndarray From 7913b211125c0238603270eaaee5cc5203b5cc8c Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:31:15 +0200 Subject: [PATCH 68/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index b5695a666..7c8e2404a 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -366,7 +366,7 @@ def ensemble_clustered_inference( X_init : ndarray, shape (n_samples, n_features) Original high-dimensional input data matrix. - y : ndarray, shape (n_samples,) or (n_samples, n_task) + y : ndarray, shape (n_samples,) or (n_samples, n_tasks) Target variable(s). Can be univariate or multivariate (temporal) data. ward : sklearn.cluster.FeatureAgglomeration From a9332a2fd717675e5d808d57e9d9c3ee626d6fcb Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:31:34 +0200 Subject: [PATCH 69/93] Update src/hidimstat/ensemble_clustered_inference.py Co-authored-by: bthirion --- src/hidimstat/ensemble_clustered_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 7c8e2404a..6af7f1aeb 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -545,7 +545,7 @@ def ensemble_clustered_inference_pvalue( Returns ------- - beta_hat : ndarray, shape (n_features,) or (n_features, n_task) + beta_hat : ndarray, shape (n_features,) or (n_features, n_tasks) Averaged coefficients across bootstraps selected : ndarray, shape (n_features,) Selected features: 1 for positive effects, -1 for negative effects, From 2aeef952c7f756dc1aa887c04a34243534e2f5c1 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:32:01 +0200 Subject: [PATCH 70/93] Update src/hidimstat/desparsified_lasso.py Co-authored-by: bthirion --- src/hidimstat/desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 4ae31d95d..812609ccc 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -100,7 +100,7 @@ class DesparsifiedLasso(BaseVariableImportance): Attributes ---------- - importances_ : ndarray + importances_ : ndarray of shape (n_features) Debiased coefficient estimates. pvalues_ : ndarray From f81b79d022208b5443f267570a84d62278b8099f Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:32:12 +0200 Subject: [PATCH 71/93] Update src/hidimstat/desparsified_lasso.py Co-authored-by: bthirion --- src/hidimstat/desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 812609ccc..7fac168b9 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -482,7 +482,7 @@ def _joblib_compute_residuals(X, id_column, clf, return_clf): X : ndarray of shape (n_samples, n_features) Input data matrix. id_column : int - Index i of feature to regress. + Index of feature to regress. clf : sklearn estimator Pre-configured estimator. 
return_clf : bool From a2d817abdda659f81c7ee9d0513750f890c07424 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:32:33 +0200 Subject: [PATCH 72/93] Update src/hidimstat/desparsified_lasso.py Co-authored-by: bthirion --- src/hidimstat/desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 7fac168b9..870612569 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -494,7 +494,7 @@ def _joblib_compute_residuals(X, id_column, clf, return_clf): Residuals from regression. precision_diagonal_i : float Diagonal entry i of precision matrix estimate, - computed as n * ||z||^2 / <x_i, z>^2. + computed as n * ||z_i||^2 / <x_i, z_i>^2. clf : sklearn estimator or None Fitted Lasso model if return_clf=True, else None. From cf24ca7860b5ed7fc918074a3305466986c5db49 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:34:13 +0200 Subject: [PATCH 73/93] Update test/test_desparsified_lasso.py Co-authored-by: bthirion --- test/test_desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 46a70be30..c17b2e377 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -96,7 +96,7 @@ def test_desparsified_group_lasso(): alpha = 0.1 corr = toeplitz(np.geomspace(1, rho_serial ** (n_target - 1), n_target)) - multitasklassoCV = MultiTaskLassoCV( + multi_task_lasso_cv = MultiTaskLassoCV( eps=1e-2, fit_intercept=False, cv=KFold(n_splits=5, shuffle=True, random_state=0), From cee28e0c3b1ef6948b995c7a19cba203a836f09c Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:34:31 +0200 Subject: [PATCH 74/93] Update test/test_desparsified_lasso.py Co-authored-by: bthirion --- test/test_desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index c17b2e377..e172612be 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -117,7 +117,7 @@ def test_desparsified_group_lasso(): with pytest.warns(Warning, match="'max_iter' has been increased to "): desparsified_lasso = DesparsifiedLasso( - model_y=multitasklassoCV, covariance=corr, save_model_x=True + model_y=multi_task_lasso_cv, covariance=corr, save_model_x=True ).fit(X, y) importances = desparsified_lasso.importance() From 0654edff72b9e4741877773b4431193e1a0cf8f8 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:34:44 +0200 Subject: [PATCH 75/93] Update test/test_desparsified_lasso.py Co-authored-by: bthirion --- test/test_desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index e172612be..15826874b 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -136,7 +136,7 @@ def test_desparsified_group_lasso(): and len(desparsified_lasso.clf_) == n_features ) - desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV, test="F").fit(X, y) + desparsified_lasso = DesparsifiedLasso(model_y=multi_task_lasso_cv, test="F").fit(X, y) importances = desparsified_lasso.importance() assert_almost_equal(importances, beta, decimal=1) From 1f41c87054711a24ab7b6c61d79014ab870f07d7 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:34:58 +0200 Subject: [PATCH 76/93] Update
test/test_desparsified_lasso.py Co-authored-by: bthirion --- test/test_desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 15826874b..09ec748e6 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -152,7 +152,7 @@ def test_desparsified_group_lasso(): # ValueError, desparsified_lasso, X=X, y=y, multioutput=True, covariance=bad_cov # ) desparsified_lasso = DesparsifiedLasso( - model_y=multitasklassoCV, covariance=bad_cov + model_y=multi_task_lasso_cv, covariance=bad_cov ).fit(X, y) with pytest.raises(ValueError): desparsified_lasso.importance() From d63e22087cd7690c3bd521db6f4afc8d5733e2cd Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:35:10 +0200 Subject: [PATCH 77/93] Update test/test_desparsified_lasso.py Co-authored-by: bthirion --- test/test_desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 09ec748e6..d14550c87 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -158,7 +158,7 @@ def test_desparsified_group_lasso(): desparsified_lasso.importance() with pytest.raises(AssertionError, match="Unknown test 'r2'"): - DesparsifiedLasso(model_y=multitasklassoCV, covariance=bad_cov, test="r2").fit( + DesparsifiedLasso(model_y=multi_task_lasso_cv, covariance=bad_cov, test="r2").fit( X, y ) From 0e23703dc4687b9766fac281a6abcac6e55d5843 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:35:24 +0200 Subject: [PATCH 78/93] Update test/test_desparsified_lasso.py Co-authored-by: bthirion --- test/test_desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index d14550c87..0b755ecf9 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -172,7 +172,7 @@ def test_exception(): signal_noise_ratio = 50 rho_serial = 0.9 corr = toeplitz(np.geomspace(1, rho_serial ** (n_target - 1), n_target)) - multitasklassoCV = MultiTaskLassoCV( + multi_task_lasso_cv = MultiTaskLassoCV( eps=1e-2, fit_intercept=False, cv=KFold(n_splits=5, shuffle=True, random_state=0), From f4e54e92897c55c597782de5f8c249a855bb2079 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:35:35 +0200 Subject: [PATCH 79/93] Update test/test_desparsified_lasso.py Co-authored-by: bthirion --- test/test_desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_desparsified_lasso.py b/test/test_desparsified_lasso.py index 0b755ecf9..741a7b303 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -214,7 +214,7 @@ def test_exception(): desparsified_lasso.test = "r2" desparsified_lasso.importance() - desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV).fit(X, y) + desparsified_lasso = DesparsifiedLasso(model_y=multi_task_lasso_cv).fit(X, y) with pytest.warns(Warning, match="X won't be used."): desparsified_lasso.importance(X=X) with pytest.warns(Warning, match="y won't be used."): From 680365e5b0f2004e5066e15ac788713f6f251f37 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 24 Oct 2025 17:37:02 +0200 Subject: [PATCH 80/93] fix multitasklassocv change name --- test/test_desparsified_lasso.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/test_desparsified_lasso.py 
b/test/test_desparsified_lasso.py index 741a7b303..b987a2195 100644 --- a/test/test_desparsified_lasso.py +++ b/test/test_desparsified_lasso.py @@ -136,7 +136,9 @@ def test_desparsified_group_lasso(): and len(desparsified_lasso.clf_) == n_features ) - desparsified_lasso = DesparsifiedLasso(model_y=multi_task_lasso_cv, test="F").fit(X, y) + desparsified_lasso = DesparsifiedLasso(model_y=multi_task_lasso_cv, test="F").fit( + X, y + ) importances = desparsified_lasso.importance() assert_almost_equal(importances, beta, decimal=1) @@ -158,9 +160,9 @@ def test_desparsified_group_lasso(): desparsified_lasso.importance() with pytest.raises(AssertionError, match="Unknown test 'r2'"): - DesparsifiedLasso(model_y=multi_task_lasso_cv, covariance=bad_cov, test="r2").fit( - X, y - ) + DesparsifiedLasso( + model_y=multi_task_lasso_cv, covariance=bad_cov, test="r2" + ).fit(X, y) def test_exception(): @@ -202,14 +204,14 @@ def test_exception(): DesparsifiedLasso(model_y=RandomForestClassifier()) with pytest.raises(AssertionError, match="Unknown test 'r2'"): DesparsifiedLasso(test="r2") - desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV) + desparsified_lasso = DesparsifiedLasso(model_y=multi_task_lasso_cv) with pytest.raises( ValueError, match="The Desparsified Lasso requires to be fit before any analysis", ): desparsified_lasso.importance() - desparsified_lasso = DesparsifiedLasso(model_y=multitasklassoCV).fit(X, y) + desparsified_lasso = DesparsifiedLasso(model_y=multi_task_lasso_cv).fit(X, y) with pytest.raises(ValueError, match="Unknown test 'r2'"): desparsified_lasso.test = "r2" desparsified_lasso.importance() From e3a73d35a2088f23a0241853b630d642f6dcf688 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 24 Oct 2025 17:39:18 +0200 Subject: [PATCH 81/93] Update src/hidimstat/desparsified_lasso.py Co-authored-by: bthirion --- src/hidimstat/desparsified_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 870612569..46ae3bfaf 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -492,7 +492,7 @@ def _joblib_compute_residuals(X, id_column, clf, return_clf): ------- z : ndarray of shape (n_samples,) Residuals from regression. - precision_diagonal_i : float + precision_diagonal : float Diagonal entry i of precision matrix estimate, computed as n * ||z_i||^2 / <x_i, z_i>^2. clf : sklearn estimator or None Fitted Lasso model if return_clf=True, else None. From aea60038f1d9f039ec4e2439ff0686539cc74897 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 24 Oct 2025 17:48:44 +0200 Subject: [PATCH 82/93] Add shape in docstring --- src/hidimstat/desparsified_lasso.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 46ae3bfaf..7bc909a46 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -103,22 +103,22 @@ class DesparsifiedLasso(BaseVariableImportance): importances_ : ndarray of shape (n_features) Debiased coefficient estimates. - pvalues_ : ndarray + pvalues_ : ndarray of shape (n_features) Two-sided p-values. - pvalues_corr_ : ndarray + pvalues_corr_ : ndarray of shape (n_features) Multiple testing corrected p-values. - sigma_hat_ : float or ndarray + sigma_hat_ : float or ndarray of shape (n_tasks, n_tasks) Estimated noise level. - precision_diagonal_ : ndarray + precision_diagonal_ : ndarray of shape (n_features) Diagonal entries of precision matrix.
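Editor's note — patches 71-82 keep refining the docstrings around the nodewise-Lasso step. The computation they describe is: regress column i on all other columns, take the residual z_i, and estimate the precision-matrix diagonal as n * ||z_i||^2 / <x_i, z_i>^2. A hedged sketch of that logic; the function name and alpha are illustrative, not the package's API:

import numpy as np
from sklearn.linear_model import Lasso

def nodewise_precision_diagonal(X, i, alpha=0.1):
    X_minus_i = np.delete(X, i, axis=1)
    x_i = X[:, i]
    clf = Lasso(alpha=alpha).fit(X_minus_i, x_i)
    z = x_i - clf.predict(X_minus_i)  # residual of feature i on the rest
    # n * ||z_i||^2 / <x_i, z_i>^2
    return X.shape[0] * np.sum(z**2) / np.dot(x_i, z) ** 2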
confidence_bound_min_ : ndarray + confidence_bound_min_ : ndarray of shape (n_features) Lower confidence bounds. - confidence_bound_max_ : ndarray + confidence_bound_max_ : ndarray of shape (n_features) Upper confidence bounds. """ From 79effbfaf038bb1f18b4bb0935f5ba85b9f7a4c7 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 23 Oct 2025 11:00:56 +0200 Subject: [PATCH 83/93] Add new workflow for maintenance (#501) add new workflow --- .github/workflows/ci_maintenance.yml | 70 ++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 .github/workflows/ci_maintenance.yml diff --git a/.github/workflows/ci_maintenance.yml b/.github/workflows/ci_maintenance.yml new file mode 100644 index 000000000..f9796641d --- /dev/null +++ b/.github/workflows/ci_maintenance.yml @@ -0,0 +1,70 @@ +# Actions to run during a pull request for maintenance + +name: 'CI maintenance' +on: + pull_request: + branches: *.*.* + push: + branches: *.*.* + +# disable all permissions for the workflow +permissions: {} + +# Cancel existing runs +concurrency: + group: ${{ github.workflow }}-pr-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + get_information: + name: Get Information + # information: the commit message for the PR + runs-on: ubuntu-latest + outputs: + COMMIT_MSG: ${{ steps.get_commit_message.outputs.COMMIT_MSG }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + with: + fetch-depth: 3 + ref: ${{ github.event.pull_request.head.sha || github.ref }} + # required to fetch the previous commits and their messages; + # in a PR, the last commit is the merge commit with main - name: Get commit message + id: get_commit_message + # return an empty message when the event is not from a PR + run: | + EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64) + echo "COMMIT_MSG<<$EOF" >> $GITHUB_OUTPUT + if [ "${{ github.event_name }}" == "pull_request_target" ]; then + echo "COMMIT_MSG=$(git log -1 --pretty=%B ${{ github.event.pull_request.head.sha }})" # used for debugging + echo "$(git log -1 --pretty=%B ${{ github.event.pull_request.head.sha }})" >> $GITHUB_OUTPUT + echo "$EOF" >> $GITHUB_OUTPUT + else + echo "$EOF" >> $GITHUB_OUTPUT + fi + + linter: + name: Linter + needs: [get_information] + uses: ./.github/workflows/call_linter.yml + + tests: + name: tests + needs: [linter, get_information] + uses: ./.github/workflows/call_test_package.yml + with: + skip_test: ${{ contains(needs.get_information.outputs.COMMIT_MSG, '[skip tests]') }} + + tests_publish: + name: tests_publish + needs: [tests] + if: | + ${{ github.event.pull_request.draft == false + || contains(needs.get_information.outputs.COMMIT_MSG, '[doc ') + }} + permissions: + pull-requests: write + secrets: inherit + uses: ./.github/workflows/call_publish_result.yml From bb09246b3c65ecbf28a40e10d502be5456c85d27 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 23 Oct 2025 11:15:31 +0200 Subject: [PATCH 84/93] PR CI maintenance (#502) * fix ci * fix condition for ci * add comment --- .github/workflows/ci_maintenance.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_maintenance.yml b/.github/workflows/ci_maintenance.yml index f9796641d..f47a3bbb5 100644 --- a/.github/workflows/ci_maintenance.yml +++ b/.github/workflows/ci_maintenance.yml @@ -2,10 +2,12 @@ name: 'CI maintenance' on: + # see the following page for regular expressions + #
https://docs.github.com/en/actions/reference/workflows-and-actions/workflow-syntax#filter-pattern-cheat-sheet pull_request: - branches: *.*.* + branches: '[0-9]+.[0-9]+.[0-9]+' push: - branches: *.*.* + branches: '[0-9]+.[0-9]+.[0-9]+' # disable all permissions for the workflow permissions: {} From 63166262ea1c9a8f7c3205b7ec15c73faea74b3c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 24 Oct 2025 18:06:41 +0200 Subject: [PATCH 85/93] Remove parallel generation of example (#497) * remove the generation of example in parallel * remove option memory [skip tests] * add the tracking of memory * fix issue of memory load from example fmri * fix example * remove error modification * fix codespell * fix number of jobs * fix example * remove parallelization of short example * [skip tests] * update examples * remove modification [skip tests] * improve plot fmri --- docs/tools/conf.py | 4 ++-- examples/plot_2D_simulation_example.py | 4 ++-- examples/plot_fmri_data_example.py | 23 ++++++++++++++--------- examples/plot_knockoffs_wisconsin.py | 2 +- pyproject.toml | 1 + 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/docs/tools/conf.py b/docs/tools/conf.py index 038471ebc..3e719b537 100644 --- a/docs/tools/conf.py +++ b/docs/tools/conf.py @@ -144,8 +144,8 @@ "image_scrapers": ("matplotlib",), "doc_module": "hidimstat", "backreferences_dir": "./generated/gallery/backreference/", - "parallel": True, - "show_memory": False, # can't show memory if it's in parallel + "parallel": False, + "show_memory": True, # can't show memory if it's in parallel "reference_url": { # The module we locally document (so, hidimstat) uses None "hidimstat": None, diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index 5e423daca..b6a83da53 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -109,8 +109,8 @@ fwer_target = 0.1 delta = 6 -# computation parameter -n_jobs = 1 +# number of worker +n_jobs = 3 # %% # Computing z-score thresholds for support estimation diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 9bd81059d..45bbb7740 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -29,7 +29,6 @@ predictive regions across various tasks. """ -import resource import warnings import numpy as np @@ -61,13 +60,8 @@ "ignore", message="The provided image has no sform in its header." ) -# Limit the ressoruce use for the example to 5 G or maximum of possible. -limit_5G = int(5 * 1e9) -soft, hard = resource.getrlimit(resource.RLIMIT_AS) -new_soft_limit = limit_5G if soft < 0 else min(limit_5G, soft) -new_hard_limit = limit_5G if hard < 0 else min(limit_5G, hard) -resource.setrlimit(resource.RLIMIT_AS, (new_soft_limit, new_hard_limit)) -n_jobs = 1 +# number of worker +n_jobs = 3 # %% @@ -144,6 +138,17 @@ def preprocess_haxby(subject=2, memory=None): # Making the inference with several algorithms # -------------------------------------------- +# Limit the resource usage for the algorithm to 5 GB, or the maximum available.
+# +import resource + +limit_5G = int(5 * 1e9) +soft, hard = resource.getrlimit(resource.RLIMIT_AS) +new_soft_limit = limit_5G if soft < 0 else min(limit_5G, soft) +new_hard_limit = limit_5G if hard < 0 else min(limit_5G, hard) +resource.setrlimit(resource.RLIMIT_AS, (new_soft_limit, new_hard_limit)) + +# Default estimator estimator = LassoCV( eps=1e-2, fit_intercept=False, @@ -154,7 +159,7 @@ def preprocess_haxby(subject=2, memory=None): n_jobs=1, ) -# + # First, we try to recover the discriminative pattern by computing # p-values from desparsified lasso. # Due to the size of the X, it's not possible to use this method with a limit diff --git a/examples/plot_knockoffs_wisconsin.py b/examples/plot_knockoffs_wisconsin.py index 257ef2e29..8a001f66b 100644 --- a/examples/plot_knockoffs_wisconsin.py +++ b/examples/plot_knockoffs_wisconsin.py @@ -23,7 +23,6 @@ data = load_breast_cancer() X = data.data y = data.target -# Random seed for reproducibility X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0) @@ -165,6 +164,7 @@ tol_gauss=1e-15, preconfigure_estimator=None, fdr=fdr, + n_jobs=3, ) # Count how many selected features are actually noise diff --git a/pyproject.toml b/pyproject.toml index 635d31f64..03d0d5930 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ license = { file = "LICENSE" } doc = [ "matplotlib >= 3.8.0, < 4", "nilearn >= 0.11.0, < 1", + "memory_profiler ", "numpydoc >= 1.0.0, < 2", "pydata_sphinx_theme >= 0.15.1, < 1", "seaborn >= 0.13, < 1", From be52459745f946f5282d95266d188e8612d31f9e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 24 Oct 2025 18:49:49 +0200 Subject: [PATCH 86/93] fix example? --- examples/plot_2D_simulation_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index b6a83da53..6c6df4298 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -110,7 +110,7 @@ delta = 6 # number of worker -n_jobs = 3 +n_jobs = 2 # %% # Computing z-score thresholds for support estimation From 440eea7e23a4493412387db5f0344642d60bd786 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 27 Oct 2025 14:38:47 +0100 Subject: [PATCH 87/93] remove warnings --- examples/plot_2D_simulation_example.py | 4 ++-- src/hidimstat/ensemble_clustered_inference.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index 6c6df4298..03de9f3e5 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -184,8 +184,8 @@ def weight_map_2D_extended(shape, roi_size, delta): # compute desparsified lasso -desparsified_lasso = DesparsifiedLasso(n_jobs=n_jobs, random_state=0).fit(X_init, y) -desparsified_lasso.importance(X_init, y) +desparsified_lasso = DesparsifiedLasso(n_jobs=n_jobs, random_state=0) +desparsified_lasso.fit_importance(X_init, y) # compute estimated support (first method) zscore = zscore_from_pval(desparsified_lasso.pvalues_, 1 - desparsified_lasso.pvalues_) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 6af7f1aeb..5ed03dc53 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -284,7 +284,7 @@ def clustered_inference( X_reduced, y, ) - desparsified_lassos.importance(X_reduced, y) + desparsified_lassos.importance() return 
ward_, desparsified_lassos From 7f04ebd94bf7a7ea7d12473ccc37c8754ec689c9 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 27 Oct 2025 14:39:13 +0100 Subject: [PATCH 88/93] remove memory issue --- src/hidimstat/desparsified_lasso.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/desparsified_lasso.py b/src/hidimstat/desparsified_lasso.py index 7bc909a46..9b8ae9f1f 100644 --- a/src/hidimstat/desparsified_lasso.py +++ b/src/hidimstat/desparsified_lasso.py @@ -288,10 +288,10 @@ def fit(self, X, y): clf=seed_estimator( clone(self.model_x).set_params( alpha=alphas[i], - precompute=np.delete(np.delete(gram, i, axis=0), i, axis=1), ), random_state=rng_spwan, ), + gram=gram, # gram matrix is passed to the job to avoid memory issue return_clf=self.save_model_x, ) for i, rng_spwan in enumerate(rng.spawn(n_features)) @@ -467,10 +467,10 @@ def fit_importance(self, X, y): Desparsified lasso coefficient estimates. """ self.fit(X, y) - return self.importance(X, y) + return self.importance() -def _joblib_compute_residuals(X, id_column, clf, return_clf): +def _joblib_compute_residuals(X, id_column, clf, gram, return_clf): """ Compute nodewise Lasso regression for desparsified Lasso estimation. @@ -509,6 +509,9 @@ def _joblib_compute_residuals(X, id_column, clf, return_clf): X_minus_i = np.delete(X, id_column, axis=1) X_i = np.copy(X[:, id_column]) + clf.set_params( + precompute=np.delete(np.delete(gram, id_column, axis=0), id_column, axis=1) + ) # Fitting the Lasso model and computing the residuals clf.fit(X_minus_i, X_i) z = X_i - clf.predict(X_minus_i) From ae3aa7361d1ead93ca7312f837660bd8f74ae64c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 27 Oct 2025 15:29:33 +0100 Subject: [PATCH 89/93] fix test --- src/hidimstat/base_variable_importance.py | 44 +------------------ src/hidimstat/ensemble_clustered_inference.py | 16 +++---- test/test_ensemble_clustered_inference.py | 8 ++-- 3 files changed, 14 insertions(+), 54 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 2c4d6bbbb..e7b9a7e25 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -101,10 +101,6 @@ def _selection_generic( return no_mask -from hidimstat.statistical_tools.multiple_testing import fdr_threshold -from hidimstat.statistical_tools.aggregation import quantile_aggregation - - class BaseVariableImportance(BaseEstimator): """ Base class for variable importance methods. @@ -133,45 +129,9 @@ def __init__(self): self.importances_ = None self.pvalues_ = None - def selection_fdr( - self, - fdr, - fdr_control="bhq", - reshaping_function=None, - adaptive_aggregation=False, - gamma=0.5, - ): + def _check_importance(self): """ - Performs feature selection based on False Discovery Rate (FDR) control. - - This method selects features by controlling the FDR using either p-values. - It supports different FDR control methods and optional adaptive aggregation - of the statistical values. - - Parameters - ---------- - fdr : float - The target false discovery rate level (between 0 and 1) - fdr_control: str, default="bhq" - The FDR control method to use. 
Options are: - "bhq": Benjamini-Hochberg procedure - 'bhy': Benjamini-Hochberg-Yekutieli procedure - reshaping_function: callable, default=None - Reshaping function for BHY method, default uses sum of reciprocals - adaptive_aggregation: bool, default=False - If True, uses adaptive weights for p-value aggregation - gamma: float, default=0.5 - The gamma parameter for quantile aggregation of p-values (between 0 and 1) - - Returns - ------- - numpy.ndarray - Boolean array indicating selected features (True for selected, False for not selected) - - Raises - ------ - AssertionError - If list_pvalues_ attribute is missing or fdr_control is invalid + Checks if the importance scores have been computed. """ if self.importances_ is None: raise ValueError( diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index fb28296a5..f9e96971c 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -6,10 +6,11 @@ from sklearn.cluster import FeatureAgglomeration from sklearn.exceptions import NotFittedError from sklearn.preprocessing import StandardScaler -from sklearn.utils import check_random_state, resample +from sklearn.utils import resample from hidimstat.base_variable_importance import BaseVariableImportance from hidimstat.desparsified_lasso import DesparsifiedLasso +from hidimstat._utils.utils import check_random_state @@ -127,7 +128,6 @@ def __init__( def fit(self, X, y): rng = check_random_state(self.random_state) - seed = rng.randint(1) if self.verbose > 0: print( @@ -142,12 +142,12 @@ def fit(self, X, y): y, self.train_size, self.groups, - i, + rng_spawn, self.ward, self.scaler_sampling, self.variable_importance, ) - for i in np.arange(seed, seed + self.n_bootstraps) + for rng_spawn in rng.spawn(self.n_bootstraps) ) return self @@ -287,7 +287,7 @@ def _bootstrap_run_fit( y, train_size, groups, - seed, + rng, ward, scaler_sampling, variable_importance, @@ -349,7 +349,7 @@ def _bootstrap_run_importance(ward_, scaler_sampling_, variable_importance_, X, return importance, pvalue, pvalue_corr -def _subsampling(n_samples, train_size, groups=None, seed=0): +def _subsampling(n_samples, train_size, groups=None, random_state=None): """ Random subsampling for statistical inference. @@ -362,7 +362,7 @@ def _subsampling(n_samples, train_size, groups=None, seed=0): groups : ndarray, shape (n_samples,), optional (default=None) Group labels for samples. If not None, a subset of groups is selected. - seed : int, optional (default=0) + random_state : numpy.random.Generator or None (default=None) Random seed for reproducibility.
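Editor's note — patch 89 replaces the integer seed of _subsampling with a numpy Generator spawned once per bootstrap, bridging to sklearn's resample through a RandomState built on the Generator's bit generator. A sketch of that pattern; the helper name and the ungrouped signature are illustrative:

import numpy as np
from sklearn.utils import resample

def subsample(n_samples, train_size, rng):
    index_row = np.arange(n_samples)
    # sklearn's resample expects a seed or RandomState, so wrap the
    # Generator's bit generator, as the diff above does
    return resample(
        index_row,
        n_samples=int(n_samples * train_size),
        replace=False,
        random_state=np.random.RandomState(rng.bit_generator),
    )

train_index = subsample(100, 0.7, np.random.default_rng(0))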
Returns @@ -375,7 +375,7 @@ def _subsampling(n_samples, train_size, groups=None, seed=0): index_row, n_samples=int(len(index_row) * train_size), replace=False, - random_state=seed, + random_state=np.random.RandomState(random_state.bit_generator), ) if groups is not None: train_index = np.arange(n_samples)[np.isin(groups, train_index)] diff --git a/test/test_ensemble_clustered_inference.py b/test/test_ensemble_clustered_inference.py index cd3d9a143..4bc8753a3 100644 --- a/test/test_ensemble_clustered_inference.py +++ b/test/test_ensemble_clustered_inference.py @@ -25,7 +25,7 @@ def set_desparsified_lasso_multi_time(): random_state=1, n_jobs=1, ) - return DesparsifiedLasso(lasso_cv=multitasklassoCV) + return DesparsifiedLasso(model_y=multitasklassoCV) # Scenario 1: data with no temporal dimension @@ -263,7 +263,7 @@ def test_ensemble_clustered_inference(): n_bootstraps=n_bootstraps, ).fit(X_init, y) EnCluDl.importance(X_init, y) - selected = EnCluDl.selection_fdr(fdr=0.1) + selected = EnCluDl.fdr_selection(fdr=0.1) expected = np.zeros(n_features) expected[:support_size] = 1.0 @@ -318,7 +318,7 @@ def test_ensemble_clustered_inference_temporal_data(): n_bootstraps=n_bootstraps, ).fit(X, y) EnCluDl.importance(X, y) - selected = EnCluDl.selection_fdr(fdr=0.1, fdr_control="bhq") + selected = EnCluDl.fdr_selection(fdr=0.1, fdr_control="bhq") expected = np.zeros(n_features) expected[:support_size] = 1.0 @@ -331,7 +331,7 @@ def test_ensemble_clustered_inference_temporal_data(): ) # different aggregation method - selected = EnCluDl.selection_fdr(fdr=0.1, fdr_control="bhy") + selected = EnCluDl.fdr_selection(fdr=0.1, fdr_control="bhy") expected = np.zeros(n_features) expected[:support_size] = 1.0 From 3406819617330f746d23c05919859d94cf9ad6d5 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 27 Oct 2025 15:30:29 +0100 Subject: [PATCH 90/93] fix format --- src/hidimstat/ensemble_clustered_inference.py | 2 +- test/test_ensemble_clustered_inference.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index f9e96971c..712506c14 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -8,9 +8,9 @@ from sklearn.preprocessing import StandardScaler from sklearn.utils import resample +from hidimstat._utils.utils import check_random_state from hidimstat.base_variable_importance import BaseVariableImportance from hidimstat.desparsified_lasso import DesparsifiedLasso -from hidimstat._utils.utils import check_random_state class EnsembleClusteredInference(BaseVariableImportance): diff --git a/test/test_ensemble_clustered_inference.py b/test/test_ensemble_clustered_inference.py index 4bc8753a3..1390de446 100644 --- a/test/test_ensemble_clustered_inference.py +++ b/test/test_ensemble_clustered_inference.py @@ -10,9 +10,9 @@ from sklearn.model_selection import KFold from sklearn.preprocessing import StandardScaler -from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference -from hidimstat.desparsified_lasso import DesparsifiedLasso from hidimstat._utils.scenario import multivariate_simulation +from hidimstat.desparsified_lasso import DesparsifiedLasso +from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference def set_desparsified_lasso_multi_time(): From cc2f4f01e6459e070328ab9025d60e054b64ff11 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 31 Oct 2025 15:25:29 +0100 Subject: [PATCH 91/93] fix 
example --- examples/plot_2D_simulation_example.py | 6 ++- examples/plot_fmri_data_example.py | 61 ++++++++++++-------------- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index 7d7e9ad8a..dce974e25 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -225,10 +225,12 @@ def weight_map_2D_extended(shape, roi_size, delta): # ensemble of clustered desparsified lasso (EnCluDL) ensemble_clustered_inference = EnsembleClusteredInference( - ward=ward, scaler_sampling=StandardScaler(), n_bootstraps=1 + ward=ward, scaler_sampling=StandardScaler(), n_bootstraps=5 ) ensemble_clustered_inference.fit_importance(X_init, y) -selected_ecdl = ensemble_clustered_inference.selection_fdr(fdr=fwer_target) +selected_ecdl = ensemble_clustered_inference.fdr_selection( + fdr=fwer_target, alternative_hypothesis=None +) # %% # Results diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 7271af7bc..14c7b4d01 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -47,12 +47,7 @@ from sklearn.utils import Bunch from hidimstat.desparsified_lasso import DesparsifiedLasso -from hidimstat.ensemble_clustered_inference import ( - clustered_inference, - clustered_inference_pvalue, - ensemble_clustered_inference, - ensemble_clustered_inference_pvalue, -) +from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference from hidimstat.statistical_tools.p_values import zscore_from_pval # Remove warnings during loading data @@ -183,19 +178,22 @@ def preprocess_haxby(subject=2, memory=None): # %% # Now, the clustered inference algorithm which combines parcellation # and high-dimensional inference (c.f. References). -ward_, cl_desparsified_lasso = clustered_inference( - X, - y, - ward, +clustered_inference = EnsembleClusteredInference( + variable_importance=DesparsifiedLasso( + noise_method="median", + model_y=clone(estimator), + tolerance_reid=1e-2, + n_jobs=1, + ), + ward=ward, scaler_sampling=StandardScaler(), - model_y=clone(estimator), - tolerance_reid=1e-2, - random_state=1, - n_jobs=n_jobs, -) -beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue( - X.shape[0], None, ward_, cl_desparsified_lasso + n_bootstraps=1, + n_jobs=1, + random_state=0, ) +beta_hat = clustered_inference.fit_importance(X, y) +pval_cdl = clustered_inference.pvalues_ +one_minus_pval_cdl = 1 - clustered_inference.pvalues_ # %% # Below, we run the ensemble clustered inference algorithm which adds a @@ -205,24 +203,23 @@ def preprocess_haxby(subject=2, memory=None): # then 5 statistical maps are produced and aggregated into one. # However you might benefit from clustering randomization taking # `n_bootstraps=25` or `n_bootstraps=100`, also we set `n_jobs=n_jobs`. 
-list_ward, list_cl_desparsified_lasso = ensemble_clustered_inference( - X, - y, - ward, - groups=groups, +ensemble_clustered_inference = EnsembleClusteredInference( + variable_importance=DesparsifiedLasso( + noise_method="median", + model_y=clone(estimator), + tolerance_reid=1e-2, + n_jobs=1, + ), + ward=ward, scaler_sampling=StandardScaler(), n_bootstraps=5, - model_y=clone(estimator), - tolerance_reid=1e-2, - random_state=2, - n_jobs=n_jobs, + n_jobs=1, + random_state=0, ) -beta_hat, selected = ensemble_clustered_inference_pvalue( - X.shape[0], - False, - list_ward, - list_cl_desparsified_lasso, - fdr=0.1, +beta_hat = ensemble_clustered_inference.fit_importance(X, y) +pval_cdl = ensemble_clustered_inference.pvalues_ +selected = ensemble_clustered_inference.fdr_selection( + fdr=0.1, alternative_hypothesis=None ) # %% From 0d4b68cc9b439f1d16f67c4f2efed220878156c9 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 31 Oct 2025 15:25:56 +0100 Subject: [PATCH 92/93] remove cv from fit --- src/hidimstat/ensemble_clustered_inference.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py index 712506c14..3d2ed6220 100644 --- a/src/hidimstat/ensemble_clustered_inference.py +++ b/src/hidimstat/ensemble_clustered_inference.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np from joblib import Parallel, delayed from sklearn.base import check_is_fitted, clone @@ -249,7 +247,7 @@ def importance(self, X, y): self.pvalues_corr_ = np.mean(self.list_pvalues_corr_, axis=0) return self.importances_ - def fit_importance(self, X, y, cv=None): + def fit_importance(self, X, y): """ Fits the model to the data and computes feature importance. @@ -262,8 +260,6 @@ def fit_importance(self, X, y, cv=None): Training data matrix. y : array-like of shape (n_samples,) Target values. - cv : None or int, optional (default=None) - Not used. Included for compatibility. A warning will be issued if provided. Returns ------- @@ -276,8 +272,6 @@ def fit_importance(self, X, y, cv=None): Also sets the importances\_ and pvalues\_ attributes on the instance. See fit() and importance() for details on the underlying computations. """ - if cv is not None: - warnings.warn("cv won't be used") self.fit(X, y) return self.importance(X, y) From 2666166cf40a3435d83b4802e96dea3cca1e3852 Mon Sep 17 00:00:00 2001 From: jpaillard Date: Fri, 7 Nov 2025 14:21:37 +0100 Subject: [PATCH 93/93] add to API --- docs/src/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/api.rst b/docs/src/api.rst index dd6d6d2b9..e47e0e0eb 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -30,6 +30,7 @@ Feature Importance Classes D0CRT ModelXKnockoff DesparsifiedLasso + EnsembleClusteredInference Feature Importance functions ============================
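Editor's note — with the series applied, the class-based workflow shown in the reworked examples reduces to a few calls. A minimal end-to-end sketch; the synthetic data and the reliance on the constructor's default variable_importance are assumptions, not taken from the patches:

import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.preprocessing import StandardScaler
from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 400))
y = X[:, :5].sum(axis=1) + rng.standard_normal(100)

eci = EnsembleClusteredInference(
    ward=FeatureAgglomeration(n_clusters=40),
    scaler_sampling=StandardScaler(),
    n_bootstraps=5,
    random_state=0,
)
beta_hat = eci.fit_importance(X, y)  # fit + importance in one call
selected = eci.fdr_selection(fdr=0.1, alternative_hypothesis=None)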