diff --git a/docs/src/api.rst b/docs/src/api.rst
index dd6d6d2b..e47e0e0e 100644
--- a/docs/src/api.rst
+++ b/docs/src/api.rst
@@ -30,6 +30,7 @@ Feature Importance Classes
    D0CRT
    ModelXKnockoff
    DesparsifiedLasso
+   EnsembleClusteredInference
 
 Feature Importance functions
 ============================
diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py
index 8778f89f..ae7caa40 100644
--- a/examples/plot_2D_simulation_example.py
+++ b/examples/plot_2D_simulation_example.py
@@ -211,6 +211,3 @@ def weight_map_2D_extended(shape, roi_size, delta):
 from sklearn.preprocessing import StandardScaler
 
-from hidimstat.ensemble_clustered_inference import (
-    clustered_inference,
-    clustered_inference_pvalue,
-)
+from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference
@@ -221,21 +218,16 @@ def weight_map_2D_extended(shape, roi_size, delta):
 )
 
 # clustered desparsified lasso (CluDL)
-ward_, desparsified_lasso_ = clustered_inference(
-    X_init, y, ward, scaler_sampling=StandardScaler(), random_state=0
-)
-beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
-    clustered_inference_pvalue(n_samples, False, ward_, desparsified_lasso_)
+clustered_inference = EnsembleClusteredInference(
+    ward=ward, scaler_sampling=StandardScaler(), n_bootstraps=1
 )
+clustered_inference.fit_importance(X_init, y)
 
-# compute estimated support (first method)
-zscore = zscore_from_pval(pval, one_minus_pval)
-selected_cdl = zscore > thr_c  # use the "clustering threshold"
-
-# compute estimated support (second method)
-selected_cdl = np.logical_or(
-    pval_corr < fwer_target / 2, one_minus_pval_corr < fwer_target / 2
+# compute estimated support
+zscore = zscore_from_pval(
+    clustered_inference.pvalues_, 1 - clustered_inference.pvalues_
 )
+selected_cdl = zscore > thr_c  # use the "clustering threshold"
 
 
 # %%
@@ -251,20 +243,12 @@ def weight_map_2D_extended(shape, roi_size, delta):
 )
 
 # ensemble of clustered desparsified lasso (EnCluDL)
-list_ward, list_desparsified_lasso = ensemble_clustered_inference(
-    X_init,
-    y,
-    ward,
-    scaler_sampling=StandardScaler(),
-    random_state=0,
-    n_jobs=n_jobs,
+ensemble_clustered_inference = EnsembleClusteredInference(
+    ward=ward, scaler_sampling=StandardScaler(), n_bootstraps=5
 )
-beta_hat, selected_ecdl = ensemble_clustered_inference_pvalue(
-    n_samples,
-    False,
-    list_ward,
-    list_desparsified_lasso,
-    fdr=fwer_target,
+ensemble_clustered_inference.fit_importance(X_init, y)
+selected_ecdl = ensemble_clustered_inference.fdr_selection(
+    fdr=fwer_target, alternative_hypothesis=None
 )
diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py
index 63b0b96f..7e7d6e99 100644
--- a/examples/plot_fmri_data_example.py
+++ b/examples/plot_fmri_data_example.py
@@ -47,12 +47,7 @@ from sklearn.utils import Bunch
 
 from hidimstat.desparsified_lasso import DesparsifiedLasso
-from hidimstat.ensemble_clustered_inference import (
-    clustered_inference,
-    clustered_inference_pvalue,
-    ensemble_clustered_inference,
-    ensemble_clustered_inference_pvalue,
-)
+from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference
 from hidimstat.statistical_tools.p_values import zscore_from_pval
 
 # Remove warnings during loading data
@@ -183,19 +178,25 @@ def preprocess_haxby(subject=2, memory=None):
 # %%
 # Now, the clustered inference algorithm which combines parcellation
 # and high-dimensional inference (c.f. References).
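+# Conceptually, CluDL is the ensemble estimator run with a single clustering,
+# i.e. `n_bootstraps=1`; the ensemble variant further below simply averages
+# several such runs computed on random subsamples.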
-ward_, cl_desparsified_lasso = clustered_inference(
-    X,
-    y,
-    ward,
+clustered_inference = EnsembleClusteredInference(
+    variable_importance=DesparsifiedLasso(
+        noise_method="median",
+        estimator=clone(estimator),
+        tolerance_reid=1e-2,
+        n_jobs=1,
+    ),
+    ward=ward,
     scaler_sampling=StandardScaler(),
-    estimator=clone(estimator),
-    tolerance_reid=1e-2,
-    random_state=1,
-    n_jobs=n_jobs,
-)
-beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue(
-    X.shape[0], None, ward_, cl_desparsified_lasso
+    n_bootstraps=1,
+    n_jobs=1,
+    random_state=0,
 )
+beta_hat = clustered_inference.fit_importance(X, y)
+pval_cdl = clustered_inference.pvalues_
+one_minus_pval_cdl = 1 - clustered_inference.pvalues_
 
 # %%
 # Below, we run the ensemble clustered inference algorithm which adds a
@@ -205,24 +206,23 @@ def preprocess_haxby(subject=2, memory=None):
 # then 5 statistical maps are produced and aggregated into one.
 # However you might benefit from clustering randomization taking
 # `n_bootstraps=25` or `n_bootstraps=100`, also we set `n_jobs=n_jobs`.
-list_ward, list_cl_desparsified_lasso = ensemble_clustered_inference(
-    X,
-    y,
-    ward,
-    groups=groups,
+ensemble_clustered_inference = EnsembleClusteredInference(
+    variable_importance=DesparsifiedLasso(
+        noise_method="median",
+        estimator=clone(estimator),
+        tolerance_reid=1e-2,
+        n_jobs=1,
+    ),
+    ward=ward,
+    groups=groups,
     scaler_sampling=StandardScaler(),
     n_bootstraps=5,
-    estimator=clone(estimator),
-    tolerance_reid=1e-2,
-    random_state=2,
-    n_jobs=n_jobs,
+    n_jobs=n_jobs,
+    random_state=0,
 )
-beta_hat, selected = ensemble_clustered_inference_pvalue(
-    X.shape[0],
-    False,
-    list_ward,
-    list_cl_desparsified_lasso,
-    fdr=0.1,
+beta_hat = ensemble_clustered_inference.fit_importance(X, y)
+selected = ensemble_clustered_inference.fdr_selection(
+    fdr=0.1, alternative_hypothesis=None
 )
 
 # %%
diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py
index 3cb37dad..422f3c86 100644
--- a/src/hidimstat/__init__.py
+++ b/src/hidimstat/__init__.py
@@ -2,8 +2,3 @@
 from .desparsified_lasso import DesparsifiedLasso, desparsified_lasso, reid
 from .distilled_conditional_randomization_test import D0CRT, d0crt
-from .ensemble_clustered_inference import (
-    clustered_inference,
-    clustered_inference_pvalue,
-    ensemble_clustered_inference,
-    ensemble_clustered_inference_pvalue,
-)
+from .ensemble_clustered_inference import EnsembleClusteredInference
@@ -25,6 +20,5 @@
     "DesparsifiedLasso",
     "d0crt",
     "D0CRT",
-    "ensemble_clustered_inference",
-    "ensemble_clustered_inference_pvalue",
+    "EnsembleClusteredInference",
     "reid",
diff --git a/src/hidimstat/_utils/bootstrap.py b/src/hidimstat/_utils/bootstrap.py
deleted file mode 100644
index 2e519531..00000000
--- a/src/hidimstat/_utils/bootstrap.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import numpy as np
-from sklearn.utils import resample
-
-
-def _subsampling(n_samples, train_size, groups=None, random_state=None):
-    """
-    Random subsampling for statistical inference.
-
-    Parameters
-    ----------
-    n_samples : int
-        Total number of samples in the dataset.
-    train_size : float
-        Fraction of samples to include in the training set (between 0 and 1).
-    groups : ndarray, shape (n_samples,), optional (default=None)
-        Group labels for samples.
-        If not None, a subset of groups is selected.
-    random_state : int, optional (default=0)
-        Random seed for reproducibility.
-
-    Returns
-    -------
-    train_index : ndarray
-        Indices of selected samples for training.
-    """
-    index_row = np.arange(n_samples) if groups is None else np.unique(groups)
-    train_index = resample(
-        index_row,
-        n_samples=int(len(index_row) * train_size),
-        replace=False,
-        random_state=np.random.RandomState(random_state.bit_generator),
-    )
-    if groups is not None:
-        train_index = np.arange(n_samples)[np.isin(groups, train_index)]
-    return train_index
diff --git a/src/hidimstat/ensemble_clustered_inference.py b/src/hidimstat/ensemble_clustered_inference.py
index c48b4dcb..d9e378f9 100644
--- a/src/hidimstat/ensemble_clustered_inference.py
+++ b/src/hidimstat/ensemble_clustered_inference.py
@@ -1,168 +1,27 @@
 import numpy as np
 from joblib import Parallel, delayed
 from sklearn.base import clone
 from sklearn.cluster import FeatureAgglomeration
-from sklearn.linear_model import LassoCV, MultiTaskLassoCV
-from sklearn.model_selection import KFold
-from sklearn.utils.validation import check_memory
-from tqdm import tqdm
+from sklearn.exceptions import NotFittedError
+from sklearn.preprocessing import StandardScaler
+from sklearn.utils import resample
+from sklearn.utils.validation import check_is_fitted
 
-from hidimstat._utils.bootstrap import _subsampling
 from hidimstat._utils.utils import check_random_state
+from hidimstat.base_variable_importance import BaseVariableImportance
 from hidimstat.desparsified_lasso import DesparsifiedLasso
-from hidimstat.statistical_tools.aggregation import quantile_aggregation
-from hidimstat.statistical_tools.multiple_testing import fdr_threshold
 
 
-def _ungroup_beta(beta_hat, n_features, ward):
-    """
-    Ungroup cluster-level beta coefficients to individual feature-level
-    coefficients.
-
-    Parameters
-    ----------
-    beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_tasks)
-        Beta coefficients at cluster level
-    n_features : int
-        Number of features in original space
-    ward : sklearn.cluster.FeatureAgglomeration
-        Fitted clustering object
-
-    Returns
-    -------
-    beta_hat_degrouped : ndarray, shape (n_features,) or (n_features, n_tasks)
-        Rescaled beta coefficients for individual features, weighted by
-        inverse cluster size
-
-    Notes
-    -----
-    Each coefficient is scaled by 1/cluster_size to maintain proper magnitude
-    when distributing cluster effects to individual features.
-    Handles both univariate (1D) and multivariate (2D) beta coefficients.
-    """
-    labels = ward.labels_
-    # compute the size of each cluster
-    clusters_size = np.zeros(labels.size)
-    for label in range(labels.max() + 1):
-        cluster_size = np.sum(labels == label)
-        clusters_size[labels == label] = cluster_size
-    # degroup beta_hat
-    if len(beta_hat.shape) == 1:
-        # weighting the weight of beta with the size of the cluster
-        beta_hat_degrouped = ward.inverse_transform(beta_hat) / clusters_size
-    elif len(beta_hat.shape) == 2:
-        n_tasks = beta_hat.shape[1]
-        beta_hat_degrouped = np.zeros((n_features, n_tasks))
-        for i in range(n_tasks):
-            beta_hat_degrouped[:, i] = (
-                ward.inverse_transform(beta_hat[:, i]) / clusters_size
-            )
-    return beta_hat_degrouped
-
-
-def _degrouping(ward, beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr):
+class EnsembleClusteredInference(BaseVariableImportance):
     """
-    Degroup and rescale cluster-level statistics to individual features.
-    This function takes cluster-level statistics and assigns them back
-    to individual features, while appropriately rescaling the parameter
-    estimates based on cluster sizes.
-
-    Parameters
-    ----------
-    ward : sklearn.cluster.FeatureAgglomeration
-        Fitted clustering object containing the hierarchical structure
-    beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_tasks)
-        Estimated parameters at cluster level
-    pval : ndarray, shape (n_clusters,)
-        P-values at cluster level
-    pval_corr : ndarray, shape (n_clusters,)
-        Corrected p-values at cluster level
-    one_minus_pval : ndarray, shape (n_clusters,)
-        1 - p-values at cluster level
-    one_minus_pval_corr : ndarray, shape (n_clusters,)
-        1 - corrected p-values at cluster level
-
-    Returns
-    -------
-    beta_hat : ndarray, shape (n_features,) or (n_features, n_tasks)
-        Rescaled parameter estimates for individual features
-    pval : ndarray, shape (n_features,)
-        P-values for individual features
-    pval_corr : ndarray, shape (n_features,)
-        Corrected p-values for individual features
-    one_minus_pval : ndarray, shape (n_features,)
-        1 - p-values for individual features
-    one_minus_pval_corr : ndarray, shape (n_features,)
-        1 - corrected p-values for individual features
-
-    Notes
-    -----
-    The beta_hat values are rescaled by dividing by the cluster size
-    to maintain the proper scale of the estimates when moving from
-    cluster-level to feature-level.
-    The function handles both 1D and 2D beta_hat arrays for single and
-    multiple time points.
-    """
-    # degroup variable other than beta_hat
-    pval, pval_corr, one_minus_pval, one_minus_pval_corr = map(
-        ward.inverse_transform,
-        [pval, pval_corr, one_minus_pval, one_minus_pval_corr],
-    )
-
-    beta_hat = _ungroup_beta(beta_hat, n_features=pval.shape[0], ward=ward)
-
-    return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr
-
-
-def _ward_clustering(X_init, ward, train_index):
-    """
-    Performs Ward clustering on data using a training subset.
-
-    This function applies Ward hierarchical clustering to a dataset, where the clustering
-    is computed based on a subset of samples but then applied to the full dataset.
-
-    Parameters
-    ----------
-    X_init : numpy.ndarray
-        Initial data matrix of shape (n_samples, n_features) to be clustered
-    ward : sklearn.cluster.FeatureAgglomeration
-        Ward clustering estimator instance
-    train_index : array-like
-        Indices of samples to use for computing the clustering
-
-    Returns
-    -------
-    tuple
-        - X_reduced : numpy.ndarray
-            Transformed data matrix after applying Ward clustering
-        - ward : sklearn.cluster.FeatureAgglomeration
-            Fitted Ward clustering estimator
-    """
-    ward = ward.fit(X_init[train_index, :])
-    X_reduced = ward.transform(X_init)
-    return X_reduced, ward
-
-
-def clustered_inference(
-    X_init,
-    y,
-    ward,
-    scaler_sampling=None,
-    train_size=1.0,
-    groups=None,
-    random_state=None,
-    n_jobs=1,
-    memory=None,
-    verbose=1,
-    **kwargs,
-):
-    """
-    Clustered inference algorithm for statistical analysis of
-    high-dimensional data.
+    Ensemble clustered inference algorithm for high-dimensional
+    statistical inference.
 
     This algorithm implements the method described in
-    :cite:`chevalier2022spatially` for
-    performing statistical inference on high-dimensional linear models
-    using feature clustering to reduce dimensionality.
+    :cite:`chevalier2022spatially`.
+
+    It combines multiple runs of clustered inference with different
+    random subsamples to provide more robust statistical estimates.
+    It uses the desparsified lasso method for inference.
 
     Parameters
     ----------
@@ -216,378 +75,394 @@
    3. Transform data to cluster space
    4. Perform statistical inference using desparsified lasso
    """
-    memory = check_memory(memory=memory)
-    rng = check_random_state(random_state)
-    assert issubclass(
-        ward.__class__, FeatureAgglomeration
-    ), "ward need to an instance of sklearn.cluster.FeatureAgglomeration"
+
+    def __init__(
+        self,
+        variable_importance=DesparsifiedLasso(),
+        ward=None,
+        n_bootstraps=25,
+        scaler_sampling=None,
+        train_size=1.0,
+        groups=None,
+        random_state=None,
+        n_jobs=1,
+        verbose=1,
+    ):
+        assert isinstance(
+            variable_importance, DesparsifiedLasso
+        ), "variable_importance must be a DesparsifiedLasso instance"
+        self.variable_importance = variable_importance
+        assert ward is None or isinstance(
+            ward, FeatureAgglomeration
+        ), "ward must be an instance of sklearn.cluster.FeatureAgglomeration"
+        self.ward = ward
+        assert scaler_sampling is None or isinstance(
+            scaler_sampling, StandardScaler
+        ), "scaler_sampling must be a StandardScaler instance"
+        self.scaler_sampling = scaler_sampling
+        self.n_bootstraps = n_bootstraps
+        self.train_size = train_size
+        self.groups = groups
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+        self.verbose = verbose
+
+        # attributes populated by fit() and importance()
+        self.list_ward_scaler_vi_ = None
+        self.list_importances_ = None
+        self.list_pvalues_ = None
+        self.list_pvalues_corr_ = None
+        self.pvalues_corr_ = None
+
+    def fit(self, X, y):
+        """
+        Fit one clustered inference run per bootstrap.
+
+        Each run subsamples the data, fits the clustering and the optional
+        scaler on the subsample, and fits the variable importance estimator
+        on the reduced data.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data matrix.
+        y : array-like of shape (n_samples,) or (n_samples, n_tasks)
+            Target values.
+
+        Returns
+        -------
+        self : EnsembleClusteredInference
+            The fitted instance.
+        """
+        rng = check_random_state(self.random_state)
+
+        if self.verbose > 0:
+            n_clusters = self.ward.n_clusters if self.ward is not None else None
+            print(
+                f"Clustered inference: n_clusters = {n_clusters}, "
+                f"inference method = desparsified lasso, "
+                f"seed = {self.random_state}, "
+                f"groups = {self.groups is not None}"
+            )
+        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
+        self.list_ward_scaler_vi_ = parallel(
+            delayed(_bootstrap_run_fit)(
+                X,
+                y,
+                self.train_size,
+                self.groups,
+                rng_spawn,
+                self.ward,
+                self.scaler_sampling,
+                self.variable_importance,
+            )
+            for rng_spawn in rng.spawn(self.n_bootstraps)
+        )
+        return self
+
+    def _check_fit(self):
+        """
+        Check if the model has been fit before performing analysis.
+
+        This private method verifies that `list_ward_scaler_vi_` has been
+        populated during fit and that, for each bootstrap run, the
+        clustering, the optional scaler and the variable importance
+        estimator are fitted.
+
+        Raises
+        ------
+        ValueError
+            If any of the required attributes are missing, indicating the
+            model hasn't been fit.
+        """
+        if self.list_ward_scaler_vi_ is None:
+            raise ValueError(
+                "The EnsembleClusteredInference requires to be fit before any analysis"
+            )
+        for ward, scaler, vi in self.list_ward_scaler_vi_:
+            if ward is not None:
+                try:
+                    check_is_fitted(ward)
+                except NotFittedError:
+                    raise ValueError(
+                        "The EnsembleClusteredInference requires to be fit before any analysis"
+                    )
+            if scaler is not None:
+                try:
+                    check_is_fitted(scaler)
+                except NotFittedError:
+                    raise ValueError(
+                        "The EnsembleClusteredInference requires to be fit before any analysis"
+                    )
+            vi._check_fit()
+
+    def importance(self, X, y):
+        """
+        Compute feature importance scores by ensembling the bootstrap runs.
+
+        For each bootstrap run, cluster-level statistics are computed with
+        the desparsified lasso, mapped back to the feature level, and then
+        averaged across runs.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Input data matrix.
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        importances_ : ndarray of shape (n_features,) or (n_features, n_tasks)
+            Feature-level importance scores (rescaled beta coefficients),
+            averaged across bootstrap runs.
+
+        Attributes
+        ----------
+        importances_ : same as return value
+        pvalues_ : ndarray of shape (n_features,)
+            Feature-level p-values, averaged across bootstrap runs.
+
+        Notes
+        -----
+        For each bootstrap run:
+        1. Transforms X with the fitted clustering (and scaler, if any)
+        2. Computes cluster-level importances and p-values with the
+           desparsified lasso
+        3. Maps the cluster-level statistics back to the feature level
+        The feature-level statistics are then averaged across runs.
+        """
+        self._check_fit()
+
+        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
+        results = parallel(
+            delayed(_bootstrap_run_importance)(
+                ward,
+                scaler,
+                vi,
+                X,
+                y,
+            )
+            for ward, scaler, vi in self.list_ward_scaler_vi_
+        )
+
+        self.list_importances_ = []
+        self.list_pvalues_ = []
+        self.list_pvalues_corr_ = []
+        for importances, pvalues, pvalues_corr in results:
+            self.list_importances_.append(importances)
+            self.list_pvalues_.append(pvalues)
+            self.list_pvalues_corr_.append(pvalues_corr)
+
+        self.importances_ = np.mean(self.list_importances_, axis=0)
+        self.pvalues_ = np.mean(self.list_pvalues_, axis=0)
+        self.pvalues_corr_ = np.mean(self.list_pvalues_corr_, axis=0)
+        return self.importances_
+
+    def fit_importance(self, X, y):
+        """
+        Fits the model to the data and computes feature importance.
+
+        A convenience method that combines fit() and importance() into a single call.
+        First fits the ensemble of inference runs, then calculates importance scores.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data matrix.
+        y : array-like of shape (n_samples,)
+            Target values.
+
+        Returns
+        -------
+        importance : ndarray of shape (n_features,)
+            Feature importance scores averaged across bootstrap runs.
+
+        Notes
+        -----
+        Also sets the importances\_ and pvalues\_ attributes on the instance.
+        See fit() and importance() for details on the underlying computations.
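+
+        Examples
+        --------
+        A minimal sketch on synthetic data; the small sizes and
+        ``n_bootstraps=2`` are for illustration only, and the default
+        ``DesparsifiedLasso`` settings are assumed to handle this shape:
+
+        >>> import numpy as np
+        >>> from sklearn.cluster import FeatureAgglomeration
+        >>> from hidimstat import EnsembleClusteredInference
+        >>> rng = np.random.RandomState(0)
+        >>> X = rng.randn(50, 100)
+        >>> y = X[:, :5].sum(axis=1) + rng.randn(50)
+        >>> eci = EnsembleClusteredInference(
+        ...     ward=FeatureAgglomeration(n_clusters=10),
+        ...     n_bootstraps=2,
+        ...     train_size=0.8,
+        ...     verbose=0,
+        ... )
+        >>> eci.fit_importance(X, y).shape
+        (100,)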
+        """
+        self.fit(X, y)
+        return self.importance(X, y)
+
+
+def _bootstrap_run_fit(
+    X_init,
+    y,
+    train_size,
+    groups,
+    rng,
+    ward,
+    scaler_sampling,
+    variable_importance,
+):
     n_samples, n_features = X_init.shape
 
-    ## This are the 3 step in first loop of the algorithm 2 of [1]
+    ## These are the 3 steps of the first loop of algorithm 2 in :cite:`chevalier2022spatially`
     # sampling row of X
     train_index = _subsampling(n_samples, train_size, groups=groups, random_state=rng)
 
     # transformation matrix
-    X_reduced, ward_ = memory.cache(_ward_clustering)(X_init, clone(ward), train_index)
+    if ward is not None:
+        ward_ = clone(ward).fit(X_init[train_index, :])
+        X_reduced = ward_.transform(X_init)
+    else:
+        X_reduced = X_init
+        ward_ = None
 
     # Preprocessing
     if scaler_sampling is not None:
-        X_reduced = clone(scaler_sampling).fit_transform(X_reduced)
+        scaler_sampling_ = clone(scaler_sampling)
+        X_reduced = scaler_sampling_.fit_transform(X_reduced)
+    else:
+        scaler_sampling_ = None
 
     # inference methods
-    if hasattr(kwargs, "lasso_cv") and kwargs["lasso_cv"] is not None:
-        pass
-    elif len(y.shape) > 1 and y.shape[1] > 1:
-        kwargs["estimator"] = MultiTaskLassoCV(
-            eps=1e-2,
-            fit_intercept=False,
-            cv=KFold(n_splits=5),
-            tol=1e-4,
-            max_iter=5000,
-            random_state=1,
-            n_jobs=1,
-        )
+    variable_importance_ = clone(variable_importance).fit(X_reduced, y)
+
+    return ward_, scaler_sampling_, variable_importance_
+
+
+def _bootstrap_run_importance(ward_, scaler_sampling_, variable_importance_, X, y):
+    # apply reduction
+    if ward_ is not None:
+        X_ = ward_.transform(X)
     else:
-        kwargs["estimator"] = LassoCV(
-            eps=1e-2,
-            fit_intercept=False,
-            cv=KFold(n_splits=5),
-            tol=1e-4,
-            max_iter=5000,
-            random_state=1,
-            n_jobs=1,
-        )
+        X_ = X
 
-    desparsified_lassos = memory.cache(
-        DesparsifiedLasso(
-            n_jobs=n_jobs,
-            memory=memory,
-            verbose=verbose,
-            **kwargs,
-        ).fit,
-        ignore=["n_jobs", "verbose", "memory"],
-    )(
-        X_reduced,
-        y,
-    )
-    desparsified_lassos.importance()
+    # apply preprocessing (keep the reduced X_ when no scaler was used)
+    if scaler_sampling_ is not None:
+        X_ = scaler_sampling_.transform(X_)
+
+    variable_importance_.importance(X_, y)
 
-    return ward_, desparsified_lassos
+    if ward_ is not None:
+        pvalue = ward_.inverse_transform(variable_importance_.pvalues_)
+        pvalue_corr = ward_.inverse_transform(variable_importance_.pvalues_corr_)
+        importance = _ungroup_beta(
+            variable_importance_.importances_, n_features=pvalue.shape[0], ward=ward_
+        )
+    else:
+        pvalue = variable_importance_.pvalues_
+        pvalue_corr = variable_importance_.pvalues_corr_
+        importance = variable_importance_.importances_
+    return importance, pvalue, pvalue_corr
 
 
-def clustered_inference_pvalue(n_samples, group, ward, desparsified_lassos, **kwargs):
+def _subsampling(n_samples, train_size, groups=None, random_state=None):
     """
-    Compute corrected p-values at the cluster level and transform them
-    back to feature level.
+    Random subsampling for statistical inference.
 
     Parameters
     ----------
     n_samples : int
-        Number of samples in the dataset
-    group : bool
-        If True, uses group lasso p-values for multivariate outcomes
-    ward : AgglomerativeClustering
-        Fitted clustering object
-    beta_hat : ndarray
-        Estimated coefficients at cluster level
-    theta_hat : ndarray
-        Estimated precision matrix
-    precision_diag : ndarray
-        Diagonal elements of the covariance matrix
-    **kwargs : dict
-        Additional arguments passed to p-value computation functions
+        Total number of samples in the dataset.
+    train_size : float
+        Fraction of samples to include in the training set (between 0 and 1).
+    groups : ndarray, shape (n_samples,), optional (default=None)
+        Group labels for samples.
+        If not None, a subset of groups is selected.
+    random_state : numpy.random.Generator
+        Spawned random generator used for reproducible subsampling.
 
     Returns
     -------
-    beta_hat : ndarray
-        Degrouped coefficients at feature level
-    pval : ndarray
-        P-values for each feature
-    pval_corr : ndarray
-        Multiple testing corrected p-values
-    one_minus_pval : ndarray
-        1 - p-values for numerical stability
-    one_minus_pval_corr : ndarray
-        1 - corrected p-values
+    train_index : ndarray
+        Indices of selected samples for training.
     """
-    # corrected cluster-wise p-values
-
-    # De-grouping
-    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = _degrouping(
-        ward,
-        desparsified_lassos.importances_,
-        desparsified_lassos.pvalues_,
-        desparsified_lassos.pvalues_corr_,
-        desparsified_lassos.one_minus_pvalues_,
-        desparsified_lassos.one_minus_pvalues_corr_,
+    index_row = np.arange(n_samples) if groups is None else np.unique(groups)
+    train_index = resample(
+        index_row,
+        n_samples=int(len(index_row) * train_size),
+        replace=False,
+        random_state=np.random.RandomState(random_state.bit_generator),
     )
-
-    return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr
+    if groups is not None:
+        train_index = np.arange(n_samples)[np.isin(groups, train_index)]
+    return train_index
 
 
-def ensemble_clustered_inference(
-    X_init,
-    y,
-    ward,
-    scaler_sampling=None,
-    train_size=0.3,
-    groups=None,
-    random_state=None,
-    n_bootstraps=25,
-    n_jobs=None,
-    verbose=1,
-    memory=None,
-    **kwargs,
-):
+def _ungroup_beta(beta_hat, n_features, ward):
     """
-    Ensemble clustered inference algorithm for high-dimensional
-    statistical inference, as described in :cite:`chevalier2022spatially`.
-
-    This algorithm combines multiple runs of clustered inference with
-    different random subsamples to provide more robust statistical estimates.
-    It uses the desparsified lasso method for inference.
+    Ungroup cluster-level beta coefficients to individual feature-level
+    coefficients.
 
     Parameters
     ----------
-    X_init : ndarray, shape (n_samples, n_features)
-        Original high-dimensional input data matrix.
-
-    y : ndarray, shape (n_samples,) or (n_samples, n_tasks)
-        Target variable(s). Can be univariate or multivariate (temporal) data.
-
+    beta_hat : ndarray, shape (n_clusters,) or (n_clusters, n_tasks)
+        Beta coefficients at cluster level
+    n_features : int
+        Number of features in original space
     ward : sklearn.cluster.FeatureAgglomeration
-        Feature agglomeration object implementing Ward hierarchical clustering.
-
-    scaler_sampling : sklearn.preprocessing object, optional (default=None)
-        Scaler to standardize the clustered features.
-
-    train_size : float, optional (default=0.3)
-        Fraction of samples used for clustering. Using train_size < 1 enables
-        random subsampling for better generalization.
-
-    groups : ndarray, shape (n_samples,), optional (default=None)
-        Sample group labels for stratified subsampling. Ensures balanced
-        representation of groups in subsamples.
-
-    inference_method : str, optional (default='desparsified-lasso')
-        Method used for inference.
-        Currently, the two available methods are 'desparsified-lasso'
-        and 'group-desparsified-lasso'. Use 'desparsified-lasso' for
-        non-temporal data and 'group-desparsified-lasso' for temporal data.
-
-    random_seed: int, optional (default=None)
-        Seed used for generating the first random subsample of the data.
- This seed controls the clustering randomness. - - ensembling_method : str, optional (default='quantiles') - Method used for ensembling. Currently, the two available methods - are 'quantiles' and 'median'. - - gamma_min : float, optional (default=0.2) - Lowest gamma-quantile considered to compute the adaptive - quantile aggregation formula. This parameter is used only if - `ensembling_method` is 'quantiles'. - - n_bootstraps : int, optional (default=25) - Number of bootstrap iterations for ensemble inference. - - n_jobs : int or None, optional (default=None) - Number of parallel jobs. None means using all processors. - - verbose: int, optional (default=1) - The verbosity level. If `verbose > 0`, a message is printed before - running the clustered inference. - - memory : joblib.Memory or str, optional (default=None) - Used to cache the output of the clustering and inference computation. - By default, no caching is done. If provided, it should be the path - to the caching directory or a joblib.Memory object. - - **kwargs : dict - Additional keyword arguments passed to statistical inference functions. + Fitted clustering object Returns ------- - list_ward : list of FeatureAgglomeration - List of fitted clustering objects from each bootstrap. - - list_beta_hat : list of ndarray - List of estimated coefficients from each bootstrap. - - pval : ndarray, shape (n_features,) - p-value, with numerically accurate values for - positive effects (i.e., for p-values close to zero). - - list_theta_hat : list of ndarray - List of estimated precision matrices. - - list_precision_diag : list of ndarray - List of diagonal elements of covariance matrices. - - one_minus_pval : ndarray, shape (n_features,) - One minus the p-value, with numerically accurate values - for negative effects (i.e., for p-values close to one). + beta_hat_degrouped : ndarray, shape (n_features,) or (n_features, n_tasks) + Rescaled beta coefficients for individual features, weighted by + inverse cluster size Notes ----- - The algorithm performs these steps for each bootstrap iteration: - 1. Subsample the data using stratified sampling if groups are provided - 2. Cluster features using Ward's hierarchical clustering - 3. Transform data to reduced cluster space - 4. Perform statistical inference using desparsified lasso - 5. Aggregate results across all iterations - - References - ---------- - .. 
footbibliography:: - """ - memory = check_memory(memory=memory) - assert issubclass( - ward.__class__, FeatureAgglomeration - ), "ward need to an instance of sklearn.cluster.FeatureAgglomeration" - rng = check_random_state(random_state) - - # Clustered inference algorithms - results = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(clustered_inference)( - X_init, - y, - clone(ward), - scaler_sampling=scaler_sampling, - train_size=train_size, - groups=groups, - random_state=spawned_state, - n_jobs=1, - verbose=verbose, - memory=memory, - **kwargs, - ) - for spawned_state in tqdm(rng.spawn(n_bootstraps), disable=(verbose == 0)) - ) - list_ward, list_desparsified_lassos = [], [] - for ward, desparsified_lassos in results: - list_ward.append(ward) - list_desparsified_lassos.append(desparsified_lassos) - return list_ward, list_desparsified_lassos - - -def ensemble_clustered_inference_pvalue( - n_samples, - group, - list_ward, - list_desparsified_lassos, - fdr=0.1, - fdr_control="bhq", - reshaping_function=None, - adaptive_aggregation=False, - gamma=0.5, - n_jobs=None, - verbose=0, - **kwargs, -): - """ - Compute and aggregate p-values across multiple bootstrap iterations - using an aggregation method. - - This function performs statistical inference on each bootstrap sample - and combines the results using a specified aggregation method to obtain - robust estimates. - The implementation follows the methodology in :footcite:`chevalier2022spatially`. - - Parameters - ---------- - n_samples : int - Number of samples in the dataset - group : bool - If True, uses group lasso p-values for multivariate outcomes - list_ward : list of AgglomerativeClustering - List of fitted clustering objects from bootstraps - list_beta_hat : list of ndarray - List of estimated coefficients at cluster level from each bootstrap - list_theta_hat : list of ndarray - List of estimated precision matrices from each bootstrap - list_precision_diag : list of ndarray - List of diagonal elements of covariance matrices from each bootstrap - fdr : float, default=0.1 - False discovery rate threshold for multiple testing correction - fdr_control : str, default="bhq" - Method for FDR control ('bhq' for Benjamini-Hochberg) - Available methods are: - * 'bhq': Standard Benjamini-Hochberg :footcite:`benjamini1995controlling,bhy_2001` - * 'bhy': Benjamini-Hochberg-Yekutieli :footcite:p:`bhy_2001` - * 'ebh': e-Benjamini-Hochberg :footcite:`wang2022false` - reshaping_function : callable, optional (default=None) - Function to reshape data before FDR control - adaptive_aggregation : bool, default=False - Whether to use adaptive quantile aggregation - gamma : float, default=0.5 - Quantile level for aggregation - n_jobs : int or None, optional (default=None) - Number of parallel jobs. None means using all processors. - verbose : int, default=0 - Verbosity level for computation progress - **kwargs : dict - Additional arguments passed to p-value computation functions - - Returns - ------- - beta_hat : ndarray, shape (n_features,) or (n_features, n_tasks) - Averaged coefficients across bootstraps - selected : ndarray, shape (n_features,) - Selected features: 1 for positive effects, -1 for negative effects, - 0 for non-selected features - - References - ---------- - .. footbibliography:: + Each coefficient is scaled by 1/cluster_size to maintain proper magnitude + when distributing cluster effects to individual features. + Handles both univariate (1D) and multivariate (2D) beta coefficients. 
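+
+    Examples
+    --------
+    A toy sketch (hypothetical data, for illustration): two clusters of
+    two identical features each, so each cluster coefficient is spread
+    over its two members and divided by the cluster size, which keeps
+    the total mass of the coefficients unchanged.
+
+    >>> import numpy as np
+    >>> from sklearn.cluster import FeatureAgglomeration
+    >>> X = np.array([[0.0, 0.0, 1.0, 1.0]] * 3 + [[1.0, 1.0, 0.0, 0.0]] * 3)
+    >>> ward = FeatureAgglomeration(n_clusters=2).fit(X)
+    >>> beta = _ungroup_beta(np.array([2.0, 4.0]), n_features=4, ward=ward)
+    >>> beta.shape
+    (4,)
+    >>> float(beta.sum())
+    6.0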
""" - results = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(clustered_inference_pvalue)( - n_samples, - group, - list_ward[i], - list_desparsified_lassos[i], - **kwargs, - ) - for i in range(len(list_ward)) - ) - # Collecting results - list_beta_hat = [] - list_pval, list_pval_corr = [], [] - list_one_minus_pval, list_one_minus_pval_corr = [], [] - for beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr in results: - list_beta_hat.append(beta_hat) - list_pval.append(pval) - list_pval_corr.append(pval_corr) - list_one_minus_pval.append(one_minus_pval) - list_one_minus_pval_corr.append(one_minus_pval_corr) - - # Ensembling - beta_hat = np.mean(list_beta_hat, axis=0) - # pvalue selection - aggregated_pval = quantile_aggregation( - np.array(list_pval), gamma=gamma, adaptive=adaptive_aggregation - ) - threshold_pval = fdr_threshold( - aggregated_pval, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - # 1-pvalue selection - aggregated_one_minus_pval = quantile_aggregation( - np.array(list_one_minus_pval), gamma=gamma, adaptive=adaptive_aggregation - ) - threshold_one_minus_pval = fdr_threshold( - aggregated_one_minus_pval, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - # group seelction - selected = np.zeros_like(beta_hat) - selected[np.where(aggregated_pval <= threshold_pval)] = 1 - selected[np.where(aggregated_one_minus_pval <= threshold_one_minus_pval)] = -1 - - return beta_hat, selected + labels = ward.labels_ + # compute the size of each cluster + clusters_size = np.zeros(labels.size) + for label in range(labels.max() + 1): + cluster_size = np.sum(labels == label) + clusters_size[labels == label] = cluster_size + # degroup beta_hat + if len(beta_hat.shape) == 1: + # weighting the weight of beta with the size of the cluster + beta_hat_degrouped = ward.inverse_transform(beta_hat) / clusters_size + elif len(beta_hat.shape) == 2: + n_tasks = beta_hat.shape[1] + beta_hat_degrouped = np.zeros((n_features, n_tasks)) + for i in range(n_tasks): + beta_hat_degrouped[:, i] = ( + ward.inverse_transform(beta_hat[:, i]) / clusters_size + ) + return beta_hat_degrouped diff --git a/test/test_ensemble_clustered_inference.py b/test/test_ensemble_clustered_inference.py index a55d6a52..ab451be9 100644 --- a/test/test_ensemble_clustered_inference.py +++ b/test/test_ensemble_clustered_inference.py @@ -6,15 +6,26 @@ from numpy.testing import assert_almost_equal from sklearn.cluster import FeatureAgglomeration from sklearn.feature_extraction import image +from sklearn.linear_model import MultiTaskLassoCV +from sklearn.model_selection import KFold from sklearn.preprocessing import StandardScaler from hidimstat._utils.scenario import multivariate_simulation -from hidimstat.ensemble_clustered_inference import ( - clustered_inference, - clustered_inference_pvalue, - ensemble_clustered_inference, - ensemble_clustered_inference_pvalue, -) +from hidimstat.desparsified_lasso import DesparsifiedLasso +from hidimstat.ensemble_clustered_inference import EnsembleClusteredInference + + +def set_desparsified_lasso_multi_time(): + multitasklassoCV = MultiTaskLassoCV( + eps=1e-2, + fit_intercept=False, + cv=KFold(n_splits=5, shuffle=True, random_state=0), + tol=1e-4, + max_iter=5000, + random_state=1, + n_jobs=1, + ) + return DesparsifiedLasso(model_y=multitasklassoCV) # Scenario 1: data with no temporal dimension @@ -56,17 +67,14 @@ def test_clustered_inference_no_temporal(): n_clusters=n_clusters, connectivity=connectivity, linkage="ward" ) - 
ward_, desparsified_lassos = clustered_inference(
-        X_init, y, ward, scaler_sampling=StandardScaler()
-    )
-
-    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
-        clustered_inference_pvalue(n_samples, None, ward_, desparsified_lassos)
-    )
+    clustered_inference = EnsembleClusteredInference(
+        ward=ward, scaler_sampling=StandardScaler(), n_bootstraps=1
+    ).fit(X_init, y)
+    clustered_inference.importance(X_init, y)
 
     alpha = 0.05
-    tp = np.sum(pval_corr[:interior_support] < alpha)
-    fp = np.sum(pval_corr[extended_support:] < alpha)
+    tp = np.sum(clustered_inference.pvalues_corr_[:interior_support] < alpha)
+    fp = np.sum(clustered_inference.pvalues_corr_[extended_support:] < alpha)
 
     power = tp / interior_support
     fdp = fp / max(fp + tp, 1)
@@ -112,17 +120,17 @@ def test_clustered_inference_temporal():
         n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
     )
 
-    ward_, desparsified_lassos = clustered_inference(
-        X, y, ward, scaler_sampling=StandardScaler()
-    )
-
-    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
-        clustered_inference_pvalue(n_samples, True, ward_, desparsified_lassos)
-    )
+    clustered_inference = EnsembleClusteredInference(
+        ward=ward,
+        variable_importance=set_desparsified_lasso_multi_time(),
+        scaler_sampling=StandardScaler(),
+        n_bootstraps=1,
+    ).fit(X, y)
+    clustered_inference.importance(X, y)
 
     alpha = 0.05
-    tp = np.sum(pval_corr[:interior_support] < alpha)
-    fp = np.sum(pval_corr[extended_support:] < alpha)
+    tp = np.sum(clustered_inference.pvalues_corr_[:interior_support] < alpha)
+    fp = np.sum(clustered_inference.pvalues_corr_[extended_support:] < alpha)
 
     power = tp / interior_support
     fdp = fp / max(fp + tp, 1)
     assert power >= 0.5
@@ -179,19 +187,17 @@ def test_clustered_inference_no_temporal_groups():
         n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
     )
 
-    ward_, desparsified_lassos = clustered_inference(
-        X_, y_, ward, groups=groups, scaler_sampling=StandardScaler()
-    )
-
-    beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
-        clustered_inference_pvalue(
-            n_groups * n_samples, False, ward_, desparsified_lassos
-        )
-    )
+    clustered_inference = EnsembleClusteredInference(
+        ward=ward,
+        scaler_sampling=StandardScaler(),
+        groups=groups,
+        n_bootstraps=1,
+    ).fit(X_, y_)
+    clustered_inference.importance(X_, y_)
 
     alpha = 0.05
-    tp = np.sum(pval_corr[:interior_support] < alpha)
-    fp = np.sum(pval_corr[extended_support:] < alpha)
+    tp = np.sum(clustered_inference.pvalues_corr_[:interior_support] < alpha)
+    fp = np.sum(clustered_inference.pvalues_corr_[extended_support:] < alpha)
 
     power = tp / interior_support
     fdp = fp / max(fp + tp, 1)
@@ -238,20 +244,13 @@ def test_ensemble_clustered_inference():
         n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
     )
 
-    list_ward, list_desparsified_lassos = ensemble_clustered_inference(
-        X_init,
-        y,
-        ward,
+    EnCluDl = EnsembleClusteredInference(
+        ward=ward,
         scaler_sampling=StandardScaler(),
         n_bootstraps=n_bootstraps,
-    )
-    beta_hat, selected = ensemble_clustered_inference_pvalue(
-        n_samples,
-        False,
-        list_ward,
-        list_desparsified_lassos,
-        fdr=fdr,
-    )
+    ).fit(X_init, y)
+    EnCluDl.importance(X_init, y)
+    selected = EnCluDl.fdr_selection(fdr=fdr)
 
     tp = np.sum(selected[: support_size - margin_size])
     fp = np.sum(selected[support_size + margin_size :])
@@ -296,33 +295,28 @@ def test_ensemble_clustered_inference_temporal_data():
         n_clusters=n_clusters, connectivity=connectivity, linkage="ward"
     )
 
-    list_ward, list_desparsified_lassos = ensemble_clustered_inference(
-        X,
-        y,
-        ward,
+    EnCluDl = EnsembleClusteredInference(
+        variable_importance=set_desparsified_lasso_multi_time(),
+        ward=ward,
         scaler_sampling=StandardScaler(),
         n_bootstraps=n_bootstraps,
-    )
-    beta_hat, selected = ensemble_clustered_inference_pvalue(
-        n_samples, True, list_ward, list_desparsified_lassos, fdr_control="bhq", fdr=fdr
-    )
+    ).fit(X, y)
+    EnCluDl.importance(X, y)
+    selected = EnCluDl.fdr_selection(fdr=fdr, fdr_control="bhq")
 
     tp = np.sum(selected[:interior_support])
     fp = np.sum(selected[extended_support:])
 
     power = tp / (interior_support)
     fdp = fp / max(fp + tp, 1)
 
-    assert power >= 0.5
     assert fdp <= fdr
 
-    # different aggregation method
-    beta_hat, selected = ensemble_clustered_inference_pvalue(
-        n_samples,
-        True,
-        list_ward,
-        list_desparsified_lassos,
-        fdr_control="bhy",
-    )
+    # different FDR control method
+    selected = EnCluDl.fdr_selection(fdr=fdr, fdr_control="bhy")
+
+    expected = np.zeros(n_features)
+    expected[:support_size] = 1.0
+
     tp = np.sum(selected[:interior_support])
     fp = np.sum(selected[extended_support:])
 
     power = tp / (interior_support)