From 72094ff86acf5347610d2308bf1388f354667a2e Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Fri, 22 Aug 2025 18:27:40 +0200
Subject: [PATCH 01/46] Improve reproducibility of examples

---
 ...ot_diabetes_variable_importance_example.py | 51 ++++++++-----------
 .../plot_importance_classification_iris.py    | 19 +++++--
 examples/plot_knockoff_aggregation.py         | 22 +++++---
 .../plot_pitfalls_permutation_importance.py   | 32 ++++++++----
 4 files changed, 73 insertions(+), 51 deletions(-)

diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py
index c9c1d0a6d..53f7407fa 100644
--- a/examples/plot_diabetes_variable_importance_example.py
+++ b/examples/plot_diabetes_variable_importance_example.py
@@ -54,9 +54,12 @@
 from sklearn.linear_model import LogisticRegressionCV, RidgeCV
 from sklearn.metrics import r2_score, root_mean_squared_error
 from sklearn.model_selection import KFold
+from sklearn.utils import check_random_state
 
 from hidimstat import CPI, LOCO, PFI
 
+seeds = check_random_state(42).randint(1, np.iinfo(np.int32).max, 7)
+
 #############################################################################
 # Load the diabetes dataset
 # -------------------------
@@ -71,30 +74,12 @@
 # diabetes dataset.
 
 n_folds = 5
-regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
-regressor_list = [clone(regressor) for _ in range(n_folds)]
-kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
-for i, (train_index, test_index) in enumerate(kf.split(X)):
-    regressor_list[i].fit(X[train_index], y[train_index])
-    score = r2_score(
-        y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
-    )
-    mse = root_mean_squared_error(
-        y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
-    )
-
-    print(f"Fold {i}: {score}")
-    print(f"Fold {i}: {mse}")
-#############################################################################
-# Fit a baselien model on the diabetes dataset
-# --------------------------------------------
-# We use a Ridge regression model with a 10-fold cross-validation to fit the
-# diabetes dataset.
- -n_folds = 10 -regressor = RidgeCV(alphas=np.logspace(-3, 3, 10)) +regressor = RidgeCV( + alphas=np.logspace(-3, 3, 10), + cv=KFold(shuffle=True, random_state=seeds[0]), +) regressor_list = [clone(regressor) for _ in range(n_folds)] -kf = KFold(n_splits=n_folds, shuffle=True, random_state=0) +kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) for i, (train_index, test_index) in enumerate(kf.split(X)): regressor_list[i].fit(X[train_index], y[train_index]) score = r2_score( @@ -112,17 +97,23 @@ # -------------------------------------------------------- cpi_importance_list = [] +kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) for i, (train_index, test_index) in enumerate(kf.split(X)): print(f"Fold {i}") X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] cpi = CPI( estimator=regressor_list[i], - imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)), - imputation_model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)), - # covariate_estimator=HistGradientBoostingRegressor(random_state=0,), + imputation_model_continuous=RidgeCV( + alphas=np.logspace(-3, 3, 10), + cv=KFold(shuffle=True, random_state=seeds[3]), + ), + imputation_model_categorical=LogisticRegressionCV( + Cs=np.logspace(-2, 2, 10), + cv=KFold(shuffle=True, random_state=seeds[4]), + ), n_permutations=50, - random_state=0, + random_state=seeds[5], n_jobs=4, ) cpi.fit(X_train, y_train) @@ -134,7 +125,7 @@ # --------------------------------------------------------- loco_importance_list = [] - +kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) for i, (train_index, test_index) in enumerate(kf.split(X)): print(f"Fold {i}") X_train, X_test = X[train_index], X[test_index] @@ -153,7 +144,7 @@ # ---------------------------------------------------------------- pfi_importance_list = [] - +kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) for i, (train_index, test_index) in enumerate(kf.split(X)): print(f"Fold {i}") X_train, X_test = X[train_index], X[test_index] @@ -161,7 +152,7 @@ pfi = PFI( estimator=regressor_list[i], n_permutations=50, - random_state=0, + random_state=seeds[6], n_jobs=4, ) pfi.fit(X_train, y_train) diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index 6216ce044..a3c166a92 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -77,7 +77,9 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No if vim_name == "CPI": vim = CPI( estimator=model_c, - imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)), + imputation_model_continuous=RidgeCV( + alphas=np.logspace(-3, 3, 10), cv=KFold(shuffle=True, random_state=1) + ), n_permutations=50, random_state=0, method=method, @@ -112,10 +114,19 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No # combination, in parallel. 
models = [ - LogisticRegressionCV(Cs=np.logspace(-3, 3, 10), tol=1e-3, max_iter=1000), - GridSearchCV(SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}), + LogisticRegressionCV( + Cs=np.logspace(-3, 3, 10), + tol=1e-3, + max_iter=1000, + cv=KFold(shuffle=True, random_state=2), + ), + GridSearchCV( + SVC(kernel="rbf"), + {"C": np.logspace(-3, 3, 10)}, + cv=KFold(shuffle=True, random_state=3), + ), ] -cv = KFold(n_splits=5, shuffle=True, random_state=0) +cv = KFold(n_splits=5, shuffle=True, random_state=4) groups = {ft: i for i, ft in enumerate(dataset.feature_names)} out_list = Parallel(n_jobs=5)( delayed(run_one_fold)( diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index b4ba66af5..14ea794a7 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -73,7 +73,8 @@ ####################################################################### # Define the function for running the three procedures on the same data # --------------------------------------------------------------------- -def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=None): +def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0): + seeds = check_random_state(seed).randint(1, np.iinfo(np.int32).max, 4) # Generate data X, y, _, non_zero_index = multivariate_1D_simulation_AR( n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr @@ -85,10 +86,10 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see y, estimator=LassoCV( n_jobs=1, - cv=KFold(n_splits=5, shuffle=True, random_state=0), + cv=KFold(n_splits=5, shuffle=True, random_state=seeds[0]), ), n_bootstraps=1, - random_state=seed, + random_state=seeds[1], ) mx_selection, _ = model_x_knockoff_pvalue(test_scores, fdr=fdr) fdp_mx, power_mx = fdp_power(mx_selection, non_zero_index) @@ -99,11 +100,11 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see y, estimator=LassoCV( n_jobs=1, - cv=KFold(n_splits=5, shuffle=True, random_state=0), + cv=KFold(n_splits=5, shuffle=True, random_state=seeds[2]), ), n_bootstraps=n_bootstraps, n_jobs=1, - random_state=seed, + random_state=seeds[3], ) # Use p-values aggregation [2] @@ -131,7 +132,7 @@ def plot_results(bounds, fdr, n_samples, n_features, power=False): for nb in range(len(bounds)): for i in range(len(bounds[nb])): y = bounds[nb][i] - x = np.random.normal(nb + 1, 0.05) + x = rng.normal(nb + 1, 0.05) plt.scatter(x, y, alpha=0.65, c="blue") plt.boxplot(bounds, sym="") @@ -165,7 +166,14 @@ def effect_number_samples(n_samples): parallel = Parallel(n_jobs, verbose=joblib_verbose) results = parallel( delayed(single_run)( - n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=seed + n_samples, + n_features, + rho, + sparsity, + snr, + fdr, + n_bootstraps, + seed=seed, ) for seed in seed_list ) diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index e456349be..2592345c4 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -24,11 +24,13 @@ from sklearn.neural_network import MLPRegressor from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler +from sklearn.utils import check_random_state from hidimstat import CPI, PFI from hidimstat.conditional_sampling import ConditionalSampler -rng = np.random.RandomState(0) +rng = check_random_state(42) +seeds = 
rng.randint(1, np.iinfo(np.int32).max, 9) ############################################################################# # Load the California housing dataset and add a spurious feature @@ -40,7 +42,9 @@ dataset = fetch_california_housing() X_, y_ = dataset.data, dataset.target # only use 2/3 of samples to speed up the example -X, _, y, _ = train_test_split(X_, y_, test_size=0.6667, random_state=0, shuffle=True) +X, _, y, _ = train_test_split( + X_, y_, test_size=0.6667, random_state=seeds[0], shuffle=True +) redundant_coef = rng.choice(np.arange(X.shape[1]), size=(3,), replace=False) X_spurious = X[:, redundant_coef].sum(axis=1) @@ -85,7 +89,7 @@ regressor=make_pipeline( StandardScaler(), MLPRegressor( - random_state=0, + random_state=seeds[1], hidden_layer_sizes=(32, 16, 8), early_stopping=True, learning_rate_init=0.01, @@ -96,7 +100,7 @@ ) -kf = KFold(n_splits=5, shuffle=True, random_state=0) +kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) for train_index, test_index in kf.split(X): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -118,6 +122,7 @@ # testing conditional importance, as it identifies the spurious feature as important. permutation_importances = [] conditional_permutation_importances = [] +kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) for i, (train_index, test_index) in enumerate(kf.split(X)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -128,7 +133,8 @@ pfi = PFI( model_c, n_permutations=50, - random_state=0, + n_jobs=5, + random_state=seeds[3], ) pfi.fit(X_test, y_test) @@ -185,6 +191,7 @@ # explained by the other features unchanged. This method is valid for testing conditional # importance. As shown below, it does not identify the spurious feature as important. 
conditional_importances = [] +kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) for i, (train_index, test_index) in enumerate(kf.split(X)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -194,8 +201,11 @@ # Compute conditional permutation feature importance cpi = CPI( model_c, - imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 5)), - random_state=0, + imputation_model_continuous=RidgeCV( + alphas=np.logspace(-3, 3, 5), + cv=KFold(shuffle=True, random_state=seeds[4]), + ), + random_state=seeds[5], n_jobs=5, ) cpi.fit(X_test, y_test) @@ -251,12 +261,14 @@ X_train, X_test = train_test_split( X, test_size=0.3, - random_state=0, + random_state=seeds[6], ) conditional_sampler = ConditionalSampler( - model_regression=RidgeCV(alphas=np.logspace(-3, 3, 5)), - random_state=0, + model_regression=RidgeCV( + alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=seeds[7]) + ), + random_state=seeds[8], ) From fb8acdbfd3b8d398441b9d5b713f049896388f49 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 22 Aug 2025 18:28:06 +0200 Subject: [PATCH 02/46] Improve reproducibility in code base --- src/hidimstat/base_perturbation.py | 21 ++++++++++++++----- .../conditional_permutation_importance.py | 14 +++++++------ src/hidimstat/knockoffs.py | 13 +++++------- src/hidimstat/leave_one_covariate_out.py | 2 +- .../permutation_feature_importance.py | 8 +++---- 5 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 437e1aeee..5332fa07a 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -49,6 +49,7 @@ def __init__( self.n_jobs = n_jobs self.n_permutations = n_permutations self.n_groups = None + self.random_state = None def fit(self, X, y=None, groups=None): """Base fit method for perturbation-based methods. Identifies the groups. @@ -105,9 +106,17 @@ def predict(self, X): X_ = np.asarray(X) # Parallelize the computation of the importance scores for each group + if self.random_state is None: + list_seed = [None for i in range(self.n_groups)] + else: + list_seed = self.random_state.randint( + 1, np.iinfo(np.int32).max, self.n_groups + ) out_list = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_predict_one_group)(X_, group_id, group_key) - for group_id, group_key in enumerate(self.groups.keys()) + delayed(self._joblib_predict_one_group)(X_, group_id, group_key, seed) + for group_id, (group_key, seed) in enumerate( + zip(self.groups.keys(), list_seed) + ) ) return np.stack(out_list, axis=0) @@ -168,7 +177,7 @@ def _check_fit(self): " call fit with groups=None" ) - def _joblib_predict_one_group(self, X, group_id, group_key): + def _joblib_predict_one_group(self, X, group_id, group_key, seed): """ Compute the predictions after perturbation of the data for a given group of variables. This function is parallelized. @@ -181,6 +190,8 @@ def _joblib_predict_one_group(self, X, group_id, group_key): The index of the group of variables. group_key: str, int The key of the group of variables. (parameter use for debugging) + seed: int, optional + Random seed for reproducibility. 
""" group_ids = self._groups_ids[group_id] non_group_ids = np.delete(np.arange(X.shape[1]), group_ids) @@ -188,7 +199,7 @@ def _joblib_predict_one_group(self, X, group_id, group_key): # where the j-th group of covariates is permuted X_perm = np.empty((self.n_permutations, X.shape[0], X.shape[1])) X_perm[:, :, non_group_ids] = np.delete(X, group_ids, axis=1) - X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id) + X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id, seed=seed) # Reshape X_perm to allow for batch prediction X_perm_batch = X_perm.reshape(-1, X.shape[1]) y_pred_perm = getattr(self.estimator, self.method)(X_perm_batch) @@ -202,6 +213,6 @@ def _joblib_predict_one_group(self, X, group_id, group_key): ) return y_pred_perm - def _permutation(self, X, group_id): + def _permutation(self, X, group_id, seed): """Method for creating the permuted data for the j-th group of covariates.""" raise NotImplementedError diff --git a/src/hidimstat/conditional_permutation_importance.py b/src/hidimstat/conditional_permutation_importance.py index e0a71e064..746ef873d 100644 --- a/src/hidimstat/conditional_permutation_importance.py +++ b/src/hidimstat/conditional_permutation_importance.py @@ -72,7 +72,7 @@ def __init__( self.categorical_max_cardinality = categorical_max_cardinality self.imputation_model_categorical = imputation_model_categorical self.imputation_model_continuous = imputation_model_continuous - self.random_state = random_state + self.random_state = check_random_state(random_state) def fit(self, X, y=None, groups=None, var_type="auto"): """Fit the imputation models. @@ -96,7 +96,6 @@ def fit(self, X, y=None, groups=None, var_type="auto"): self : object Returns the instance itself. """ - self.random_state = check_random_state(self.random_state) super().fit(X, None, groups=groups) if isinstance(var_type, str): self.var_type = [var_type for _ in range(self.n_groups)] @@ -105,7 +104,7 @@ def fit(self, X, y=None, groups=None, var_type="auto"): self._list_imputation_models = [ ConditionalSampler( - data_type=self.var_type[groupd_id], + data_type=self.var_type[group_id], model_regression=( None if self.imputation_model_continuous is None @@ -116,10 +115,13 @@ def fit(self, X, y=None, groups=None, var_type="auto"): if self.imputation_model_categorical is None else clone(self.imputation_model_categorical) ), - random_state=self.random_state, + random_state=seed, categorical_max_cardinality=self.categorical_max_cardinality, ) - for groupd_id in range(self.n_groups) + for group_id, seed in zip( + range(self.n_groups), + self.random_state.randint(0, np.iinfo(np.int32).max, self.n_groups), + ) ] # Parallelize the fitting of the covariate estimators @@ -149,7 +151,7 @@ def _check_fit(self): for m in self._list_imputation_models: check_is_fitted(m.model) - def _permutation(self, X, group_id): + def _permutation(self, X, group_id, seed): """Sample from the conditional distribution using a permutation of the residuals.""" X_j = X[:, self._groups_ids[group_id]].copy() diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 7dab07aeb..b01605051 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -6,6 +6,7 @@ from sklearn.preprocessing import StandardScaler from sklearn.utils import check_random_state from sklearn.utils.validation import check_memory +from sklearn.base import clone from hidimstat.gaussian_knockoff import ( gaussian_knockoff_generation, @@ -188,13 +189,9 @@ def model_x_knockoff( parallel = Parallel(n_jobs, 
verbose=joblib_verbose)
 
     # get the seed for the different run
-    if isinstance(random_state, (int, np.int32, np.int64)):
-        rng = check_random_state(random_state)
-    elif random_state is None:
-        rng = check_random_state(0)
-    else:
-        raise TypeError("Wrong type for random_state")
-    seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps)
+    seed_list = check_random_state(random_state).randint(
+        0, np.iinfo(np.int32).max, n_bootstraps
+    )
 
     if centered:
         X = StandardScaler().fit_transform(X)
@@ -225,7 +222,7 @@ def model_x_knockoff(
 
     results = parallel(
         delayed(memory.cache(_stat_coefficient_diff))(
-            X, X_tildes[i], y, estimator, fdr, preconfigure_estimator
+            X, X_tildes[i], y, clone(estimator), fdr, preconfigure_estimator
         )
         for i in range(n_bootstraps)
     )
diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py
index f1ea71894..6330e04a9 100644
--- a/src/hidimstat/leave_one_covariate_out.py
+++ b/src/hidimstat/leave_one_covariate_out.py
@@ -93,7 +93,7 @@ def _joblib_fit_one_group(self, estimator, X, y, key_groups):
         estimator.fit(X_minus_j, y)
         return estimator
 
-    def _joblib_predict_one_group(self, X, group_id, key_groups):
+    def _joblib_predict_one_group(self, X, group_id, key_groups, seed):
         """Predict the target variable after removing a group of covariates.
         Used in parallel."""
         X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1)
diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py
index 29d007656..485eecd65 100644
--- a/src/hidimstat/permutation_feature_importance.py
+++ b/src/hidimstat/permutation_feature_importance.py
@@ -53,14 +53,14 @@ def __init__(
             n_jobs=n_jobs,
             n_permutations=n_permutations,
         )
-        self.random_state = random_state
+        self.random_state = check_random_state(random_state)
 
-    def _permutation(self, X, group_id):
+    def _permutation(self, X, group_id, seed):
         """Create the permuted data for the j-th group of covariates"""
-        self.random_state = check_random_state(self.random_state)
+        rng = np.random.RandomState(seed)
         X_perm_j = np.array(
             [
-                self.random_state.permutation(X[:, self._groups_ids[group_id]].copy())
+                rng.permutation(X[:, self._groups_ids[group_id]].copy())
                 for _ in range(self.n_permutations)
             ]
         )

From 93fd34fcdcaf9cd9f0da87f261259f377c9f6d63 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Fri, 22 Aug 2025 18:28:59 +0200
Subject: [PATCH 03/46] Add script for testing reproducibility

---
 .../debugger_script/try_reproducibility.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 tools/examples/debugger_script/try_reproducibility.py

diff --git a/tools/examples/debugger_script/try_reproducibility.py b/tools/examples/debugger_script/try_reproducibility.py
new file mode 100644
index 000000000..d30aeeab2
--- /dev/null
+++ b/tools/examples/debugger_script/try_reproducibility.py
@@ -0,0 +1,17 @@
+import os
+import sys
+
+
+from joblib import Parallel, delayed
+
+# add the example for import them
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../examples")
+
+
+def run_joblib(i):
+    import plot_knockoff_aggregation  # importing the module runs the example under test
+
+
+# run the same example several times in parallel to compare the results
+parallel = Parallel(n_jobs=4)
+parallel(delayed(run_joblib)(i) for i in range(6))

From 4261f070fcc76e3167ad22da7e0bf7e05f9975eb Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Fri, 22 Aug 2025 18:46:13 +0200
Subject: [PATCH 04/46] fix error

---
 test/test_base_perturbation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/test/test_base_perturbation.py b/test/test_base_perturbation.py
index dd3ff6d6c..835c3f12e 100644
--- a/test/test_base_perturbation.py
+++ b/test/test_base_perturbation.py
@@ -11,4 +11,4 @@ def test_no_implemented_methods():
     estimator.fit(X[:, 0], X[:, 1])
     basic_class = BasePerturbation(estimator=estimator)
     with pytest.raises(NotImplementedError):
-        basic_class._permutation(X, group_id=None)
+        basic_class._permutation(X, group_id=None, seed=0)

From 46e9961e6ee90c732c83d4acfbe4f3b0bed4ad6f Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Mon, 25 Aug 2025 11:32:47 +0200
Subject: [PATCH 05/46] homogenize management of seeds in examples

---
 examples/plot_2D_simulation_example.py        | 17 ++++++-------
 .../plot_conditional_vs_marginal_xor_data.py  | 17 +++++++++------
 examples/plot_dcrt_example.py                 | 17 +++++++++++----
 ...ot_diabetes_variable_importance_example.py |  4 ++--
 examples/plot_fmri_data_example.py            | 16 +++++++++++---
 .../plot_importance_classification_iris.py    | 14 ++++++++-----
 examples/plot_knockoff_aggregation.py         | 11 ++++------
 examples/plot_knockoffs_wisconsin.py          | 16 +++++++-------
 examples/plot_model_agnostic_importance.py    | 21 ++++++++++++-------
 .../plot_pitfalls_permutation_importance.py   |  6 +++---
 10 files changed, 87 insertions(+), 52 deletions(-)

diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py
index 704078933..0986d2fb6 100644
--- a/examples/plot_2D_simulation_example.py
+++ b/examples/plot_2D_simulation_example.py
@@ -71,6 +71,9 @@
 from hidimstat.statistical_tools.p_values import zscore_from_pval
 from hidimstat._utils.scenario import multivariate_simulation
 
+# Define the seeds for the reproducibility of the example
+seeds = np.random.RandomState(0).randint(1e3, size=4)
+
 #############################################################################
 # Specific plotting functions
 # ---------------------------
@@ -172,7 +175,7 @@ def plot(maps, titles):
 
 # generating the data
 X_init, y, beta, epsilon, _, _ = multivariate_simulation(
-    n_samples, shape, roi_size, sigma, smooth_X, seed=1
+    n_samples, shape, roi_size, sigma, smooth_X, seed=seeds[0]
 )
 
 ##############################################################################
@@ -242,7 +245,9 @@
 # and referred to as Desparsified Lasso.
# compute desparsified lasso -beta_hat, sigma_hat, precision_diagonal = desparsified_lasso(X_init, y, n_jobs=n_jobs) +beta_hat, sigma_hat, precision_diagonal = desparsified_lasso( + X_init, y, n_jobs=n_jobs, seed=seeds[1] +) pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = ( desparsified_lasso_pvalue(X_init.shape[0], beta_hat, sigma_hat, precision_diagonal) ) @@ -270,7 +275,7 @@ def plot(maps, titles): # clustered desparsified lasso (CluDL) ward_, beta_hat, theta_hat, omega_diag = clustered_inference( - X_init, y, ward, n_clusters, scaler_sampling=StandardScaler() + X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=seeds[2] ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( clustered_inference_pvalue(n_samples, False, ward_, beta_hat, theta_hat, omega_diag) @@ -295,11 +300,7 @@ def plot(maps, titles): # ensemble of clustered desparsified lasso (EnCluDL) list_ward, list_beta_hat, list_theta_hat, list_omega_diag = ( ensemble_clustered_inference( - X_init, - y, - ward, - n_clusters, - scaler_sampling=StandardScaler(), + X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=seeds[3] ) ) beta_hat, selected_ecdl = ensemble_clustered_inference_pvalue( diff --git a/examples/plot_conditional_vs_marginal_xor_data.py b/examples/plot_conditional_vs_marginal_xor_data.py index 6b9b242ad..52f21ed9d 100644 --- a/examples/plot_conditional_vs_marginal_xor_data.py +++ b/examples/plot_conditional_vs_marginal_xor_data.py @@ -18,10 +18,13 @@ from hidimstat import CPI +# Define the seeds for the reproducibility of the example +rng = np.random.RandomState(0) +seeds = rng.randint(1e3, size=5) + ############################################################################# # To solve the XOR problem, we will use a Support Vector Classier (SVC) with Radial Basis Function (RBF) kernel. The decision function of # the fitted model shows that the model is able to separate the two classes. -rng = np.random.RandomState(0) X = rng.randn(400, 2) Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int) @@ -30,8 +33,10 @@ np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), 100), ) -X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0) -model = SVC(kernel="rbf", random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, Y, test_size=0.2, random_state=seeds[0] +) +model = SVC(kernel="rbf", random_state=seeds[1]) model.fit(X_train, y_train) Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()]) @@ -79,8 +84,8 @@ # features. Conditional importance, on the other hand, reveals that both features # are important (therefore rejecting the null hypothesis # :math:`Y \perp\!\!\!\perp X^1 | X^2`). 
-cv = KFold(n_splits=5, shuffle=True, random_state=0) -clf = SVC(kernel="rbf", random_state=0) +cv = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) +clf = SVC(kernel="rbf", random_state=seeds[3]) # Compute marginal importance using univariate models marginal_scores = [] for i in range(X.shape[1]): @@ -114,7 +119,7 @@ loss=hinge_loss, imputation_model_continuous=RidgeCV(np.logspace(-3, 3, 10)), n_permutations=50, - random_state=0, + random_state=seeds[4], ) vim.fit(X_train, y_train) importances.append(vim.importance(X_test, y_test)["importance"]) diff --git a/examples/plot_dcrt_example.py b/examples/plot_dcrt_example.py index b3f1efbcf..e8e301f29 100644 --- a/examples/plot_dcrt_example.py +++ b/examples/plot_dcrt_example.py @@ -22,6 +22,9 @@ from hidimstat import D0CRT from hidimstat._utils.scenario import multivariate_1D_simulation +# Define the seeds for the reproducibility of the example +seeds = np.random.RandomState(0).randint(1e3, size=10) + ############################################################################# # Processing the computations # --------------------------- @@ -29,7 +32,6 @@ results_list = [] for sim_ind in range(10): print(f"Processing: {sim_ind+1}") - np.random.seed(sim_ind) # Number of observations n = 100 @@ -45,14 +47,18 @@ alpha = 5e-2 X, y, _, __ = multivariate_1D_simulation( - n_samples=n, n_features=p, support_size=n_signal, rho=rho, seed=sim_ind + n_samples=n, n_features=p, support_size=n_signal, rho=rho, seed=seeds[sim_ind] ) # Applying a reLu function on the outcome y to get non-linear relationships y = np.maximum(0.0, y) ## dcrt Lasso ## - d0crt_lasso = D0CRT(estimator=LassoCV(random_state=42, n_jobs=1), screening=False) + d0crt_lasso = D0CRT( + estimator=LassoCV(random_state=seeds[sim_ind] + 1, n_jobs=1), + screening=False, + random_state=seeds[sim_ind] + 2, + ) d0crt_lasso.fit_importance(X, y) pvals_lasso = d0crt_lasso.pvalues_ results_list.append( @@ -65,8 +71,11 @@ ## dcrt Random Forest ## d0crt_random_forest = D0CRT( - estimator=RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1), + estimator=RandomForestRegressor( + n_estimators=100, random_state=seeds[sim_ind] + 3, n_jobs=1 + ), screening=False, + random_state=seeds[sim_ind] + 4, ) d0crt_random_forest.fit_importance(X, y) pvals_forest = d0crt_random_forest.pvalues_ diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index 53f7407fa..817dbb366 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -54,11 +54,11 @@ from sklearn.linear_model import LogisticRegressionCV, RidgeCV from sklearn.metrics import r2_score, root_mean_squared_error from sklearn.model_selection import KFold -from sklearn.utils import check_random_state from hidimstat import CPI, LOCO, PFI -seeds = check_random_state(42).randint(1, np.iinfo(np.int32).max, 7) +# Define the seeds for the reproducibility of the example +seeds = np.random.RandomState(0).randint(1e3, size=7) ############################################################################# # Load the diabetes dataset diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 6ca4d1d14..cbe6af5d2 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -61,6 +61,8 @@ ) from hidimstat.statistical_tools.p_values import zscore_from_pval +# Define the seeds for the reproducibility of the example +seeds = np.random.RandomState(0).randint(1e3, 
size=3) # Remmove warnings during loading data warnings.filterwarnings( @@ -152,7 +154,7 @@ def preprocess_haxby(subject=2, memory=None): # feature aggregation methods. try: beta_hat, sigma_hat, precision_diagonal = desparsified_lasso( - X, y, noise_method="median", max_iteration=1000 + X, y, noise_method="median", max_iteration=1000, seed=seeds[0], n_jobs=n_job ) pval_dl, _, one_minus_pval_dl, _, cb_min, cb_max = desparsified_lasso_pvalue( X.shape[0], beta_hat, sigma_hat, precision_diagonal @@ -166,7 +168,14 @@ def preprocess_haxby(subject=2, memory=None): # Now, the clustered inference algorithm which combines parcellation # and high-dimensional inference (c.f. References). ward_, beta_hat, theta_hat, omega_diag = clustered_inference( - X, y, ward, n_clusters, scaler_sampling=StandardScaler(), tolerance=1e-2 + X, + y, + ward, + n_clusters, + scaler_sampling=StandardScaler(), + tolerance=1e-2, + seed=seeds[1], + n_jobs=n_job, ) beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue( X.shape[0], None, ward_, beta_hat, theta_hat, omega_diag @@ -191,7 +200,8 @@ def preprocess_haxby(subject=2, memory=None): n_bootstraps=5, max_iteration=6000, tolerance=1e-2, - n_jobs=2, + seed=seeds[2], + n_jobs=n_job, ) ) beta_hat, selected = ensemble_clustered_inference_pvalue( diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index a3c166a92..47f4eb7c4 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -36,6 +36,11 @@ from hidimstat import CPI, PFI + +# Define the seeds for the reproducibility of the example +rng = np.random.RandomState(0) +seeds = rng.randint(1e3, size=4) + ######################################################################## # Load the iris dataset and add a spurious feature # ---------------------------------------------------------------------- @@ -44,7 +49,6 @@ # allows to illustrate that `PFI` is not robust to spurious features, # contrarily to `CPI`. 
dataset = load_iris() -rng = np.random.RandomState(0) X, y = dataset.data, dataset.target spurious_feat = X[:, 2] + X[:, 3] spurious_feat += rng.normal(size=X.shape[0], scale=np.std(spurious_feat) / 2) @@ -81,7 +85,7 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No alphas=np.logspace(-3, 3, 10), cv=KFold(shuffle=True, random_state=1) ), n_permutations=50, - random_state=0, + random_state=seeds[0], method=method, loss=loss, ) @@ -89,7 +93,7 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No vim = PFI( estimator=model_c, n_permutations=50, - random_state=0, + random_state=seeds[1], method=method, loss=loss, ) @@ -118,12 +122,12 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No Cs=np.logspace(-3, 3, 10), tol=1e-3, max_iter=1000, - cv=KFold(shuffle=True, random_state=2), + cv=KFold(shuffle=True, random_state=seeds[2]), ), GridSearchCV( SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}, - cv=KFold(shuffle=True, random_state=3), + cv=KFold(shuffle=True, random_state=seeds[3]), ), ] cv = KFold(n_splits=5, shuffle=True, random_state=4) diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index 14ea794a7..b77ee844c 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -22,7 +22,6 @@ import numpy as np from sklearn.linear_model import LassoCV from sklearn.model_selection import KFold -from sklearn.utils import check_random_state from hidimstat.knockoffs import ( model_x_knockoff, @@ -59,22 +58,20 @@ snr = 10 # number of repetitions for the bootstraps n_bootstraps = 25 -# seed for the random generator -seed = 45 # number of jobs for repetition of the method n_jobs = 2 # verbosity of the joblib joblib_verbose = 0 - -rng = check_random_state(seed) -seed_list = rng.randint(1, np.iinfo(np.int32).max, runs) +# Define the seeds for the reproducibility of the example +rng = np.random.RandomState(45) +seed_list = rng.randint(1e3, size=runs) ####################################################################### # Define the function for running the three procedures on the same data # --------------------------------------------------------------------- def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0): - seeds = check_random_state(seed).randint(1, np.iinfo(np.int32).max, 4) + seeds = np.random.RandomState(seed).randint(1, np.iinfo(np.int32).max, 4) # Generate data X, y, _, non_zero_index = multivariate_1D_simulation_AR( n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr diff --git a/examples/plot_knockoffs_wisconsin.py b/examples/plot_knockoffs_wisconsin.py index 8839ff332..a03843436 100644 --- a/examples/plot_knockoffs_wisconsin.py +++ b/examples/plot_knockoffs_wisconsin.py @@ -12,8 +12,10 @@ import numpy as np import pandas as pd -seed = 0 -rng = np.random.RandomState(seed) + +# Define the seeds for the reproducibility of the example +rng = np.random.RandomState(43) +seeds = rng.randint(1e3, size=5) ######################################################################################## @@ -32,7 +34,7 @@ y = data.target X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=seed + X, y, test_size=0.1, random_state=seeds[0] ) scaler = StandardScaler() @@ -53,7 +55,7 @@ from sklearn.linear_model import LogisticRegressionCV clf = LogisticRegressionCV( - Cs=np.logspace(-3, 3, 10), penalty="l1", solver="liblinear", random_state=rng + 
Cs=np.logspace(-3, 3, 10), penalty="l1", solver="liblinear", random_state=seeds[1] ) clf.fit(X_train, y_train) print(f"Accuracy of Lasso on test set: {clf.score(X_test, y_test):.3f}") @@ -100,7 +102,7 @@ Cs=np.logspace(-3, 3, 10), penalty="l1", solver="liblinear", - random_state=rng, + random_state=seeds[2], n_jobs=1, ) lasso_noisy.fit(noisy_train, y_train) @@ -150,12 +152,12 @@ solver="liblinear", penalty="l1", Cs=np.logspace(-3, 3, 10), - random_state=rng, + random_state=seeds[3], tol=1e-3, max_iter=1000, ), n_bootstraps=1, - random_state=0, + random_state=seeds[4], tol_gauss=1e-15, preconfigure_estimator=None, fdr=fdr, diff --git a/examples/plot_model_agnostic_importance.py b/examples/plot_model_agnostic_importance.py index 9ace442d4..039a71d96 100644 --- a/examples/plot_model_agnostic_importance.py +++ b/examples/plot_model_agnostic_importance.py @@ -37,11 +37,14 @@ from hidimstat import LOCO, D0CRT +# Define the seeds for the reproducibility of the example +rng = np.random.RandomState(0) +seeds = rng.randint(1e3, size=6) + ############################################################################# # Generate data where classes are not linearly separable # -------------------------------------------------------------- -rng = np.random.RandomState(0) -X, y = make_circles(n_samples=500, noise=0.1, factor=0.6, random_state=rng) +X, y = make_circles(n_samples=500, noise=0.1, factor=0.6, random_state=seeds[0]) fig, ax = plt.subplots() @@ -60,8 +63,8 @@ ############################################################################### # Define a linear and a non-linear estimator # ------------------------------------------ -non_linear_model = SVC(kernel="rbf", random_state=0) -linear_model = LogisticRegressionCV(Cs=np.logspace(-3, 3, 5)) +non_linear_model = SVC(kernel="rbf", random_state=seeds[1]) +linear_model = LogisticRegressionCV(Cs=np.logspace(-3, 3, 5), random_state=seeds[2]) ############################################################################### # Compute p-values using d0CRT @@ -70,11 +73,15 @@ # test (:math:`H_0: X_j \perp\!\!\!\perp y | X_{-j}`) for each variable. However, # this test is based on a linear model (LogisticRegression) and fails to reject the null # in the presence of non-linear relationships. -d0crt_linear = D0CRT(estimator=clone(linear_model), screening=False) +d0crt_linear = D0CRT( + estimator=clone(linear_model), screening=False, random_state=seeds[3] +) d0crt_linear.fit_importance(X, y) pval_dcrt_linear = d0crt_linear.pvalues_ -d0crt_non_linear = D0CRT(estimator=clone(non_linear_model), screening=False) +d0crt_non_linear = D0CRT( + estimator=clone(non_linear_model), screening=False, random_state=seeds[4] +) d0crt_non_linear.fit_importance(X, y) pval_dcrt_non_linear = d0crt_non_linear.pvalues_ @@ -86,7 +93,7 @@ # misspecified model, such as a linear model for this dataset, LOCO fails to reject the null # similarly to d0CRT. However, when using a non-linear model (SVC), LOCO is able to # identify the important variables. 
-cv = KFold(n_splits=5, shuffle=True, random_state=0)
+cv = KFold(n_splits=5, shuffle=True, random_state=seeds[5])
 
 importances_linear = []
 importances_non_linear = []
diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py
index 2592345c4..b3ae99e9f 100644
--- a/examples/plot_pitfalls_permutation_importance.py
+++ b/examples/plot_pitfalls_permutation_importance.py
@@ -24,13 +24,13 @@
 from sklearn.neural_network import MLPRegressor
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
-from sklearn.utils import check_random_state
 
 from hidimstat import CPI, PFI
 from hidimstat.conditional_sampling import ConditionalSampler
 
-rng = check_random_state(42)
-seeds = rng.randint(1, np.iinfo(np.int32).max, 9)
+# Define the seeds for the reproducibility of the example
+rng = np.random.RandomState(0)
+seeds = rng.randint(1e3, size=9)
 
 #############################################################################
 # Load the California housing dataset and add a spurious feature

From 8085706443c315b76bda524b0105109357cc5b7a Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Mon, 25 Aug 2025 16:30:30 +0200
Subject: [PATCH 06/46] fix randomization in plot_knockoff_aggregation

---
 examples/plot_knockoff_aggregation.py | 10 ++++++----
 src/hidimstat/knockoffs.py            |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py
index b77ee844c..e27bdda1a 100644
--- a/examples/plot_knockoff_aggregation.py
+++ b/examples/plot_knockoff_aggregation.py
@@ -71,7 +71,7 @@
 # Define the function for running the three procedures on the same data
 # ---------------------------------------------------------------------
 def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0):
-    seeds = np.random.RandomState(seed).randint(1, np.iinfo(np.int32).max, 4)
+    seeds = np.random.RandomState(seed).randint(np.iinfo(np.int32).max, 6)
     # Generate data
     X, y, _, non_zero_index = multivariate_1D_simulation_AR(
         n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr
@@ -84,9 +84,10 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see
         estimator=LassoCV(
             n_jobs=1,
             cv=KFold(n_splits=5, shuffle=True, random_state=seeds[0]),
+            random_state=seeds[1],
         ),
         n_bootstraps=1,
-        random_state=seeds[1],
+        random_state=seeds[2],
     )
     mx_selection, _ = model_x_knockoff_pvalue(test_scores, fdr=fdr)
     fdp_mx, power_mx = fdp_power(mx_selection, non_zero_index)
@@ -97,11 +98,12 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see
         y,
         estimator=LassoCV(
             n_jobs=1,
-            cv=KFold(n_splits=5, shuffle=True, random_state=seeds[2]),
+            cv=KFold(n_splits=5, shuffle=True, random_state=seeds[3]),
+            random_state=seeds[4],
         ),
         n_bootstraps=n_bootstraps,
         n_jobs=1,
-        random_state=seeds[3],
+        random_state=seeds[5],
     )
 
     # Use p-values aggregation [2]
diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py
index b01605051..ef1629be9 100644
--- a/src/hidimstat/knockoffs.py
+++ b/src/hidimstat/knockoffs.py
@@ -190,7 +190,7 @@ def model_x_knockoff(
 
     # get the seed for the different run
     seed_list = check_random_state(random_state).randint(
-        0, np.iinfo(np.int32).max, n_bootstraps
+        np.iinfo(np.int32).max, size=n_bootstraps
     )
 
     if centered:
         X = StandardScaler().fit_transform(X)

From e68a9a3e818ba94de93f503da6c789976668c793 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Mon, 25 Aug 2025 16:48:23 +0200
Subject: [PATCH 07/46] fix error in plot

---
examples/plot_knockoff_aggregation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index e27bdda1a..43d62d79d 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -71,7 +71,7 @@ # Define the function for running the three procedures on the same data # --------------------------------------------------------------------- def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0): - seeds = np.random.RandomState(seed).randint(np.iinfo(np.int32).max, 6) + seeds = np.random.RandomState(seed).randint(np.iinfo(np.int32).max, size=6) # Generate data X, y, _, non_zero_index = multivariate_1D_simulation_AR( n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr From 0b90779f813721c0bed60ad41b0493138d1dff6b Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 25 Aug 2025 18:32:48 +0200 Subject: [PATCH 08/46] fix path --- tools/examples/debugger_script/try_reproducibility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/examples/debugger_script/try_reproducibility.py b/tools/examples/debugger_script/try_reproducibility.py index d30aeeab2..95d3c804a 100644 --- a/tools/examples/debugger_script/try_reproducibility.py +++ b/tools/examples/debugger_script/try_reproducibility.py @@ -5,7 +5,7 @@ from joblib import Parallel, delayed # add the example for import them -sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../examples") +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../examples") def run_joblib(i): From 8f4011f7a541705ff631d9a527ba8cafa33e4fdf Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 25 Aug 2025 18:34:22 +0200 Subject: [PATCH 09/46] fix seed for seaborn --- examples/plot_knockoff_aggregation.py | 2 +- .../plot_pitfalls_permutation_importance.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index 43d62d79d..699fd2cee 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -71,7 +71,7 @@ # Define the function for running the three procedures on the same data # --------------------------------------------------------------------- def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0): - seeds = np.random.RandomState(seed).randint(np.iinfo(np.int32).max, size=6) + seeds = np.random.RandomState(seed).randint(1, np.iinfo(np.int32).max, 6) # Generate data X, y, _, non_zero_index = multivariate_1D_simulation_AR( n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index 305fdd10f..908b587d4 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -30,7 +30,7 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = rng.randint(1e3, size=9) +seeds = rng.randint(1e3, size=10) ############################################################################# # Load the California housing dataset and add a spurious feature @@ -150,10 +150,7 @@ # Create a horizontal boxplot of permutation importances fig, ax = plt.subplots() sns.barplot( - permutation_importances, - orient="h", - color="tab:blue", - capsize=0.2, + 
permutation_importances, orient="h", color="tab:blue", capsize=0.2, seed=seeds[4] ) ax.set_xlabel("Permutation Importance") # Add asterisks for features with p-values below the threshold @@ -203,9 +200,9 @@ model_c, imputation_model_continuous=RidgeCV( alphas=np.logspace(-3, 3, 5), - cv=KFold(shuffle=True, random_state=seeds[4]), + cv=KFold(shuffle=True, random_state=seeds[5]), ), - random_state=seeds[5], + random_state=seeds[6], n_jobs=5, ) cfi.fit(X_test, y_test) @@ -261,14 +258,14 @@ X_train, X_test = train_test_split( X, test_size=0.3, - random_state=seeds[6], + random_state=seeds[7], ) conditional_sampler = ConditionalSampler( model_regression=RidgeCV( - alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=seeds[7]) + alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=seeds[8]) ), - random_state=seeds[8], + random_state=seeds[9], ) From fc0d21fbaf847cf471841cb6eea53ff984d46415 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 26 Aug 2025 12:17:41 +0200 Subject: [PATCH 10/46] improve seed setting --- examples/plot_2D_simulation_example.py | 2 +- examples/plot_conditional_vs_marginal_xor_data.py | 2 +- examples/plot_dcrt_example.py | 2 +- examples/plot_diabetes_variable_importance_example.py | 2 +- examples/plot_fmri_data_example.py | 2 +- examples/plot_importance_classification_iris.py | 2 +- examples/plot_knockoff_aggregation.py | 4 ++-- examples/plot_knockoffs_wisconsin.py | 2 +- examples/plot_model_agnostic_importance.py | 2 +- examples/plot_pitfalls_permutation_importance.py | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index 0986d2fb6..ae600712e 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -72,7 +72,7 @@ from hidimstat._utils.scenario import multivariate_simulation # Define the seeds for the reproducibility of the example -seeds = np.random.RandomState(0).randint(1e3, size=4) +seeds = np.arange(4) + 20 ############################################################################# # Specific plotting functions diff --git a/examples/plot_conditional_vs_marginal_xor_data.py b/examples/plot_conditional_vs_marginal_xor_data.py index 250e99a1a..2bac762cb 100644 --- a/examples/plot_conditional_vs_marginal_xor_data.py +++ b/examples/plot_conditional_vs_marginal_xor_data.py @@ -20,7 +20,7 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = rng.randint(1e3, size=5) +seeds = range(1, 6) ############################################################################# # To solve the XOR problem, we will use a Support Vector Classier (SVC) with Radial Basis Function (RBF) kernel. 
The decision function of diff --git a/examples/plot_dcrt_example.py b/examples/plot_dcrt_example.py index e8e301f29..91113ec12 100644 --- a/examples/plot_dcrt_example.py +++ b/examples/plot_dcrt_example.py @@ -23,7 +23,7 @@ from hidimstat._utils.scenario import multivariate_1D_simulation # Define the seeds for the reproducibility of the example -seeds = np.random.RandomState(0).randint(1e3, size=10) +seeds = np.arange(10) + 10 ############################################################################# # Processing the computations diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index de2a60386..35ba9a810 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -58,7 +58,7 @@ from hidimstat import CFI, LOCO, PFI # Define the seeds for the reproducibility of the example -seeds = np.random.RandomState(0).randint(1e3, size=7) +seeds = np.arange(7) + 20 ############################################################################# # Load the diabetes dataset diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index cbe6af5d2..6571afed8 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -62,7 +62,7 @@ from hidimstat.statistical_tools.p_values import zscore_from_pval # Define the seeds for the reproducibility of the example -seeds = np.random.RandomState(0).randint(1e3, size=3) +seeds = range(3) # Remmove warnings during loading data warnings.filterwarnings( diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index 8eb4a425c..a889c75a8 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -39,7 +39,7 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = rng.randint(1e3, size=4) +seeds = np.arange(4) + 10 ######################################################################## # Load the iris dataset and add a spurious feature diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index 699fd2cee..f7c816996 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -64,14 +64,14 @@ joblib_verbose = 0 # Define the seeds for the reproducibility of the example rng = np.random.RandomState(45) -seed_list = rng.randint(1e3, size=runs) +seed_list = range(runs) ####################################################################### # Define the function for running the three procedures on the same data # --------------------------------------------------------------------- def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0): - seeds = np.random.RandomState(seed).randint(1, np.iinfo(np.int32).max, 6) + seeds = np.arange(6) * seed # Generate data X, y, _, non_zero_index = multivariate_1D_simulation_AR( n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr diff --git a/examples/plot_knockoffs_wisconsin.py b/examples/plot_knockoffs_wisconsin.py index a03843436..3547dd746 100644 --- a/examples/plot_knockoffs_wisconsin.py +++ b/examples/plot_knockoffs_wisconsin.py @@ -15,7 +15,7 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(43) -seeds = rng.randint(1e3, size=5) +seeds = np.arange(5) + 16 
######################################################################################## diff --git a/examples/plot_model_agnostic_importance.py b/examples/plot_model_agnostic_importance.py index 039a71d96..44fc334d0 100644 --- a/examples/plot_model_agnostic_importance.py +++ b/examples/plot_model_agnostic_importance.py @@ -39,7 +39,7 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = rng.randint(1e3, size=6) +seeds = range(1, 7) ############################################################################# # Generate data where classes are not linearly separable diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index 908b587d4..dc3b27e68 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -30,7 +30,7 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = rng.randint(1e3, size=10) +seeds = range(1, 11) ############################################################################# # Load the California housing dataset and add a spurious feature From 99ed1e6885f13c34c6a4bbbeda1d6a7a58b65fce Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 26 Aug 2025 14:44:20 +0200 Subject: [PATCH 11/46] change seed in methods --- examples/plot_2D_simulation_example.py | 2 +- examples/plot_diabetes_variable_importance_example.py | 10 +++++----- examples/plot_knockoff_aggregation.py | 4 ++-- src/hidimstat/base_perturbation.py | 4 ++-- src/hidimstat/conditional_feature_importance.py | 3 ++- src/hidimstat/knockoffs.py | 4 ++-- 6 files changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index ae600712e..df8fe5831 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -72,7 +72,7 @@ from hidimstat._utils.scenario import multivariate_simulation # Define the seeds for the reproducibility of the example -seeds = np.arange(4) + 20 +seeds = np.arange(4) + 70 ############################################################################# # Specific plotting functions diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index 35ba9a810..90457af27 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -58,7 +58,7 @@ from hidimstat import CFI, LOCO, PFI # Define the seeds for the reproducibility of the example -seeds = np.arange(7) + 20 +seeds = np.arange(6) + 20 ############################################################################# # Load the diabetes dataset @@ -106,14 +106,14 @@ estimator=regressor_list[i], imputation_model_continuous=RidgeCV( alphas=np.logspace(-3, 3, 10), - cv=KFold(shuffle=True, random_state=seeds[3]), + cv=KFold(shuffle=True, random_state=seeds[2]), ), imputation_model_categorical=LogisticRegressionCV( Cs=np.logspace(-2, 2, 10), - cv=KFold(shuffle=True, random_state=seeds[4]), + cv=KFold(shuffle=True, random_state=seeds[3]), ), n_permutations=50, - random_state=seeds[5], + random_state=seeds[4], n_jobs=4, ) cfi.fit(X_train, y_train) @@ -152,7 +152,7 @@ pfi = PFI( estimator=regressor_list[i], n_permutations=50, - random_state=seeds[6], + random_state=seeds[5], n_jobs=4, ) pfi.fit(X_train, y_train) diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index 
f7c816996..fed98e009 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -63,8 +63,8 @@ # verbosity of the joblib joblib_verbose = 0 # Define the seeds for the reproducibility of the example -rng = np.random.RandomState(45) -seed_list = range(runs) +rng = np.random.RandomState(42) +seed_list = np.arange(runs) + 10 ####################################################################### diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 88c31952e..b9a759fac 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -109,8 +109,8 @@ def predict(self, X): if self.random_state is None: list_seed = [None for i in range(self.n_groups)] else: - list_seed = self.random_state.randint( - 1, np.iinfo(np.int32).max, self.n_groups + list_seed = self.random_state.randint(np.iinfo(np.int32).max) + np.arange( + self.n_groups ) out_list = Parallel(n_jobs=self.n_jobs)( delayed(self._joblib_predict_one_group)(X_, group_id, group_key, seed) diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index de2ca5cd9..52cd3b2a1 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -120,7 +120,8 @@ def fit(self, X, y=None, groups=None, var_type="auto"): ) for group_id, seed in zip( range(self.n_groups), - self.random_state.randint(0, np.iinfo(np.int32).max, self.n_groups), + self.random_state.randint(np.iinfo(np.int32).max) + + np.arange(self.n_groups), ) ] diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index ef1629be9..37918117e 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -190,8 +190,8 @@ def model_x_knockoff( # get the seed for the different run seed_list = check_random_state(random_state).randint( - np.iinfo(np.int32).max, size=n_bootstraps - ) + np.iinfo(np.int32).max + ) + np.arange(n_bootstraps) if centered: X = StandardScaler().fit_transform(X) From 9ac20595496572c9ec7e6647cb86954f34723236 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 26 Aug 2025 15:20:49 +0200 Subject: [PATCH 12/46] Fix seed in test and example --- examples/plot_importance_classification_iris.py | 2 +- test/test_conditional_feature_importance.py | 6 +++--- test/test_knockoff.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index 4a9444f26..431868383 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -39,7 +39,7 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = np.arange(4) + 10 +seeds = np.arange(5) ######################################################################## # Load the iris dataset and add a spurious feature diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index f3bec9735..097b5dd8f 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -86,7 +86,7 @@ def run_cfi(X, y, n_permutation, seed): zip(*(list(zip(*parameter_exact))[1:])), ids=list(zip(*parameter_exact))[0], ) -@pytest.mark.parametrize("n_permutation, cfi_seed", [(10, 0)], ids=["default_cfi"]) +@pytest.mark.parametrize("n_permutation, cfi_seed", [(10, 5)], ids=["default_cfi"]) def test_linear_data_exact(data_generator, 
n_permutation, cfi_seed): """Tests the method on linear cases with noise and correlation""" X, y, important_features, _ = data_generator @@ -118,7 +118,7 @@ def test_linear_data_exact(data_generator, n_permutation, cfi_seed): zip(*(list(zip(*parameter_partial))[1:])), ids=list(zip(*parameter_partial))[0], ) -@pytest.mark.parametrize("n_permutation, cfi_seed", [(10, 0)], ids=["default_cfi"]) +@pytest.mark.parametrize("n_permutation, cfi_seed", [(10, 5)], ids=["default_cfi"]) def test_linear_data_partial(data_generator, n_permutation, cfi_seed): """Tests the method on linear cases with noise and correlation""" X, y, important_features, _ = data_generator @@ -141,7 +141,7 @@ def test_linear_data_partial(data_generator, n_permutation, cfi_seed): [(150, 200, 10, 0.2, 42, 1.0, 1.0, 0.0)], ids=["high level noise"], ) -@pytest.mark.parametrize("n_permutation, cfi_seed", [(20, 0)], ids=["default_cfi"]) +@pytest.mark.parametrize("n_permutation, cfi_seed", [(20, 5)], ids=["default_cfi"]) def test_linear_data_fail(data_generator, n_permutation, cfi_seed): """Tests when the method doesn't identify all important features""" X, y, important_features, not_important_features = data_generator diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 3b0e86aa1..32f1b89e3 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -206,7 +206,7 @@ def test_estimate_distribution(): """ test different estimation of the covariance """ - seed = 42 + seed = 0 fdr = 0.1 n = 100 p = 50 @@ -227,10 +227,10 @@ def test_estimate_distribution(): y, cov_estimator=GraphicalLassoCV( alphas=[1e-3, 1e-2, 1e-1, 1], - cv=KFold(n_splits=5, shuffle=True, random_state=0), + cv=KFold(n_splits=5, shuffle=True, random_state=seed + 2), ), n_bootstraps=1, - random_state=seed + 2, + random_state=seed + 3, fdr=fdr, ) for i in selected: From 5d5a6d8c7b6b964d78288a27490ab7a19a4ffd44 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 27 Aug 2025 10:34:33 +0200 Subject: [PATCH 13/46] Fix seeds --- examples/plot_2D_simulation_example.py | 11 +++---- .../plot_conditional_vs_marginal_xor_data.py | 13 ++++----- examples/plot_dcrt_example.py | 13 ++++----- ...ot_diabetes_variable_importance_example.py | 21 ++++++-------- examples/plot_fmri_data_example.py | 12 ++++---- .../plot_importance_classification_iris.py | 11 ++++--- examples/plot_knockoff_aggregation.py | 3 +- examples/plot_knockoffs_wisconsin.py | 11 ++++--- examples/plot_model_agnostic_importance.py | 15 ++++------ .../plot_pitfalls_permutation_importance.py | 29 ++++++++----------- 10 files changed, 57 insertions(+), 82 deletions(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index 77ffbdda3..022cc4620 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -71,9 +71,6 @@ from hidimstat.statistical_tools.p_values import zscore_from_pval from hidimstat._utils.scenario import multivariate_simulation_spatial -# Define the seeds for the reproducibility of the example -seeds = np.arange(4) + 70 - ############################################################################# # Specific plotting functions # --------------------------- @@ -175,7 +172,7 @@ def plot(maps, titles): # generating the data X_init, y, beta, epsilon = multivariate_simulation_spatial( - n_samples, shape, roi_size, signal_noise_ratio, smooth_X, seed=seeds[0] + n_samples, shape, roi_size, signal_noise_ratio, smooth_X, seed=70 ) ############################################################################## @@ 
-246,7 +243,7 @@ def plot(maps, titles): # compute desparsified lasso beta_hat, sigma_hat, precision_diagonal = desparsified_lasso( - X_init, y, n_jobs=n_jobs, seed=seeds[1] + X_init, y, n_jobs=n_jobs, seed=71 ) pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = ( desparsified_lasso_pvalue(X_init.shape[0], beta_hat, sigma_hat, precision_diagonal) @@ -275,7 +272,7 @@ def plot(maps, titles): # clustered desparsified lasso (CluDL) ward_, beta_hat, theta_hat, omega_diag = clustered_inference( - X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=seeds[2] + X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=72 ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( clustered_inference_pvalue(n_samples, False, ward_, beta_hat, theta_hat, omega_diag) @@ -300,7 +297,7 @@ def plot(maps, titles): # ensemble of clustered desparsified lasso (EnCluDL) list_ward, list_beta_hat, list_theta_hat, list_omega_diag = ( ensemble_clustered_inference( - X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=seeds[3] + X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=73 ) ) beta_hat, selected_ecdl = ensemble_clustered_inference_pvalue( diff --git a/examples/plot_conditional_vs_marginal_xor_data.py b/examples/plot_conditional_vs_marginal_xor_data.py index 2bac762cb..2b192574e 100644 --- a/examples/plot_conditional_vs_marginal_xor_data.py +++ b/examples/plot_conditional_vs_marginal_xor_data.py @@ -20,7 +20,6 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = range(1, 6) ############################################################################# # To solve the XOR problem, we will use a Support Vector Classifier (SVC) with Radial Basis Function (RBF) kernel. The decision function of @@ -33,10 +32,8 @@ np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), 100), ) -X_train, X_test, y_train, y_test = train_test_split( - X, Y, test_size=0.2, random_state=seeds[0] -) +X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1) -model = SVC(kernel="rbf", random_state=seeds[1]) +model = SVC(kernel="rbf", random_state=2) model.fit(X_train, y_train) Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()]) @@ -84,8 +81,8 @@ # features. Conditional importance, on the other hand, reveals that both features # are important (therefore rejecting the null hypothesis # :math:`Y \perp\!\!\!\perp X^1 | X^2`).
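# The example diffs in this stretch swap seed arrays drawn from one shared
# RandomState for hard-coded literals. A minimal sketch of the two patterns,
# with illustrative values only:
import numpy as np

# Before: call sites index into a shared seed array, so inserting or removing
# a single draw silently shifts every seed that follows it.
rng = np.random.RandomState(0)
seeds = rng.randint(int(1e3), size=6)
split_seed = seeds[0]

# After: each call site owns a literal seed and stays stable under edits.
split_seed = 1
model_seed = 2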
-cv = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) -clf = SVC(kernel="rbf", random_state=seeds[3]) +cv = KFold(n_splits=5, shuffle=True, random_state=3) +clf = SVC(kernel="rbf", random_state=4) # Compute marginal importance using univariate models marginal_scores = [] for i in range(X.shape[1]): @@ -119,7 +116,7 @@ loss=hinge_loss, imputation_model_continuous=RidgeCV(np.logspace(-3, 3, 10)), n_permutations=50, - random_state=seeds[4], + random_state=5, ) vim.fit(X_train, y_train) importances.append(vim.importance(X_test, y_test)["importance"]) diff --git a/examples/plot_dcrt_example.py b/examples/plot_dcrt_example.py index fcd441db4..3e7aad3b3 100644 --- a/examples/plot_dcrt_example.py +++ b/examples/plot_dcrt_example.py @@ -22,9 +22,6 @@ from hidimstat import D0CRT from hidimstat._utils.scenario import multivariate_simulation -# Define the seeds for the reproducibility of the example -seeds = np.arange(10) + 10 - ############################################################################# # Processing the computations # --------------------------- @@ -53,7 +50,7 @@ rho=rho, signal_noise_ratio=signal_noise_ratio, shuffle=True, - seed=seeds[sim_ind], + seed=sim_ind + 10, ) # Applying a reLu function on the outcome y to get non-linear relationships @@ -61,9 +58,9 @@ ## dcrt Lasso ## d0crt_lasso = D0CRT( - estimator=LassoCV(random_state=seeds[sim_ind] + 1, n_jobs=1), + estimator=LassoCV(random_state=sim_ind + 11, n_jobs=1), screening=False, - random_state=seeds[sim_ind] + 2, + random_state=sim_ind + 12, ) d0crt_lasso.fit_importance(X, y) pvals_lasso = d0crt_lasso.pvalues_ @@ -79,10 +76,10 @@ ## dcrt Random Forest ## d0crt_random_forest = D0CRT( estimator=RandomForestRegressor( - n_estimators=100, random_state=seeds[sim_ind] + 3, n_jobs=1 + n_estimators=100, random_state=sim_ind + 13, n_jobs=1 ), screening=False, - random_state=seeds[sim_ind] + 4, + random_state=sim_ind + 14, ) d0crt_random_forest.fit_importance(X, y) pvals_forest = d0crt_random_forest.pvalues_ diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index 90457af27..fd2a10c0e 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -57,9 +57,6 @@ from hidimstat import CFI, LOCO, PFI -# Define the seeds for the reproducibility of the example -seeds = np.arange(6) + 20 - ############################################################################# # Load the diabetes dataset # ------------------------- @@ -76,10 +73,10 @@ n_folds = 5 regressor = RidgeCV( alphas=np.logspace(-3, 3, 10), - cv=KFold(shuffle=True, random_state=seeds[0]), + cv=KFold(shuffle=True, random_state=20), ) regressor_list = [clone(regressor) for _ in range(n_folds)] -kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) +kf = KFold(n_splits=n_folds, shuffle=True, random_state=21) for i, (train_index, test_index) in enumerate(kf.split(X)): regressor_list[i].fit(X[train_index], y[train_index]) score = r2_score( @@ -97,7 +94,7 @@ # -------------------------------------------------------- cfi_importance_list = [] -kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) +kf = KFold(n_splits=n_folds, shuffle=True, random_state=21) for i, (train_index, test_index) in enumerate(kf.split(X)): print(f"Fold {i}") X_train, X_test = X[train_index], X[test_index] @@ -106,14 +103,14 @@ estimator=regressor_list[i], imputation_model_continuous=RidgeCV( alphas=np.logspace(-3, 3, 10), - 
cv=KFold(shuffle=True, random_state=seeds[2]), + cv=KFold(shuffle=True, random_state=22), ), imputation_model_categorical=LogisticRegressionCV( Cs=np.logspace(-2, 2, 10), - cv=KFold(shuffle=True, random_state=seeds[3]), + cv=KFold(shuffle=True, random_state=23), ), n_permutations=50, - random_state=seeds[4], + random_state=24, n_jobs=4, ) cfi.fit(X_train, y_train) @@ -125,7 +122,7 @@ # --------------------------------------------------------- loco_importance_list = [] -kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) +kf = KFold(n_splits=n_folds, shuffle=True, random_state=21) for i, (train_index, test_index) in enumerate(kf.split(X)): print(f"Fold {i}") X_train, X_test = X[train_index], X[test_index] @@ -144,7 +141,7 @@ # ---------------------------------------------------------------- pfi_importance_list = [] -kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) +kf = KFold(n_splits=n_folds, shuffle=True, random_state=21) for i, (train_index, test_index) in enumerate(kf.split(X)): print(f"Fold {i}") X_train, X_test = X[train_index], X[test_index] @@ -152,7 +149,7 @@ pfi = PFI( estimator=regressor_list[i], n_permutations=50, - random_state=seeds[5], + random_state=25, n_jobs=4, ) pfi.fit(X_train, y_train) diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 6571afed8..8f2a3bb19 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -61,9 +61,6 @@ ) from hidimstat.statistical_tools.p_values import zscore_from_pval -# Define the seeds for the reproducibility of the example -seeds = range(3) - # Remove warnings during loading data warnings.filterwarnings( "ignore", message="The provided image has no sform in its header." @@ -71,6 +68,7 @@ # Limit the resource use for the example to 5 G. resource.setrlimit(resource.RLIMIT_AS, (int(5 * 1e9), int(5 * 1e9))) +# Set the number of jobs for the methods n_job = 1 @@ -154,7 +152,7 @@ def preprocess_haxby(subject=2, memory=None): # feature aggregation methods. try: beta_hat, sigma_hat, precision_diagonal = desparsified_lasso( - X, y, noise_method="median", max_iteration=1000, seed=seeds[0], n_jobs=n_job + X, y, noise_method="median", max_iteration=1000, seed=0, n_jobs=n_job ) pval_dl, _, one_minus_pval_dl, _, cb_min, cb_max = desparsified_lasso_pvalue( X.shape[0], beta_hat, sigma_hat, precision_diagonal ) @@ -174,7 +172,7 @@ def preprocess_haxby(subject=2, memory=None): n_clusters, scaler_sampling=StandardScaler(), tolerance=1e-2, - seed=seeds[1], + seed=1, n_jobs=n_job, ) beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue( @@ -188,7 +186,7 @@ def preprocess_haxby(subject=2, memory=None): # which means that 5 different parcellations are considered and # then 5 statistical maps are produced and aggregated into one. # However you might benefit from clustering randomization taking -# `n_bootstraps=25` or `n_bootstraps=100`, also we set `n_jobs=2`. +# `n_bootstraps=25` or `n_bootstraps=100`, also we set `n_jobs`.
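# As the comment above notes, larger ``n_bootstraps`` values randomize the
# clustering more thoroughly. Each bootstrap then needs its own stream while
# the whole ensemble stays replayable from one seed. A hedged sketch of the
# usual numpy pattern (not the internal code of ensemble_clustered_inference):
import numpy as np


def bootstrap_rngs(root_seed, n_bootstraps):
    # SeedSequence.spawn yields statistically independent child streams,
    # which is safer than seeding the i-th bootstrap with root_seed + i.
    children = np.random.SeedSequence(root_seed).spawn(n_bootstraps)
    return [np.random.default_rng(child) for child in children]


rngs = bootstrap_rngs(root_seed=2, n_bootstraps=5)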
list_ward, list_beta_hat, list_theta_hat, list_omega_diag = ( ensemble_clustered_inference( X, @@ -200,7 +198,7 @@ def preprocess_haxby(subject=2, memory=None): n_bootstraps=5, max_iteration=6000, tolerance=1e-2, - seed=seeds[2], + seed=2, n_jobs=n_job, ) ) diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index 431868383..936ac436b 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -39,7 +39,6 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = np.arange(5) ######################################################################## # Load the iris dataset and add a spurious feature @@ -85,7 +84,7 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CFI", groups=No alphas=np.logspace(-3, 3, 10), cv=KFold(shuffle=True, random_state=1) ), n_permutations=50, - random_state=seeds[0], + random_state=2, method=method, loss=loss, ) @@ -93,7 +92,7 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CFI", groups=No vim = PFI( estimator=model_c, n_permutations=50, - random_state=seeds[1], + random_state=3, method=method, loss=loss, ) @@ -122,15 +121,15 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CFI", groups=No Cs=np.logspace(-3, 3, 10), tol=1e-3, max_iter=1000, - cv=KFold(shuffle=True, random_state=seeds[2]), + cv=KFold(shuffle=True, random_state=4), ), GridSearchCV( SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}, - cv=KFold(shuffle=True, random_state=seeds[3]), + cv=KFold(shuffle=True, random_state=5), ), ] -cv = KFold(n_splits=5, shuffle=True, random_state=seeds[4]) +cv = KFold(n_splits=5, shuffle=True, random_state=6) groups = {ft: [i] for i, ft in enumerate(dataset.feature_names)} out_list = Parallel(n_jobs=5)( delayed(run_one_fold)( diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index 7da600ab1..eb08a61f4 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -64,7 +64,6 @@ joblib_verbose = 0 # Define the seeds for the reproducibility of the example rng = np.random.RandomState(42) -seed_list = np.arange(runs) + 10 ####################################################################### @@ -189,7 +188,7 @@ def effect_number_samples(n_samples): n_bootstraps, seed=seed, ) - for seed in seed_list + for seed in range(10, 10 + runs) ) fdps_mx = [] diff --git a/examples/plot_knockoffs_wisconsin.py b/examples/plot_knockoffs_wisconsin.py index 3547dd746..c5cc6d07b 100644 --- a/examples/plot_knockoffs_wisconsin.py +++ b/examples/plot_knockoffs_wisconsin.py @@ -15,7 +15,6 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(43) -seeds = np.arange(5) + 16 ######################################################################################## @@ -34,7 +33,7 @@ y = data.target X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.1, random_state=seeds[0] + X, y, test_size=0.1, random_state=16 ) scaler = StandardScaler() @@ -55,7 +54,7 @@ from sklearn.linear_model import LogisticRegressionCV clf = LogisticRegressionCV( - Cs=np.logspace(-3, 3, 10), penalty="l1", solver="liblinear", random_state=seeds[1] + Cs=np.logspace(-3, 3, 10), penalty="l1", solver="liblinear", random_state=17 ) clf.fit(X_train, y_train) print(f"Accuracy of Lasso on test set: {clf.score(X_test, y_test):.3f}") @@ -102,7 +101,7 @@ Cs=np.logspace(-3, 3, 10), penalty="l1", 
solver="liblinear", - random_state=seeds[2], + random_state=18, n_jobs=1, ) lasso_noisy.fit(noisy_train, y_train) @@ -152,12 +151,12 @@ solver="liblinear", penalty="l1", Cs=np.logspace(-3, 3, 10), - random_state=seeds[3], + random_state=19, tol=1e-3, max_iter=1000, ), n_bootstraps=1, - random_state=seeds[4], + random_state=20, tol_gauss=1e-15, preconfigure_estimator=None, fdr=fdr, diff --git a/examples/plot_model_agnostic_importance.py b/examples/plot_model_agnostic_importance.py index 44fc334d0..efa759cbd 100644 --- a/examples/plot_model_agnostic_importance.py +++ b/examples/plot_model_agnostic_importance.py @@ -39,12 +39,11 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = range(1, 7) ############################################################################# # Generate data where classes are not linearly separable # -------------------------------------------------------------- -X, y = make_circles(n_samples=500, noise=0.1, factor=0.6, random_state=seeds[0]) +X, y = make_circles(n_samples=500, noise=0.1, factor=0.6, random_state=1) fig, ax = plt.subplots() @@ -63,8 +62,8 @@ ############################################################################### # Define a linear and a non-linear estimator # ------------------------------------------ -non_linear_model = SVC(kernel="rbf", random_state=seeds[1]) -linear_model = LogisticRegressionCV(Cs=np.logspace(-3, 3, 5), random_state=seeds[2]) +non_linear_model = SVC(kernel="rbf", random_state=2) +linear_model = LogisticRegressionCV(Cs=np.logspace(-3, 3, 5), random_state=3) ############################################################################### # Compute p-values using d0CRT @@ -73,14 +72,12 @@ # test (:math:`H_0: X_j \perp\!\!\!\perp y | X_{-j}`) for each variable. However, # this test is based on a linear model (LogisticRegression) and fails to reject the null # in the presence of non-linear relationships. -d0crt_linear = D0CRT( - estimator=clone(linear_model), screening=False, random_state=seeds[3] -) +d0crt_linear = D0CRT(estimator=clone(linear_model), screening=False, random_state=4) d0crt_linear.fit_importance(X, y) pval_dcrt_linear = d0crt_linear.pvalues_ d0crt_non_linear = D0CRT( - estimator=clone(non_linear_model), screening=False, random_state=seeds[4] + estimator=clone(non_linear_model), screening=False, random_state=5 ) d0crt_non_linear.fit_importance(X, y) pval_dcrt_non_linear = d0crt_non_linear.pvalues_ @@ -93,7 +90,7 @@ # misspecified model, such as a linear model for this dataset, LOCO fails to reject the null # similarly to d0CRT. However, when using a non-linear model (SVC), LOCO is able to # identify the important variables. 
-cv = KFold(n_splits=5, shuffle=True, random_state=seeds[5]) +cv = KFold(n_splits=5, shuffle=True, random_state=6) importances_linear = [] importances_non_linear = [] diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index dc3b27e68..4563cc700 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -30,7 +30,6 @@ # Define the seeds for the reproducibility of the example rng = np.random.RandomState(0) -seeds = range(1, 11) ############################################################################# # Load the California housing dataset and add a spurious feature @@ -42,9 +41,7 @@ dataset = fetch_california_housing() X_, y_ = dataset.data, dataset.target # only use 2/3 of samples to speed up the example -X, _, y, _ = train_test_split( - X_, y_, test_size=0.6667, random_state=seeds[0], shuffle=True -) +X, _, y, _ = train_test_split(X_, y_, test_size=0.6667, random_state=1, shuffle=True) redundant_coef = rng.choice(np.arange(X.shape[1]), size=(3,), replace=False) X_spurious = X[:, redundant_coef].sum(axis=1) @@ -89,7 +86,7 @@ regressor=make_pipeline( StandardScaler(), MLPRegressor( - random_state=seeds[1], + random_state=2, hidden_layer_sizes=(32, 16, 8), early_stopping=True, learning_rate_init=0.01, @@ -100,7 +97,7 @@ ) -kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) +kf = KFold(n_splits=5, shuffle=True, random_state=3) for train_index, test_index in kf.split(X): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -122,7 +119,7 @@ # testing conditional importance, as it identifies the spurious feature as important. permutation_importances = [] conditional_permutation_importances = [] -kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) +kf = KFold(n_splits=5, shuffle=True, random_state=3) for i, (train_index, test_index) in enumerate(kf.split(X)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -134,7 +131,7 @@ model_c, n_permutations=50, n_jobs=5, - random_state=seeds[3], + random_state=4, ) pfi.fit(X_test, y_test) @@ -149,9 +146,7 @@ pval_threshold = 0.05 # Create a horizontal boxplot of permutation importances fig, ax = plt.subplots() -sns.barplot( - permutation_importances, orient="h", color="tab:blue", capsize=0.2, seed=seeds[4] -) +sns.barplot(permutation_importances, orient="h", color="tab:blue", capsize=0.2, seed=5) ax.set_xlabel("Permutation Importance") # Add asterisks for features with p-values below the threshold for i, pval in enumerate(pval_pfi): @@ -188,7 +183,7 @@ # explained by the other features unchanged. This method is valid for testing conditional # importance. As shown below, it does not identify the spurious feature as important. 
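# The conditional scheme used here can be condensed to a few lines. The sketch
# below assumes a continuous feature and a linear imputation model; it only
# illustrates the idea and is not the library's ConditionalSampler:
import numpy as np
from sklearn.linear_model import LinearRegression


def conditionally_permute(X, j, rng):
    # Predict feature j from the remaining features, then permute only the
    # residuals: the shuffled column keeps its dependence on the others.
    X_minus_j = np.delete(X, j, axis=1)
    x_j = X[:, j]
    x_j_hat = LinearRegression().fit(X_minus_j, x_j).predict(X_minus_j)
    return x_j_hat + rng.permutation(x_j - x_j_hat)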
conditional_importances = [] -kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) +kf = KFold(n_splits=5, shuffle=True, random_state=3) for i, (train_index, test_index) in enumerate(kf.split(X)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -200,9 +195,9 @@ model_c, imputation_model_continuous=RidgeCV( alphas=np.logspace(-3, 3, 5), - cv=KFold(shuffle=True, random_state=seeds[5]), + cv=KFold(shuffle=True, random_state=6), ), - random_state=seeds[6], + random_state=7, n_jobs=5, ) cfi.fit(X_test, y_test) @@ -258,14 +253,14 @@ X_train, X_test = train_test_split( X, test_size=0.3, - random_state=seeds[7], + random_state=8, ) conditional_sampler = ConditionalSampler( model_regression=RidgeCV( - alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=seeds[8]) + alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=9) ), - random_state=seeds[9], + random_state=10, ) From 25f0b3e4c7ab80d182ca295c6c9ce340275bb456 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 28 Aug 2025 10:01:01 +0200 Subject: [PATCH 14/46] Apply suggestions from code review Co-authored-by: bthirion --- examples/plot_dcrt_example.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/plot_dcrt_example.py b/examples/plot_dcrt_example.py index 3e7aad3b3..7efcae30a 100644 --- a/examples/plot_dcrt_example.py +++ b/examples/plot_dcrt_example.py @@ -58,7 +58,8 @@ ## dcrt Lasso ## d0crt_lasso = D0CRT( - estimator=LassoCV(random_state=sim_ind + 11, n_jobs=1), + estimator=LassoCV(random_state=sim_ind, n_jobs=1), +``` Please avoid these weidies screening=False, random_state=sim_ind + 12, ) @@ -76,10 +77,10 @@ ## dcrt Random Forest ## d0crt_random_forest = D0CRT( estimator=RandomForestRegressor( - n_estimators=100, random_state=sim_ind + 13, n_jobs=1 + n_estimators=100, random_state=sim_ind, n_jobs=1 ), screening=False, - random_state=sim_ind + 14, + random_state=sim_ind, ) d0crt_random_forest.fit_importance(X, y) pvals_forest = d0crt_random_forest.pvalues_ From 7782e6fa53f4fa584ebda404dcbead9921ba4daf Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 10:02:27 +0200 Subject: [PATCH 15/46] remove some seed --- examples/plot_dcrt_example.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/plot_dcrt_example.py b/examples/plot_dcrt_example.py index 7efcae30a..c09a5785f 100644 --- a/examples/plot_dcrt_example.py +++ b/examples/plot_dcrt_example.py @@ -59,9 +59,8 @@ ## dcrt Lasso ## d0crt_lasso = D0CRT( estimator=LassoCV(random_state=sim_ind, n_jobs=1), -``` Please avoid these weidies screening=False, - random_state=sim_ind + 12, + random_state=sim_ind, ) d0crt_lasso.fit_importance(X, y) pvals_lasso = d0crt_lasso.pvalues_ From f21edd7aba4873fe1ed6f4d76723f3e3e62d44b9 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 10:14:49 +0200 Subject: [PATCH 16/46] change seed management --- examples/plot_knockoff_aggregation.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index eb08a61f4..eb1c3cc41 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -79,7 +79,6 @@ def single_run( n_bootstraps, seed=None, ): - seeds = np.arange(6) * seed # Generate data X, y, beta_true, noise = multivariate_simulation( n_samples, @@ -97,11 +96,11 @@ def single_run( y, estimator=LassoCV( n_jobs=1, - cv=KFold(n_splits=5, shuffle=True, 
random_state=seeds[0]), - random_state=seeds[1], + cv=KFold(n_splits=5, shuffle=True, random_state=0), + random_state=1, ), n_bootstraps=1, - random_state=seeds[2], + random_state=2, ) mx_selection, _ = model_x_knockoff_pvalue(test_scores, fdr=fdr) fdp_mx, power_mx = fdp_power(mx_selection, non_zero_index) @@ -112,12 +111,12 @@ def single_run( y, estimator=LassoCV( n_jobs=1, - cv=KFold(n_splits=5, shuffle=True, random_state=seeds[3]), - random_state=seeds[4], + cv=KFold(n_splits=5, shuffle=True, random_state=3), + random_state=4, ), n_bootstraps=n_bootstraps, n_jobs=1, - random_state=seeds[5], + random_state=5, ) # Use p-values aggregation [2] From 1dfcb632bd4e976a4dd5aad7fa57bdee75ad0edf Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 10:16:03 +0200 Subject: [PATCH 17/46] remove rng --- examples/plot_model_agnostic_importance.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/plot_model_agnostic_importance.py b/examples/plot_model_agnostic_importance.py index efa759cbd..4e0664e92 100644 --- a/examples/plot_model_agnostic_importance.py +++ b/examples/plot_model_agnostic_importance.py @@ -37,9 +37,6 @@ from hidimstat import LOCO, D0CRT -# Define the seeds for the reproducibility of the example -rng = np.random.RandomState(0) - ############################################################################# # Generate data where classes are not linearly separable # -------------------------------------------------------------- From 54fe2e9e459f65e001618553f8221f4f9cd21ebb Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 10:16:48 +0200 Subject: [PATCH 18/46] fix name --- src/hidimstat/permutation_feature_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 485eecd65..dfddecf01 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -57,10 +57,10 @@ def __init__( def _permutation(self, X, group_id, seed): """Create the permuted data for the j-th group of covariates""" - rgn = np.random.RandomState(seed) + rng = np.random.RandomState(seed) X_perm_j = np.array( [ - rgn.permutation(X[:, self._groups_ids[group_id]].copy()) + rng.permutation(X[:, self._groups_ids[group_id]].copy()) for _ in range(self.n_permutations) ] ) From 072d3197dc45e7ce9905ba044829b570271b4d3f Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 11:42:34 +0200 Subject: [PATCH 19/46] change seed --- examples/plot_2D_simulation_example.py | 8 ++++---- examples/plot_knockoff_aggregation.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index 022cc4620..e26486a31 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -172,7 +172,7 @@ def plot(maps, titles): # generating the data X_init, y, beta, epsilon = multivariate_simulation_spatial( - n_samples, shape, roi_size, signal_noise_ratio, smooth_X, seed=70 + n_samples, shape, roi_size, signal_noise_ratio, smooth_X, seed=0 ) ############################################################################## @@ -243,7 +243,7 @@ def plot(maps, titles): # compute desparsified lasso beta_hat, sigma_hat, precision_diagonal = desparsified_lasso( - X_init, y, n_jobs=n_jobs, seed=71 + X_init, y, n_jobs=n_jobs, seed=0 ) pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = ( 
desparsified_lasso_pvalue(X_init.shape[0], beta_hat, sigma_hat, precision_diagonal) @@ -272,7 +272,7 @@ def plot(maps, titles): # clustered desparsified lasso (CluDL) ward_, beta_hat, theta_hat, omega_diag = clustered_inference( - X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=72 + X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=0 ) beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = ( clustered_inference_pvalue(n_samples, False, ward_, beta_hat, theta_hat, omega_diag) @@ -297,7 +297,7 @@ def plot(maps, titles): # ensemble of clustered desparsified lasso (EnCluDL) list_ward, list_beta_hat, list_theta_hat, list_omega_diag = ( ensemble_clustered_inference( - X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=73 + X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=0 ) ) beta_hat, selected_ecdl = ensemble_clustered_inference_pvalue( diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index eb1c3cc41..930d4f805 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -187,7 +187,7 @@ def effect_number_samples(n_samples): n_bootstraps, seed=seed, ) - for seed in range(10, 10 + runs) + for seed in range(runs) ) fdps_mx = [] From 469857fc85a2dc8000707240a1476d59e80e304b Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 3 Sep 2025 15:08:39 +0200 Subject: [PATCH 20/46] Apply suggestions from code review Co-authored-by: Joseph Paillard --- src/hidimstat/permutation_feature_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index dfddecf01..5e8e16028 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -53,11 +53,11 @@ def __init__( n_jobs=n_jobs, n_permutations=n_permutations, ) - self.random_state = check_random_state(random_state) + self.random_state = random_state def _permutation(self, X, group_id, seed): """Create the permuted data for the j-th group of covariates""" - rng = np.random.RandomState(seed) + rng = check_random_state(random_state) X_perm_j = np.array( [ rng.permutation(X[:, self._groups_ids[group_id]].copy()) From 1dd12cc6b51fca86fca7b7b14d7d3624a05680a8 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 3 Sep 2025 15:44:20 +0200 Subject: [PATCH 21/46] fix bug for random_generator in perturbation cases --- src/hidimstat/base_perturbation.py | 12 ++++++++---- src/hidimstat/permutation_feature_importance.py | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 7b5333f59..5de696183 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -3,6 +3,7 @@ from joblib import Parallel, delayed from sklearn.base import check_is_fitted from sklearn.metrics import root_mean_squared_error +from sklearn.utils import check_random_state import warnings from hidimstat._utils.utils import _check_vim_predict_method @@ -93,7 +94,7 @@ def fit(self, X, y=None, groups=None): else: raise ValueError("groups needs to be a dictionary") - def predict(self, X): + def predict(self, X, random_generator): """ Compute the predictions after perturbation of the data for each group of variables.
@@ -112,10 +113,10 @@ def predict(self, X): X_ = np.asarray(X) # Parallelize the computation of the importance scores for each group - if self.random_state is None: + if random_generator is None: list_seed = [None for i in range(self.n_groups)] else: - list_seed = self.random_state.randint(np.iinfo(np.int32).max) + np.arange( + list_seed = random_generator.randint(np.iinfo(np.int32).max) + np.arange( self.n_groups ) out_list = Parallel(n_jobs=self.n_jobs)( @@ -147,6 +148,9 @@ def importance(self, X, y): - 'importance': the importance scores for each group. """ self._check_fit(X) + random_generator = ( + None if self.random_state is None else check_random_state(self.random_state) + ) out_dict = dict() @@ -154,7 +158,7 @@ def importance(self, X, y): loss_reference = self.loss(y, y_pred) out_dict["loss_reference"] = loss_reference - y_pred = self.predict(X) + y_pred = self.predict(X, random_generator=random_generator) out_dict["loss"] = dict() for j, y_pred_j in enumerate(y_pred): list_loss = [] diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 5e8e16028..f24cf2a4a 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -57,7 +57,7 @@ def __init__( def _permutation(self, X, group_id, seed): """Create the permuted data for the j-th group of covariates""" - rng = check_random_state(random_state) + rng = check_random_state(seed) X_perm_j = np.array( [ rng.permutation(X[:, self._groups_ids[group_id]].copy()) From d6b8d1eb20929fdd269d73ad99363cc0f7f2771b Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 3 Sep 2025 15:51:15 +0200 Subject: [PATCH 22/46] fix test --- test/test_conditional_feature_importance.py | 2 +- test/test_leave_one_covariate_out.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index 097b5dd8f..3465a78a1 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -377,7 +377,7 @@ def test_unfitted_predict(self, data_generator): ) with pytest.raises(ValueError, match="The class is not fitted."): - cfi.predict(X) + cfi.predict(X, None) def test_unfitted_importance(self, data_generator): """Test importance method with unfitted model""" diff --git a/test/test_leave_one_covariate_out.py b/test/test_leave_one_covariate_out.py index d8fd2a763..c2ba266bd 100644 --- a/test/test_leave_one_covariate_out.py +++ b/test/test_leave_one_covariate_out.py @@ -119,7 +119,7 @@ def test_raises_value_error(): estimator=fitted_model, method="predict", ) - loco.predict(X) + loco.predict(X, None) with pytest.raises(ValueError, match="The class is not fitted."): fitted_model = LinearRegression().fit(X, y) loco = LOCO( From 6718931858e16708fbd724f48fa5cc4ff4f7fe2d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 3 Sep 2025 16:41:33 +0200 Subject: [PATCH 23/46] fix randomization --- src/hidimstat/conditional_feature_importance.py | 15 ++++++++++++--- test/test_conditional_feature_importance.py | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 499173a3b..2933beb01 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -110,6 +110,14 @@ def fit(self, X, y=None, groups=None, var_type="auto"): else: self.var_type = var_type + # 
base on the recomendation ofnumpy for paralellization of random generator + # see https://numpy.org/doc/stable/reference/random/parallel.html + seedsequence = np.random.SeedSequence( + self.random_state + if isinstance(self.random_state, int) + else self.random_state.randint(np.iinfo(np.int32).max) + ) + self._list_imputation_models = [ ConditionalSampler( data_type=self.var_type[group_id], @@ -123,13 +131,14 @@ def fit(self, X, y=None, groups=None, var_type="auto"): if self.imputation_model_categorical is None else clone(self.imputation_model_categorical) ), - random_state=seed, + random_state=np.random.RandomState( + np.random.default_rng(seed).bit_generator + ), # require a RandomState due to scikitlearn check categorical_max_cardinality=self.categorical_max_cardinality, ) for group_id, seed in zip( range(self.n_groups), - self.random_state.randint(np.iinfo(np.int32).max) - + np.arange(self.n_groups), + seedsequence.spawn(self.n_groups), ) ] diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index 3465a78a1..ea710cc03 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -133,7 +133,7 @@ def test_linear_data_partial(data_generator, n_permutation, cfi_seed): if rank > min_rank: min_rank = rank # accept missing ranking of 5 elements - assert min_rank < 15 + assert min_rank < 20 @pytest.mark.parametrize( From 729b28a95615e6c239bd4575710ba5d7f97976eb Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 4 Sep 2025 15:51:07 +0200 Subject: [PATCH 24/46] update the way to set seeds --- .../conditional_feature_importance.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 2933beb01..6a0d761d8 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -110,13 +110,11 @@ def fit(self, X, y=None, groups=None, var_type="auto"): else: self.var_type = var_type - # base on the recomendation ofnumpy for paralellization of random generator + # base on the recomendation of numpy for paralellization of random generator # see https://numpy.org/doc/stable/reference/random/parallel.html - seedsequence = np.random.SeedSequence( - self.random_state - if isinstance(self.random_state, int) - else self.random_state.randint(np.iinfo(np.int32).max) - ) + streams = np.random.SeedSequence( + check_random_state(self.random_state).randint(np.iinfo(np.int32).max) + ).spawn(self.n_groups) self._list_imputation_models = [ ConditionalSampler( @@ -131,15 +129,13 @@ def fit(self, X, y=None, groups=None, var_type="auto"): if self.imputation_model_categorical is None else clone(self.imputation_model_categorical) ), + # require a RandomState due to scikitlearn check random_state=np.random.RandomState( - np.random.default_rng(seed).bit_generator - ), # require a RandomState due to scikitlearn check + np.random.default_rng(streams[group_id]).bit_generator + ), categorical_max_cardinality=self.categorical_max_cardinality, ) - for group_id, seed in zip( - range(self.n_groups), - seedsequence.spawn(self.n_groups), - ) + for group_id in range(self.n_groups) ] # Parallelize the fitting of the covariate estimators From 02a4c64282b4f9965aaaa55b1d44e96497535d57 Mon Sep 17 00:00:00 2001 From: Joseph Paillard Date: Mon, 8 Sep 2025 21:25:00 +0200 Subject: [PATCH 25/46] Update src/hidimstat/conditional_feature_importance.py 
Co-authored-by: lionel kusch --- src/hidimstat/conditional_feature_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 6a0d761d8..800049efc 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -80,7 +80,7 @@ def __init__( self.categorical_max_cardinality = categorical_max_cardinality self.imputation_model_categorical = imputation_model_categorical self.imputation_model_continuous = imputation_model_continuous - self.random_state = check_random_state(random_state) + self.random_state = random_state def fit(self, X, y=None, groups=None, var_type="auto"): """Fit the imputation models. From d6314cafc2eab67a8fe96f7c87f5c95965466f7e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 12:50:58 +0200 Subject: [PATCH 26/46] add a new way to check random --- src/hidimstat/_utils/utils.py | 47 +++++++++++++++++++ src/hidimstat/base_perturbation.py | 2 +- src/hidimstat/conditional_feature_importance.py | 15 ++---- src/hidimstat/conditional_sampling.py | 10 ++-- src/hidimstat/knockoffs.py | 2 +- src/hidimstat/noise_std.py | 1 + src/hidimstat/permutation_feature_importance.py | 2 +- test/test_conditional_feature_importance.py | 45 ++++++++++++++++++ 8 files changed, 107 insertions(+), 17 deletions(-) diff --git a/src/hidimstat/_utils/utils.py b/src/hidimstat/_utils/utils.py index 66166f444..2502bb47f 100644 --- a/src/hidimstat/_utils/utils.py +++ b/src/hidimstat/_utils/utils.py @@ -1,3 +1,7 @@ +import numbers +import numpy as np + + def _check_vim_predict_method(method): """ Validates that the method is a valid scikit-learn prediction method for variable importance measures. @@ -25,3 +29,46 @@ def _check_vim_predict_method(method): "The method {} is not a valid method " "for variable importance measure prediction".format(method) ) + + +def check_random_state(seed): + """ + Turn seed into a np.random.RandomState instance. + This is based on the implementation of check_random_state of scikit-learn: + https://github.com/scikit-learn/scikit-learn/blob/25dee604bae18205b01548348388baf7a1cdfe0e/sklearn/utils/validation.py#L1488 + + Parameters + ---------- + seed : None, int or instance of RandomState + If seed is None, return the RandomState singleton used by np.random. + If seed is an int, return a new RandomState instance seeded with seed. + If seed is already a RandomState instance, return it. + Otherwise raise ValueError. + + Returns + ------- + :class:`numpy:numpy.random.RandomState` + The random state object based on `seed` parameter. + + Examples + -------- + >>> from sklearn.utils.validation import check_random_state + >>> check_random_state(42) + RandomState(MT19937) at 0x...
+ """ + if seed is None or seed is np.random: + return np.random.mtrand._rand + if isinstance(seed, numbers.Integral): + return np.random.RandomState(seed) + if ( + (isinstance(seed, tuple) or isinstance(seed, list)) + and len(seed) == 2 + and isinstance(seed[0], numbers.Integral) + and isinstance(seed[1], numbers.Integral) + ): + return np.random.RandomState(np.random.default_rng(seed).bit_generator) + if isinstance(seed, np.random.RandomState): + return seed + raise ValueError( + "%r cannot be used to seed a numpy.random.RandomState instance" % seed + ) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 5de696183..717617e7e 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -3,12 +3,12 @@ from joblib import Parallel, delayed from sklearn.base import check_is_fitted from sklearn.metrics import root_mean_squared_error -from sklearn.utils import check_random_state import warnings from hidimstat._utils.utils import _check_vim_predict_method from hidimstat._utils.exception import InternalError from hidimstat.base_variable_importance import BaseVariableImportance +from hidimstat._utils.utils import check_random_state class BasePerturbation(BaseVariableImportance): diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 800049efc..61d6f4b62 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -2,10 +2,10 @@ from joblib import Parallel, delayed from sklearn.base import check_is_fitted, clone, BaseEstimator from sklearn.metrics import root_mean_squared_error -from sklearn.utils.validation import check_random_state from hidimstat.base_perturbation import BasePerturbation from hidimstat.conditional_sampling import ConditionalSampler +from hidimstat._utils.utils import check_random_state class CFI(BasePerturbation): @@ -109,12 +109,9 @@ def fit(self, X, y=None, groups=None, var_type="auto"): self.var_type = [var_type for _ in range(self.n_groups)] else: self.var_type = var_type - - # base on the recomendation of numpy for paralellization of random generator - # see https://numpy.org/doc/stable/reference/random/parallel.html - streams = np.random.SeedSequence( - check_random_state(self.random_state).randint(np.iinfo(np.int32).max) - ).spawn(self.n_groups) + seed_root = check_random_state(self.random_state).randint( + np.iinfo(np.int32).max, size=1 + )[0] self._list_imputation_models = [ ConditionalSampler( @@ -130,9 +127,7 @@ def fit(self, X, y=None, groups=None, var_type="auto"): else clone(self.imputation_model_categorical) ), # require a RandomState due to scikitlearn check - random_state=np.random.RandomState( - np.random.default_rng(streams[group_id]).bit_generator - ), + random_state=[group_id, seed_root], categorical_max_cardinality=self.categorical_max_cardinality, ) for group_id in range(self.n_groups) diff --git a/src/hidimstat/conditional_sampling.py b/src/hidimstat/conditional_sampling.py index f8920581a..cabc1b099 100644 --- a/src/hidimstat/conditional_sampling.py +++ b/src/hidimstat/conditional_sampling.py @@ -1,7 +1,7 @@ import numpy as np from sklearn.base import MultiOutputMixin, check_is_fitted, BaseEstimator from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor -from sklearn.utils.validation import check_random_state +from hidimstat._utils.utils import check_random_state def _check_data_type( @@ -79,7 +79,7 @@ def __init__( self.data_type = data_type 
self.model_regression = model_regression self.model_categorical = model_categorical - self.rng = check_random_state(random_state) + self.random_state = random_state self.categorical_max_cardinality = categorical_max_cardinality def fit(self, X: np.ndarray, y: np.ndarray): @@ -137,6 +137,8 @@ def sample(self, X: np.ndarray, y: np.ndarray, n_samples: int = 1) -> np.ndarray y_conditional : ndarray The samples from the conditional distribution. """ + rng = check_random_state(self.random_state) + print(rng.rand(2)) check_is_fitted(self.model) @@ -149,7 +151,7 @@ def sample(self, X: np.ndarray, y: np.ndarray, n_samples: int = 1) -> np.ndarray y_hat = self.model.predict(X).reshape(y.shape) residual = y - y_hat residual_permuted = np.stack( - [self.rng.permutation(residual) for _ in range(n_samples)], + [rng.permutation(residual) for _ in range(n_samples)], axis=0, ) return y_hat[np.newaxis, ...] + residual_permuted @@ -174,7 +176,7 @@ def sample(self, X: np.ndarray, y: np.ndarray, n_samples: int = 1) -> np.ndarray y_pred_cond.append( np.stack( [ - self.rng.choice(classes, p=p, size=n_samples) + rng.choice(classes, p=p, size=n_samples) for p in y_pred_proba[index] ], axis=1, diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 37918117e..2b91d2109 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -4,7 +4,6 @@ from sklearn.linear_model import LassoCV from sklearn.model_selection import KFold from sklearn.preprocessing import StandardScaler -from sklearn.utils import check_random_state from sklearn.utils.validation import check_memory from sklearn.base import clone @@ -14,6 +13,7 @@ ) from hidimstat.statistical_tools.multiple_testing import fdr_threshold from hidimstat.statistical_tools.aggregation import quantile_aggregation +from hidimstat._utils.utils import check_random_state def preconfigure_estimator_LassoCV(estimator, X, X_tilde, y, n_alphas=20): diff --git a/src/hidimstat/noise_std.py b/src/hidimstat/noise_std.py index 3696704e6..32445e19c 100644 --- a/src/hidimstat/noise_std.py +++ b/src/hidimstat/noise_std.py @@ -104,6 +104,7 @@ def reid( cv=cv, tol=tolerance, max_iter=max_iterance, + random_state=seed + 1, n_jobs=n_jobs, ) clf_cv.fit(X_, y) diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index f24cf2a4a..c66378796 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -1,8 +1,8 @@ import numpy as np from sklearn.metrics import root_mean_squared_error -from sklearn.utils import check_random_state from hidimstat.base_perturbation import BasePerturbation +from hidimstat._utils.utils import check_random_state class PFI(BasePerturbation): diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index ea710cc03..0338fd299 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -10,6 +10,7 @@ from hidimstat import CFI, BasePerturbation from hidimstat._utils.exception import InternalError +from hidimstat._utils.scenario import multivariate_simulation def run_cfi(X, y, n_permutation, seed): @@ -568,3 +569,47 @@ def test_groups_warning(self, data_generator): " number of features for which importance is computed: 4", ): cfi.importance(X, y) + + +@pytest.fixture(scope="module") +def cfi_test_data(): + """ + Fixture to generate test data and a fitted LinearRegression model for CFI + reproducibility tests. 
+ """ + X, y, _, _ = multivariate_simulation( + n_samples=100, + n_features=5, + support_size=2, + rho=0, + value=1, + signal_noise_ratio=4, + rho_serial=0, + shuffle=False, + seed=0, + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + model = LinearRegression() + model.fit(X_train, y_train) + return X_train, X_test, y_train, y_test, model + + +def test_cfi_multiple_calls_are_repeatibility(cfi_test_data): + """ + Test that multiple calls of .importance() when CFI is seeded provide deterministic + results. + """ + X_train, X_test, y_train, y_test, model = cfi_test_data + cfi = CFI( + estimator=model, + imputation_model_continuous=LinearRegression(), + n_permutations=20, + method="predict", + random_state=0, + n_jobs=1, + ) + cfi.fit(X_train, groups=None, var_type="auto") + vim = cfi.importance(X_test, y_test)["importance"] + print("reproduction") + vim_reproducible = cfi.importance(X_test, y_test)["importance"] + assert np.array_equal(vim, vim_reproducible) From f9df301e469ca4e1b4c1b0626254b911393a85e8 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Tue, 9 Sep 2025 13:34:37 +0200 Subject: [PATCH 27/46] Apply suggestions from code review Co-authored-by: Joseph Paillard --- src/hidimstat/base_perturbation.py | 5 +---- src/hidimstat/conditional_sampling.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 717617e7e..27657bc92 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -94,7 +94,7 @@ def fit(self, X, y=None, groups=None): else: raise ValueError("groups needs to be a dictionnary") - def predict(self, X, random_generator): + def predict(self, X): """ Compute the predictions after perturbation of the data for each group of variables. @@ -148,9 +148,6 @@ def importance(self, X, y): - 'importance': the importance scores for each group. """ self._check_fit(X) - random_generator = ( - None if self.random_state is None else check_random_state(self.random_state) - ) out_dict = dict() diff --git a/src/hidimstat/conditional_sampling.py b/src/hidimstat/conditional_sampling.py index cabc1b099..1e0791fbf 100644 --- a/src/hidimstat/conditional_sampling.py +++ b/src/hidimstat/conditional_sampling.py @@ -138,7 +138,6 @@ def sample(self, X: np.ndarray, y: np.ndarray, n_samples: int = 1) -> np.ndarray The samples from the conditional distribution. 
""" rng = check_random_state(self.random_state) - print(rng.rand(2)) check_is_fitted(self.model) From 347a6afc1f559c98650c6cdfe4d2aafdfad1f037 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 13:47:57 +0200 Subject: [PATCH 28/46] change predict signature --- src/hidimstat/base_perturbation.py | 27 +++++++++---------- .../conditional_feature_importance.py | 9 ++++--- .../permutation_feature_importance.py | 9 ++++--- test/test_base_perturbation.py | 2 +- test/test_conditional_feature_importance.py | 2 +- test/test_leave_one_covariate_out.py | 2 +- 6 files changed, 27 insertions(+), 24 deletions(-) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 27657bc92..af725f33b 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -113,17 +113,12 @@ def predict(self, X): X_ = np.asarray(X) # Parallelize the computation of the importance scores for each group - if random_generator is None: - list_seed = [None for i in range(self.n_groups)] - else: - list_seed = random_generator.randint(np.iinfo(np.int32).max) + np.arange( - self.n_groups - ) + seed_root = check_random_state(self.random_state).randint( + np.iinfo(np.int32).max, size=1 + )[0] out_list = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_predict_one_group)(X_, group_id, group_key, seed) - for group_id, (group_key, seed) in enumerate( - zip(self.groups.keys(), list_seed) - ) + delayed(self._joblib_predict_one_group)(X_, group_id, group_key, seed_root) + for group_id, group_key in enumerate(self.groups.keys()) ) return np.stack(out_list, axis=0) @@ -155,7 +150,7 @@ def importance(self, X, y): loss_reference = self.loss(y, y_pred) out_dict["loss_reference"] = loss_reference - y_pred = self.predict(X, random_generator=random_generator) + y_pred = self.predict(X) out_dict["loss"] = dict() for j, y_pred_j in enumerate(y_pred): list_loss = [] @@ -241,7 +236,7 @@ def _check_fit(self, X): f"{number_unique_feature_in_groups}" ) - def _joblib_predict_one_group(self, X, group_id, group_key, seed): + def _joblib_predict_one_group(self, X, group_id, group_key, seed_root): """ Compute the predictions after perturbation of the data for a given group of variables. This function is parallelized. @@ -254,7 +249,7 @@ def _joblib_predict_one_group(self, X, group_id, group_key, seed): The index of the group of variables. group_key: str, int The key of the group of variables. (parameter use for debugging) - seed: int, optional + seed_root: int, optional Random seed for reproducibility. 
""" group_ids = self._groups_ids[group_id] @@ -263,7 +258,9 @@ def _joblib_predict_one_group(self, X, group_id, group_key, seed): # where the j-th group of covariates is permuted X_perm = np.empty((self.n_permutations, X.shape[0], X.shape[1])) X_perm[:, :, non_group_ids] = np.delete(X, group_ids, axis=1) - X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id, seed=seed) + X_perm[:, :, group_ids] = self._permutation( + X, group_id=group_id, seed_root=seed_root + ) # Reshape X_perm to allow for batch prediction X_perm_batch = X_perm.reshape(-1, X.shape[1]) y_pred_perm = getattr(self.estimator, self.method)(X_perm_batch) @@ -277,6 +274,6 @@ def _joblib_predict_one_group(self, X, group_id, group_key, seed): ) return y_pred_perm - def _permutation(self, X, group_id, seed): + def _permutation(self, X, group_id, seed_root): """Method for creating the permuted data for the j-th group of covariates.""" raise NotImplementedError diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 88a63e9b7..be9f71e13 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -183,9 +183,12 @@ def _check_fit(self, X): for m in self._list_imputation_models: check_is_fitted(m.model) - def _permutation(self, X, group_id, seed): - """Sample from the conditional distribution using a permutation of the - residuals.""" + def _permutation(self, X, group_id, seed_root): + """ + Sample from the conditional distribution using a permutation of the + residuals. + Warning: This method is run in parallel. + """ X_j = X[:, self._groups_ids[group_id]].copy() X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1) return self._list_imputation_models[group_id].sample( diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 1b8c0ef30..7f54a2abe 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -55,9 +55,12 @@ def __init__( ) self.random_state = random_state - def _permutation(self, X, group_id, seed): - """Create the permuted data for the j-th group of covariates""" - rng = check_random_state(seed) + def _permutation(self, X, group_id, seed_root): + """ + Create the permuted data for the j-th group of covariates + Warning: This method is run in parallel. 
+ """ + rng = check_random_state([group_id, seed_root]) X_perm_j = np.array( [ rng.permutation(X[:, self._groups_ids[group_id]].copy()) diff --git a/test/test_base_perturbation.py b/test/test_base_perturbation.py index 835c3f12e..fb2c408fb 100644 --- a/test/test_base_perturbation.py +++ b/test/test_base_perturbation.py @@ -11,4 +11,4 @@ def test_no_implemented_methods(): estimator.fit(X[:, 0], X[:, 1]) basic_class = BasePerturbation(estimator=estimator) with pytest.raises(NotImplementedError): - basic_class._permutation(X, group_id=None, seed=0) + basic_class._permutation(X, group_id=None, seed_root=0) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index 0338fd299..da40596e0 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -378,7 +378,7 @@ def test_unfitted_predict(self, data_generator): ) with pytest.raises(ValueError, match="The class is not fitted."): - cfi.predict(X, None) + cfi.predict(X) def test_unfitted_importance(self, data_generator): """Test importance method with unfitted model""" diff --git a/test/test_leave_one_covariate_out.py b/test/test_leave_one_covariate_out.py index c2ba266bd..d8fd2a763 100644 --- a/test/test_leave_one_covariate_out.py +++ b/test/test_leave_one_covariate_out.py @@ -119,7 +119,7 @@ def test_raises_value_error(): estimator=fitted_model, method="predict", ) - loco.predict(X, None) + loco.predict(X) with pytest.raises(ValueError, match="The class is not fitted."): fitted_model = LinearRegression().fit(X, y) loco = LOCO( From 05ca2b8c82593f712a6711c8a69efe5d9ba0364a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 13:49:28 +0200 Subject: [PATCH 29/46] update docstring of check_random_state --- src/hidimstat/_utils/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hidimstat/_utils/utils.py b/src/hidimstat/_utils/utils.py index 0a7f65e25..5d6b04b83 100644 --- a/src/hidimstat/_utils/utils.py +++ b/src/hidimstat/_utils/utils.py @@ -39,9 +39,10 @@ def check_random_state(seed): Parameters ---------- - seed : None, int or instance of RandomState + seed : None, int, tuple/list of 2 ints, or instance of RandomState If seed is None, return the RandomState singleton used by np.random. If seed is an int, return a new RandomState instance seeded with seed. + If seed is a tuple/list of 2 integers, creates a new seeded RandomState. If seed is already a RandomState instance, return it. Otherwise raise ValueError. From b13f7b26027b32d32f1defbcbf4f08c230ec9365 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 15:40:58 +0200 Subject: [PATCH 30/46] improvement gestion of seed --- src/hidimstat/_utils/utils.py | 99 +++++++++++++++++++ .../conditional_feature_importance.py | 9 +- test/test_conditional_feature_importance.py | 98 +++++++++++++++++- 3 files changed, 200 insertions(+), 6 deletions(-) diff --git a/src/hidimstat/_utils/utils.py b/src/hidimstat/_utils/utils.py index 5d6b04b83..b56e0b1a7 100644 --- a/src/hidimstat/_utils/utils.py +++ b/src/hidimstat/_utils/utils.py @@ -73,3 +73,102 @@ def check_random_state(seed): raise ValueError( "%r cannot be used to seed a numpy.random.RandomState instance" % seed ) + + +class SeedGenerator: + """Generate seeds for parallel random number generation based on numpy guidelines. 
+
+    This class implements the recommended approach for parallel random number generation
+    from numpy's documentation: https://numpy.org/doc/stable/reference/random/parallel.html
+
+    Parameters
+    ----------
+    seed_root : int
+        The root seed value to generate worker-specific seeds.
+    """
+
+    def __init__(self, seed_root):
+        self.seed_root = seed_root
+
+    def get_seed(self, worker_id):
+        """Generate a seed pair for a specific worker.
+
+        Parameters
+        ----------
+        worker_id : int
+            Unique identifier for the worker/job.
+
+        Returns
+        -------
+        list
+            A list containing [worker_id, seed_root] for random number generation.
+        """
+        return [worker_id, self.seed_root]
+
+
+class NoneGenerator:
+    """Generator that always returns None as seed.
+
+    This is used when no specific seed is required.
+    """
+
+    def get_seed(self, worker_id):
+        """Return None regardless of worker_id.
+
+        Parameters
+        ----------
+        worker_id : int
+            Unique identifier for the worker/job (unused).
+
+        Returns
+        -------
+        None
+            Always returns None.
+        """
+        return None
+
+
+def get_seed_generator(random_state):
+    """
+    Create the appropriate seed generator based on input type.
+    WARNING: this function must keep the same branches as check_random_state.
+
+    Parameters
+    ----------
+    random_state : None, int, tuple/list of 2 ints, or numpy.random.RandomState
+        The random state to use for seed generation.
+
+    Returns
+    -------
+    SeedGenerator or NoneGenerator
+        An appropriate generator for creating seeds.
+
+    Raises
+    ------
+    ValueError
+        If random_state is not of a supported type.
+    """
+    if random_state is None:
+        return NoneGenerator()
+    elif isinstance(random_state, numbers.Integral):
+        return SeedGenerator(seed_root=random_state)
+    elif (
+        (isinstance(random_state, tuple) or isinstance(random_state, list))
+        and len(random_state) == 2
+        and isinstance(random_state[0], numbers.Integral)
+        and isinstance(random_state[1], numbers.Integral)
+    ):
+        seed_root = check_random_state(random_state).randint(
+            np.iinfo(np.int32).max, size=1
+        )[0]
+        return SeedGenerator(seed_root=seed_root)
+    elif isinstance(random_state, np.random.RandomState):
+        seed_root = check_random_state(random_state).randint(
+            np.iinfo(np.int32).max, size=1
+        )[0]
+        return SeedGenerator(seed_root=seed_root)
+    else:
+        raise ValueError(
+            "%r cannot be used to seed a numpy.random.RandomState instance"
+            % random_state
+        )
diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py
index be9f71e13..78a0a6c51 100644
--- a/src/hidimstat/conditional_feature_importance.py
+++ b/src/hidimstat/conditional_feature_importance.py
@@ -5,7 +5,7 @@
 
 from hidimstat.base_perturbation import BasePerturbation
 from hidimstat.conditional_sampling import ConditionalSampler
-from hidimstat._utils.utils import check_random_state
+from hidimstat._utils.utils import get_seed_generator
 
 
 class CFI(BasePerturbation):
@@ -109,9 +109,8 @@ def fit(self, X, y=None, groups=None, var_type="auto"):
             self.var_type = [var_type for _ in range(self.n_groups)]
         else:
             self.var_type = var_type
-        seed_root = check_random_state(self.random_state).randint(
-            np.iinfo(np.int32).max, size=1
-        )[0]
+
+        seed_generator = get_seed_generator(self.random_state)
 
         self._list_imputation_models = [
             ConditionalSampler(
@@ -127,7 +126,7 @@ def fit(self, X, y=None, groups=None, var_type="auto"):
                 else clone(self.imputation_model_categorical)
             ),
             # require a RandomState due to scikit-learn check
-            random_state=[group_id, seed_root],
+            random_state=seed_generator.get_seed(group_id),
             categorical_max_cardinality=self.categorical_max_cardinality,
             )
             for group_id in range(self.n_groups)
diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py
index da40596e0..fa689b84a 100644
--- a/test/test_conditional_feature_importance.py
+++ b/test/test_conditional_feature_importance.py
@@ -329,7 +329,7 @@ def test_categorical(
             estimator=fitted_model,
             imputation_model_continuous=LinearRegression(),
             imputation_model_categorical=LogisticRegression(),
-            random_state=seed + 1,
+            random_state=0,
         )
 
         var_type = ["continuous", "continuous", "categorical"]
@@ -613,3 +613,99 @@ def test_cfi_multiple_calls_are_repeatibility(cfi_test_data):
     print("reproduction")
     vim_reproducible = cfi.importance(X_test, y_test)["importance"]
     assert np.array_equal(vim, vim_reproducible)
+
+
+def test_cfi_multiple_calls_are_repeatibility_None(cfi_test_data):
+    """
+    Test that repeated calls of .importance() are deterministic when CFI is not
+    seeded, as long as it is not refitted.
+    """
+    X_train, X_test, y_train, y_test, model = cfi_test_data
+    cfi = CFI(
+        estimator=model,
+        imputation_model_continuous=LinearRegression(),
+        n_permutations=20,
+        method="predict",
+        random_state=None,
+        n_jobs=1,
+    )
+    cfi.fit(X_train, groups=None, var_type="auto")
+    vim = cfi.importance(X_test, y_test)["importance"]
+    # rerun importance
+    vim_reproducible = cfi.importance(X_test, y_test)["importance"]
+    assert np.array_equal(vim, vim_reproducible)
+
+
+def test_cfi_multiple_calls_are_not_repeatibility_None(cfi_test_data):
+    """
+    Test that refitting an unseeded CFI leads to different .importance()
+    results.
+    """
+    X_train, X_test, y_train, y_test, model = cfi_test_data
+    cfi = CFI(
+        estimator=model,
+        imputation_model_continuous=LinearRegression(),
+        n_permutations=20,
+        method="predict",
+        random_state=None,
+        n_jobs=1,
+    )
+    cfi.fit(X_train, groups=None, var_type="auto")
+    vim = cfi.importance(X_test, y_test)["importance"]
+
+    # refit
+    cfi.fit(X_train, groups=None, var_type="auto")
+    vim_reproducible = cfi.importance(X_test, y_test)["importance"]
+    np.testing.assert_raises(
+        AssertionError, np.testing.assert_array_equal, vim, vim_reproducible
+    )
+
+
+def test_cfi_multiple_calls_are_repeatibility_rng(cfi_test_data):
+    """
+    Test that repeated calls of .importance() are not deterministic when CFI is
+    seeded with a RandomState instance, whose state advances between calls.
+    """
+    X_train, X_test, y_train, y_test, model = cfi_test_data
+    rng = np.random.RandomState(0)
+    cfi = CFI(
+        estimator=model,
+        imputation_model_continuous=LinearRegression(),
+        n_permutations=20,
+        method="predict",
+        random_state=rng,
+        n_jobs=1,
+    )
+    cfi.fit(X_train, groups=None, var_type="auto")
+    vim = cfi.importance(X_test, y_test)["importance"]
+    # rerun importance
+    vim_reproducible = cfi.importance(X_test, y_test)["importance"]
+    np.testing.assert_raises(
+        AssertionError, np.testing.assert_array_equal, vim, vim_reproducible
+    )
+
+
+def test_cfi_multiple_calls_are_not_repeatibility_rng(cfi_test_data):
+    """
+    Test that refitting a CFI seeded with a RandomState instance leads to
+    different .importance() results.
+ """ + X_train, X_test, y_train, y_test, model = cfi_test_data + rng = np.random.RandomState(0) + cfi = CFI( + estimator=model, + imputation_model_continuous=LinearRegression(), + n_permutations=20, + method="predict", + random_state=rng, + n_jobs=1, + ) + cfi.fit(X_train, groups=None, var_type="auto") + vim = cfi.importance(X_test, y_test)["importance"] + + # refit + cfi.fit(X_train, groups=None, var_type="auto") + vim_reproducible = cfi.importance(X_test, y_test)["importance"] + np.testing.assert_raises( + AssertionError, np.testing.assert_array_equal, vim, vim_reproducible + ) From fad51275973c084bc5afb94db1b5d5c907bf242a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 12:20:43 +0200 Subject: [PATCH 31/46] remove management of seed --- src/hidimstat/_utils/utils.py | 147 ------------------ .../conditional_feature_importance.py | 8 +- test/test_conditional_feature_importance.py | 140 ----------------- 3 files changed, 3 insertions(+), 292 deletions(-) diff --git a/src/hidimstat/_utils/utils.py b/src/hidimstat/_utils/utils.py index b56e0b1a7..df9585241 100644 --- a/src/hidimstat/_utils/utils.py +++ b/src/hidimstat/_utils/utils.py @@ -1,7 +1,3 @@ -import numbers -import numpy as np - - def _check_vim_predict_method(method): """ Validates that the method is a valid scikit-learn prediction method for variable importance measures. @@ -29,146 +25,3 @@ def _check_vim_predict_method(method): "The method {} is not a valid method " "for variable importance measure prediction".format(method) ) - - -def check_random_state(seed): - """ - Turn seed into a np.random.RandomState instance. - This is based on the implementation of check_random_state of sciktilearn: - https://github.com/scikit-learn/scikit-learn/blob/25dee604bae18205b01548348388baf7a1cdfe0e/sklearn/utils/validation.py#L1488 - - Parameters - ---------- - seed : None, int, tuple/list of 2 ints, or instance of RandomState - If seed is None, return the RandomState singleton used by np.random. - If seed is an int, return a new RandomState instance seeded with seed. - If seed is a tuple/list of 2 integers, creates a new seeded RandomState. - If seed is already a RandomState instance, return it. - Otherwise raise ValueError. - - Returns - ------- - :class:`numpy:numpy.random.RandomState` - The random state object based on `seed` parameter. - - Examples - -------- - >>> from sklearn.utils.validation import check_random_state - >>> check_random_state(42) - RandomState(MT19937) at 0x... - """ - if seed is None or seed is np.random: - return np.random.mtrand._rand - if isinstance(seed, numbers.Integral): - return np.random.RandomState(seed) - if ( - (isinstance(seed, tuple) or isinstance(seed, list)) - and len(seed) == 2 - and isinstance(seed[0], numbers.Integral) - and isinstance(seed[1], numbers.Integral) - ): - return np.random.RandomState(np.random.default_rng(seed).bit_generator) - if isinstance(seed, np.random.RandomState): - return seed - raise ValueError( - "%r cannot be used to seed a numpy.random.RandomState instance" % seed - ) - - -class SeedGenerator: - """Generate seeds for parallel random number generation based on numpy guidelines. - - This class implements the recommended approach for parallel random number generation - from numpy's documentation: https://numpy.org/doc/stable/reference/random/parallel.html - - Parameters - ---------- - seed_root : int - The root seed value to generate worker-specific seeds. 
-    """
-
-    def __init__(self, seed_root):
-        self.seed_root = seed_root
-
-    def get_seed(self, worker_id):
-        """Generate a seed pair for a specific worker.
-
-        Parameters
-        ----------
-        worker_id : int
-            Unique identifier for the worker/job.
-
-        Returns
-        -------
-        list
-            A list containing [worker_id, seed_root] for random number generation.
-        """
-        return [worker_id, self.seed_root]
-
-
-class NoneGenerator:
-    """Generator that always returns None as seed.
-
-    This is used when no specific seed is required.
-    """
-
-    def get_seed(self, worker_id):
-        """Return None regardless of worker_id.
-
-        Parameters
-        ----------
-        worker_id : int
-            Unique identifier for the worker/job (unused).
-
-        Returns
-        -------
-        None
-            Always returns None.
-        """
-        return None
-
-
-def get_seed_generator(random_state):
-    """
-    Create the appropriate seed generator based on input type.
-    WARNING: this function must keep the same branches as check_random_state.
-
-    Parameters
-    ----------
-    random_state : None, int, tuple/list of 2 ints, or numpy.random.RandomState
-        The random state to use for seed generation.
-
-    Returns
-    -------
-    SeedGenerator or NoneGenerator
-        An appropriate generator for creating seeds.
-
-    Raises
-    ------
-    ValueError
-        If random_state is not of a supported type.
-    """
-    if random_state is None:
-        return NoneGenerator()
-    elif isinstance(random_state, numbers.Integral):
-        return SeedGenerator(seed_root=random_state)
-    elif (
-        (isinstance(random_state, tuple) or isinstance(random_state, list))
-        and len(random_state) == 2
-        and isinstance(random_state[0], numbers.Integral)
-        and isinstance(random_state[1], numbers.Integral)
-    ):
-        seed_root = check_random_state(random_state).randint(
-            np.iinfo(np.int32).max, size=1
-        )[0]
-        return SeedGenerator(seed_root=seed_root)
-    elif isinstance(random_state, np.random.RandomState):
-        seed_root = check_random_state(random_state).randint(
-            np.iinfo(np.int32).max, size=1
-        )[0]
-        return SeedGenerator(seed_root=seed_root)
-    else:
-        raise ValueError(
-            "%r cannot be used to seed a numpy.random.RandomState instance"
-            % random_state
-        )
diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py
index 78a0a6c51..518d6049b 100644
--- a/src/hidimstat/conditional_feature_importance.py
+++ b/src/hidimstat/conditional_feature_importance.py
@@ -2,10 +2,10 @@
 from joblib import Parallel, delayed
 from sklearn.base import check_is_fitted, clone, BaseEstimator
 from sklearn.metrics import root_mean_squared_error
+from sklearn.utils.validation import check_random_state
 
 from hidimstat.base_perturbation import BasePerturbation
 from hidimstat.conditional_sampling import ConditionalSampler
-from hidimstat._utils.utils import get_seed_generator
 
 
 class CFI(BasePerturbation):
@@ -104,14 +104,13 @@ def fit(self, X, y=None, groups=None, var_type="auto"):
         self : object
             Returns the instance itself. 
""" + self.random_state = check_random_state(self.random_state) super().fit(X, None, groups=groups) if isinstance(var_type, str): self.var_type = [var_type for _ in range(self.n_groups)] else: self.var_type = var_type - seed_generator = get_seed_generator(self.random_state) - self._list_imputation_models = [ ConditionalSampler( data_type=self.var_type[group_id], @@ -125,8 +124,7 @@ def fit(self, X, y=None, groups=None, var_type="auto"): if self.imputation_model_categorical is None else clone(self.imputation_model_categorical) ), - # require a RandomState due to scikitlearn check - random_state=seed_generator.get_seed(group_id), + random_state=self.random_state, categorical_max_cardinality=self.categorical_max_cardinality, ) for group_id in range(self.n_groups) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index fa689b84a..663d39b5e 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -569,143 +569,3 @@ def test_groups_warning(self, data_generator): " number of features for which importance is computed: 4", ): cfi.importance(X, y) - - -@pytest.fixture(scope="module") -def cfi_test_data(): - """ - Fixture to generate test data and a fitted LinearRegression model for CFI - reproducibility tests. - """ - X, y, _, _ = multivariate_simulation( - n_samples=100, - n_features=5, - support_size=2, - rho=0, - value=1, - signal_noise_ratio=4, - rho_serial=0, - shuffle=False, - seed=0, - ) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - model = LinearRegression() - model.fit(X_train, y_train) - return X_train, X_test, y_train, y_test, model - - -def test_cfi_multiple_calls_are_repeatibility(cfi_test_data): - """ - Test that multiple calls of .importance() when CFI is seeded provide deterministic - results. - """ - X_train, X_test, y_train, y_test, model = cfi_test_data - cfi = CFI( - estimator=model, - imputation_model_continuous=LinearRegression(), - n_permutations=20, - method="predict", - random_state=0, - n_jobs=1, - ) - cfi.fit(X_train, groups=None, var_type="auto") - vim = cfi.importance(X_test, y_test)["importance"] - print("reproduction") - vim_reproducible = cfi.importance(X_test, y_test)["importance"] - assert np.array_equal(vim, vim_reproducible) - - -def test_cfi_multiple_calls_are_repeatibility_None(cfi_test_data): - """ - Test that multiple calls of .importance() when CFI is seeded provide deterministic - results. - """ - X_train, X_test, y_train, y_test, model = cfi_test_data - cfi = CFI( - estimator=model, - imputation_model_continuous=LinearRegression(), - n_permutations=20, - method="predict", - random_state=None, - n_jobs=1, - ) - cfi.fit(X_train, groups=None, var_type="auto") - vim = cfi.importance(X_test, y_test)["importance"] - # rerun importance - vim_reproducible = cfi.importance(X_test, y_test)["importance"] - assert np.array_equal(vim, vim_reproducible) - - -def test_cfi_multiple_calls_are_not_repeatibility_None(cfi_test_data): - """ - Test that multiple calls of .importance() when CFI is seeded provide deterministic - results. 
-    """
-    X_train, X_test, y_train, y_test, model = cfi_test_data
-    cfi = CFI(
-        estimator=model,
-        imputation_model_continuous=LinearRegression(),
-        n_permutations=20,
-        method="predict",
-        random_state=None,
-        n_jobs=1,
-    )
-    cfi.fit(X_train, groups=None, var_type="auto")
-    vim = cfi.importance(X_test, y_test)["importance"]
-
-    # refit
-    cfi.fit(X_train, groups=None, var_type="auto")
-    vim_reproducible = cfi.importance(X_test, y_test)["importance"]
-    np.testing.assert_raises(
-        AssertionError, np.testing.assert_array_equal, vim, vim_reproducible
-    )
-
-
-def test_cfi_multiple_calls_are_repeatibility_rng(cfi_test_data):
-    """
-    Test that repeated calls of .importance() are not deterministic when CFI is
-    seeded with a RandomState instance, whose state advances between calls.
-    """
-    X_train, X_test, y_train, y_test, model = cfi_test_data
-    rng = np.random.RandomState(0)
-    cfi = CFI(
-        estimator=model,
-        imputation_model_continuous=LinearRegression(),
-        n_permutations=20,
-        method="predict",
-        random_state=rng,
-        n_jobs=1,
-    )
-    cfi.fit(X_train, groups=None, var_type="auto")
-    vim = cfi.importance(X_test, y_test)["importance"]
-    # rerun importance
-    vim_reproducible = cfi.importance(X_test, y_test)["importance"]
-    np.testing.assert_raises(
-        AssertionError, np.testing.assert_array_equal, vim, vim_reproducible
-    )
-
-
-def test_cfi_multiple_calls_are_not_repeatibility_rng(cfi_test_data):
-    """
-    Test that refitting a CFI seeded with a RandomState instance leads to
-    different .importance() results.
-    """
-    X_train, X_test, y_train, y_test, model = cfi_test_data
-    rng = np.random.RandomState(0)
-    cfi = CFI(
-        estimator=model,
-        imputation_model_continuous=LinearRegression(),
-        n_permutations=20,
-        method="predict",
-        random_state=rng,
-        n_jobs=1,
-    )
-    cfi.fit(X_train, groups=None, var_type="auto")
-    vim = cfi.importance(X_test, y_test)["importance"]
-
-    # refit
-    cfi.fit(X_train, groups=None, var_type="auto")
-    vim_reproducible = cfi.importance(X_test, y_test)["importance"]
-    np.testing.assert_raises(
-        AssertionError, np.testing.assert_array_equal, vim, vim_reproducible
-    )
From 8fd630c804fb0ebc9bf0ce798e0833bf9b130a6f Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Wed, 10 Sep 2025 12:39:08 +0200
Subject: [PATCH 32/46] remove some management of seeds

---
 src/hidimstat/base_perturbation.py              | 17 ++++-------------
 src/hidimstat/conditional_sampling.py           |  2 +-
 src/hidimstat/knockoffs.py                      |  3 +--
 src/hidimstat/leave_one_covariate_out.py        |  2 +-
 src/hidimstat/permutation_feature_importance.py | 13 +++++--------
 test/test_base_perturbation.py                  |  2 +-
 test/test_conditional_feature_importance.py     |  1 -
 7 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py
index af725f33b..ef3c58343 100644
--- a/src/hidimstat/base_perturbation.py
+++ b/src/hidimstat/base_perturbation.py
@@ -8,7 +8,6 @@
 from hidimstat._utils.utils import _check_vim_predict_method
 from hidimstat._utils.exception import InternalError
 from hidimstat.base_variable_importance import BaseVariableImportance
-from hidimstat._utils.utils import check_random_state
 
 
 class BasePerturbation(BaseVariableImportance):
@@ -54,7 +53,6 @@ def __init__(
         self.n_jobs = n_jobs
         self.n_permutations = n_permutations
         self.n_groups = None
-        self.random_state = None
 
     def fit(self, X, y=None, groups=None):
         """Base fit method for perturbation-based methods. Identifies the groups. 
@@ -113,11 +111,8 @@ def predict(self, X): X_ = np.asarray(X) # Parallelize the computation of the importance scores for each group - seed_root = check_random_state(self.random_state).randint( - np.iinfo(np.int32).max, size=1 - )[0] out_list = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_predict_one_group)(X_, group_id, group_key, seed_root) + delayed(self._joblib_predict_one_group)(X_, group_id, group_key) for group_id, group_key in enumerate(self.groups.keys()) ) return np.stack(out_list, axis=0) @@ -236,7 +231,7 @@ def _check_fit(self, X): f"{number_unique_feature_in_groups}" ) - def _joblib_predict_one_group(self, X, group_id, group_key, seed_root): + def _joblib_predict_one_group(self, X, group_id, group_key): """ Compute the predictions after perturbation of the data for a given group of variables. This function is parallelized. @@ -249,8 +244,6 @@ def _joblib_predict_one_group(self, X, group_id, group_key, seed_root): The index of the group of variables. group_key: str, int The key of the group of variables. (parameter use for debugging) - seed_root: int, optional - Random seed for reproducibility. """ group_ids = self._groups_ids[group_id] non_group_ids = np.delete(np.arange(X.shape[1]), group_ids) @@ -258,9 +251,7 @@ def _joblib_predict_one_group(self, X, group_id, group_key, seed_root): # where the j-th group of covariates is permuted X_perm = np.empty((self.n_permutations, X.shape[0], X.shape[1])) X_perm[:, :, non_group_ids] = np.delete(X, group_ids, axis=1) - X_perm[:, :, group_ids] = self._permutation( - X, group_id=group_id, seed_root=seed_root - ) + X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id) # Reshape X_perm to allow for batch prediction X_perm_batch = X_perm.reshape(-1, X.shape[1]) y_pred_perm = getattr(self.estimator, self.method)(X_perm_batch) @@ -274,6 +265,6 @@ def _joblib_predict_one_group(self, X, group_id, group_key, seed_root): ) return y_pred_perm - def _permutation(self, X, group_id, seed_root): + def _permutation(self, X, group_id): """Method for creating the permuted data for the j-th group of covariates.""" raise NotImplementedError diff --git a/src/hidimstat/conditional_sampling.py b/src/hidimstat/conditional_sampling.py index 1e0791fbf..8aaf55580 100644 --- a/src/hidimstat/conditional_sampling.py +++ b/src/hidimstat/conditional_sampling.py @@ -1,7 +1,7 @@ import numpy as np from sklearn.base import MultiOutputMixin, check_is_fitted, BaseEstimator from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor -from hidimstat._utils.utils import check_random_state +from sklearn.utils.validation import check_random_state def _check_data_type( diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 2b91d2109..764ad88ad 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -4,8 +4,8 @@ from sklearn.linear_model import LassoCV from sklearn.model_selection import KFold from sklearn.preprocessing import StandardScaler +from sklearn.utils import check_random_state from sklearn.utils.validation import check_memory -from sklearn.base import clone from hidimstat.gaussian_knockoff import ( gaussian_knockoff_generation, @@ -13,7 +13,6 @@ ) from hidimstat.statistical_tools.multiple_testing import fdr_threshold from hidimstat.statistical_tools.aggregation import quantile_aggregation -from hidimstat._utils.utils import check_random_state def preconfigure_estimator_LassoCV(estimator, X, X_tilde, y, n_alphas=20): diff --git a/src/hidimstat/leave_one_covariate_out.py 
b/src/hidimstat/leave_one_covariate_out.py index eec99d8cb..27bd29cbb 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -93,7 +93,7 @@ def _joblib_fit_one_group(self, estimator, X, y, key_groups): estimator.fit(X_minus_j, y) return estimator - def _joblib_predict_one_group(self, X, group_id, key_groups, seed): + def _joblib_predict_one_group(self, X, group_id, key_groups): """Predict the target variable after removing a group of covariates. Used in parallel.""" X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1) diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 7f54a2abe..7fe72a78a 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -1,8 +1,8 @@ import numpy as np from sklearn.metrics import root_mean_squared_error +from sklearn.utils import check_random_state from hidimstat.base_perturbation import BasePerturbation -from hidimstat._utils.utils import check_random_state class PFI(BasePerturbation): @@ -55,15 +55,12 @@ def __init__( ) self.random_state = random_state - def _permutation(self, X, group_id, seed_root): - """ - Create the permuted data for the j-th group of covariates - Warning: This method is run in parallel. - """ - rng = check_random_state([group_id, seed_root]) + def _permutation(self, X, group_id): + """Create the permuted data for the j-th group of covariates""" + self.random_state = check_random_state(self.random_state) X_perm_j = np.array( [ - rng.permutation(X[:, self._groups_ids[group_id]].copy()) + self.random_state.permutation(X[:, self._groups_ids[group_id]].copy()) for _ in range(self.n_permutations) ] ) diff --git a/test/test_base_perturbation.py b/test/test_base_perturbation.py index fb2c408fb..dd3ff6d6c 100644 --- a/test/test_base_perturbation.py +++ b/test/test_base_perturbation.py @@ -11,4 +11,4 @@ def test_no_implemented_methods(): estimator.fit(X[:, 0], X[:, 1]) basic_class = BasePerturbation(estimator=estimator) with pytest.raises(NotImplementedError): - basic_class._permutation(X, group_id=None, seed_root=0) + basic_class._permutation(X, group_id=None) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index 663d39b5e..4ca706bb3 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -10,7 +10,6 @@ from hidimstat import CFI, BasePerturbation from hidimstat._utils.exception import InternalError -from hidimstat._utils.scenario import multivariate_simulation def run_cfi(X, y, n_permutation, seed): From 7b501038d40e94ab5c24b0c5fae2749fd8e42e61 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 12:40:18 +0200 Subject: [PATCH 33/46] fix missing revert --- src/hidimstat/conditional_feature_importance.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 518d6049b..c43b3a176 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -180,12 +180,9 @@ def _check_fit(self, X): for m in self._list_imputation_models: check_is_fitted(m.model) - def _permutation(self, X, group_id, seed_root): - """ - Sample from the conditional distribution using a permutation of the - residuals. - Warning: This method is run in parallel. 
- """ + def _permutation(self, X, group_id): + """Sample from the conditional distribution using a permutation of the + residuals.""" X_j = X[:, self._groups_ids[group_id]].copy() X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1) return self._list_imputation_models[group_id].sample( From bf4a90920bbf0530e032231df317fea38581f12a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 16:13:04 +0200 Subject: [PATCH 34/46] fix clone --- src/hidimstat/knockoffs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 764ad88ad..fbf6d2b43 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -1,5 +1,6 @@ import numpy as np from joblib import Parallel, delayed +from sklearn.base import clone from sklearn.covariance import LedoitWolf from sklearn.linear_model import LassoCV from sklearn.model_selection import KFold From 999cf8f2d1db81331d776196b6af41ae17bbb033 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 16:18:14 +0200 Subject: [PATCH 35/46] fix tests --- test/test_conditional_feature_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index bacdf4493..ce87b326a 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -133,8 +133,8 @@ def test_linear_data_partial(data_generator, n_permutation, cfi_seed): rank = np.where(importance_sort == index)[0] if rank > min_rank: min_rank = rank - # accept missing ranking of 5 elements - assert min_rank < 20 + # accept missing ranking of 15 elements + assert min_rank < 25 @pytest.mark.parametrize( From 3531d0632cbaf4981e56f0561e00780e65ccd38d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 16:37:42 +0200 Subject: [PATCH 36/46] fix tests --- test/test_conditional_feature_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index ce87b326a..9e08945f5 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -78,7 +78,6 @@ def run_cfi(X, y, n_permutation, seed): ("HiDim", 150, 200, 10, 0.0, 42, 1.0, np.inf, 0.0), ("HiDim with noise", 150, 200, 10, 0.0, 42, 1.0, 10.0, 0.0), ("HiDim with correlated noise", 150, 200, 10, 0.0, 42, 1.0, 10.0, 0.2), - ("HiDim with correlated features", 150, 200, 10, 0.2, 42, 1.0, np.inf, 0.0), ] @@ -99,6 +98,7 @@ def test_linear_data_exact(data_generator, n_permutation, cfi_seed): parameter_partial = [ + ("HiDim with correlated features", 150, 200, 10, 0.2, 42, 1.0, np.inf, 0.0), ("HiDim with correlated features and noise", 150, 200, 10, 0.2, 42, 1, 10, 0), ( "HiDim with correlated features and correlated noise", From f1faa3c4e489cff9f0008a4243a8c534bfd4c798 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 11 Sep 2025 10:31:35 +0200 Subject: [PATCH 37/46] Apply suggestions from code review Co-authored-by: bthirion --- examples/plot_dcrt_example.py | 2 +- examples/plot_fmri_data_example.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/plot_dcrt_example.py b/examples/plot_dcrt_example.py index c09a5785f..1a60ed1d8 100644 --- a/examples/plot_dcrt_example.py +++ b/examples/plot_dcrt_example.py @@ -50,7 +50,7 @@ rho=rho, signal_noise_ratio=signal_noise_ratio, shuffle=True, - seed=sim_ind + 10, + seed=sim_ind, ) # 
Applying a reLu function on the outcome y to get non-linear relationships diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 8f2a3bb19..3b43a7b10 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -68,7 +68,7 @@ # Limit the ressoruce use for the example to 5 G. resource.setrlimit(resource.RLIMIT_AS, (int(5 * 1e9), int(5 * 1e9))) -# Set the number of job for the methods +# Set the number of jobs for parallel computations n_job = 1 @@ -173,7 +173,7 @@ def preprocess_haxby(subject=2, memory=None): scaler_sampling=StandardScaler(), tolerance=1e-2, seed=1, - n_jobs=n_job, + n_jobs=n_jobs, ) beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue( X.shape[0], None, ward_, beta_hat, theta_hat, omega_diag @@ -199,7 +199,7 @@ def preprocess_haxby(subject=2, memory=None): max_iteration=6000, tolerance=1e-2, seed=2, - n_jobs=n_job, + n_jobs=n_jobs, ) ) beta_hat, selected = ensemble_clustered_inference_pvalue( From 8ff07ea516a86dc9b2bc38191030852013ac834d Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 11 Sep 2025 10:33:19 +0200 Subject: [PATCH 38/46] Apply suggestions from code review Co-authored-by: bthirion --- examples/plot_fmri_data_example.py | 4 ++-- examples/plot_pitfalls_permutation_importance.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index 3b43a7b10..62ba660e7 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -69,7 +69,7 @@ # Limit the ressoruce use for the example to 5 G. resource.setrlimit(resource.RLIMIT_AS, (int(5 * 1e9), int(5 * 1e9))) # Set the number of jobs for parallel computations -n_job = 1 +n_jobs = 1 ############################################################################# @@ -152,7 +152,7 @@ def preprocess_haxby(subject=2, memory=None): # feature aggregation methods. 
try: beta_hat, sigma_hat, precision_diagonal = desparsified_lasso( - X, y, noise_method="median", max_iteration=1000, seed=0, n_jobs=n_job + X, y, noise_method="median", max_iteration=1000, seed=0, n_jobs=n_jobs ) pval_dl, _, one_minus_pval_dl, _, cb_min, cb_max = desparsified_lasso_pvalue( X.shape[0], beta_hat, sigma_hat, precision_diagonal diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index 4563cc700..99fcf3ecb 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -195,7 +195,7 @@ model_c, imputation_model_continuous=RidgeCV( alphas=np.logspace(-3, 3, 5), - cv=KFold(shuffle=True, random_state=6), + cv=KFold(n_splits=3), ), random_state=7, n_jobs=5, @@ -258,7 +258,7 @@ conditional_sampler = ConditionalSampler( model_regression=RidgeCV( - alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=9) + alphas=np.logspace(-3, 3, 5), cv=KFold(n_splits=3) ), random_state=10, ) From dfc1c9bd5872c762d10e6a66bcf5b3affdd24d09 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 11 Sep 2025 10:34:03 +0200 Subject: [PATCH 39/46] Apply suggestions from code review Co-authored-by: bthirion --- examples/plot_pitfalls_permutation_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index 99fcf3ecb..2bf66a429 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -119,7 +119,7 @@ # testing conditional importance, as it identifies the spurious feature as important. permutation_importances = [] conditional_permutation_importances = [] -kf = KFold(n_splits=5, shuffle=True, random_state=3) +kf = KFold(n_splits=5) for i, (train_index, test_index) in enumerate(kf.split(X)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -183,7 +183,7 @@ # explained by the other features unchanged. This method is valid for testing conditional # importance. As shown below, it does not identify the spurious feature as important. 
conditional_importances = [] -kf = KFold(n_splits=5, shuffle=True, random_state=3) +kf = KFold(n_splits=5) for i, (train_index, test_index) in enumerate(kf.split(X)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] From a232fa62e136cc35a262fc5f7f0ffb96b8ce6445 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 11 Sep 2025 10:35:12 +0200 Subject: [PATCH 40/46] format document --- examples/plot_pitfalls_permutation_importance.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index 2bf66a429..d408ac472 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -257,9 +257,7 @@ ) conditional_sampler = ConditionalSampler( - model_regression=RidgeCV( - alphas=np.logspace(-3, 3, 5), cv=KFold(n_splits=3) - ), + model_regression=RidgeCV(alphas=np.logspace(-3, 3, 5), cv=KFold(n_splits=3)), random_state=10, ) From 31ba5f5840ab57e4f68f6ae4fdd967063a44abfb Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 23 Sep 2025 10:49:22 +0200 Subject: [PATCH 41/46] fix example --- .../plot_conditional_vs_marginal_xor_data.py | 2 +- examples/plot_dcrt_example.py | 21 ++----------------- examples/plot_fmri_data_example.py | 2 +- 3 files changed, 4 insertions(+), 21 deletions(-) diff --git a/examples/plot_conditional_vs_marginal_xor_data.py b/examples/plot_conditional_vs_marginal_xor_data.py index 3a2c6a804..1351c80c6 100644 --- a/examples/plot_conditional_vs_marginal_xor_data.py +++ b/examples/plot_conditional_vs_marginal_xor_data.py @@ -77,7 +77,7 @@ # The decision function of the SVC shows that the model is able to learn the # non-linear decision boundary of the XOR problem. It also highlights that knowing # the value of both features is necessary to classify each sample correctly. 
-examples / plot_2D_simulation_example.py
+

 # %%
 # Computing the conditional and marginal importance
diff --git a/examples/plot_dcrt_example.py b/examples/plot_dcrt_example.py
index bcb8e1aaf..0a8b67241 100644
--- a/examples/plot_dcrt_example.py
+++ b/examples/plot_dcrt_example.py
@@ -54,15 +54,9 @@
 ## dcrt Lasso ##
 d0crt_lasso = D0CRT(
-<<<<<<< HEAD
     estimator=LassoCV(random_state=sim_ind, n_jobs=1),
-    screening=False,
+    screening_threshold=None,
     random_state=sim_ind,
-||||||| 57d40de4
-    d0crt_lasso = D0CRT(estimator=LassoCV(random_state=42, n_jobs=1), screening=False)
-=======
-    estimator=LassoCV(random_state=42, n_jobs=1), screening_threshold=None
->>>>>>> main
 )
 d0crt_lasso.fit_importance(X, y)
 pvals_lasso = d0crt_lasso.pvalues_
@@ -78,21 +72,10 @@
 ## dcrt Random Forest ##
 d0crt_random_forest = D0CRT(
     estimator=RandomForestRegressor(
-<<<<<<< HEAD
         n_estimators=100, random_state=sim_ind, n_jobs=1
     ),
-    screening=False,
-    random_state=sim_ind,
-||||||| 57d40de4
-    estimator=RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1),
-    screening=False,
-=======
-    n_estimators=100,
-    random_state=42,
-    n_jobs=1,
-    ),
     screening_threshold=None,
->>>>>>> main
+    random_state=sim_ind,
 )
 d0crt_random_forest.fit_importance(X, y)
 pvals_forest = d0crt_random_forest.pvalues_
diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py
index d7ef14398..09c5ec738 100644
--- a/examples/plot_fmri_data_example.py
+++ b/examples/plot_fmri_data_example.py
@@ -64,7 +64,7 @@
 new_soft_limit = limit_5G if soft < 0 else min(limit_5G, soft)
 new_hard_limit = limit_5G if hard < 0 else min(limit_5G, hard)
 resource.setrlimit(resource.RLIMIT_AS, (new_soft_limit, new_hard_limit))
-n_job = 1
+n_jobs = 1
 
 
 # %%
From 08fa0d4bf0ae61cd1dc37787ebb385173b3057a9 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Tue, 23 Sep 2025 11:20:28 +0200
Subject: [PATCH 42/46] fix language

---
 tools/examples/debugger_script/try_reproducibility.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/examples/debugger_script/try_reproducibility.py b/tools/examples/debugger_script/try_reproducibility.py
index 95d3c804a..462a0ddf1 100644
--- a/tools/examples/debugger_script/try_reproducibility.py
+++ b/tools/examples/debugger_script/try_reproducibility.py
@@ -12,6 +12,6 @@ def run_joblib(i):
 import plot_knockoff_aggregation  # include the example to test
 
-# run in parralel the same example for compare result
-parrallel = Parallel(n_jobs=4)
-parrallel(delayed(run_joblib)(i) for i in range(6))
+# run the same example in parallel to compare results
+parallel = Parallel(n_jobs=4)
+parallel(delayed(run_joblib)(i) for i in range(6))
From aba9798b501e42b75434ad6b670643b332aa0fd4 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Tue, 23 Sep 2025 11:21:59 +0200
Subject: [PATCH 43/46] fix import order

---
 examples/plot_knockoffs_wisconsin.py                  | 1 -
 tools/examples/debugger_script/try_reproducibility.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/examples/plot_knockoffs_wisconsin.py b/examples/plot_knockoffs_wisconsin.py
index f028d1a02..c6ee46928 100644
--- a/examples/plot_knockoffs_wisconsin.py
+++ b/examples/plot_knockoffs_wisconsin.py
@@ -12,7 +12,6 @@
 import numpy as np
 import pandas as pd
 
-
 # Define the seeds for the reproducibility of the example
 rng = np.random.RandomState(43)
 
diff --git a/tools/examples/debugger_script/try_reproducibility.py b/tools/examples/debugger_script/try_reproducibility.py
index 462a0ddf1..070376b03 100644
--- a/tools/examples/debugger_script/try_reproducibility.py 
+++ b/tools/examples/debugger_script/try_reproducibility.py
@@ -1,7 +1,6 @@
 import os
 import sys
 
-
 from joblib import Parallel, delayed

 # add the example for import them
From ce15242d1fcbee2ec977bab5c1358305af62f311 Mon Sep 17 00:00:00 2001
From: lionel kusch
Date: Tue, 23 Sep 2025 16:48:17 +0200
Subject: [PATCH 44/46] Apply suggestions from code review

Co-authored-by: Joseph Paillard
---
 examples/plot_importance_classification_iris.py | 2 +-
 examples/plot_knockoff_aggregation.py           | 2 +-
 examples/plot_knockoffs_wisconsin.py            | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py
index acf790f49..3d33fc2e5 100644
--- a/examples/plot_importance_classification_iris.py
+++ b/examples/plot_importance_classification_iris.py
@@ -37,7 +37,7 @@
 from hidimstat import CFI, PFI
 
 # Define the seeds for the reproducibility of the example
-rng = np.random.RandomState(0)
+rng = np.random.default_rng(0)
 # %%
 # Load the iris dataset and add a spurious feature
 # ------------------------------------------------
diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py
index 1423715d4..8794cba82 100644
--- a/examples/plot_knockoff_aggregation.py
+++ b/examples/plot_knockoff_aggregation.py
@@ -58,7 +58,7 @@
 # verbosity of the joblib
 joblib_verbose = 0
 # Define the seeds for the reproducibility of the example
-rng = np.random.RandomState(42)
+rng = np.random.default_rng(42)
 
 
 # %%
diff --git a/examples/plot_knockoffs_wisconsin.py b/examples/plot_knockoffs_wisconsin.py
index c6ee46928..070196f64 100644
--- a/examples/plot_knockoffs_wisconsin.py
+++ b/examples/plot_knockoffs_wisconsin.py
@@ -13,7 +13,7 @@
 import pandas as pd
 
 # Define the seeds for the reproducibility of the example
-rng = np.random.RandomState(43)
+rng = np.random.default_rng(43)
 
 
 # %%
From 368c0d87c036dd32766e0b7e980d6f8ef6ef11d2 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Tue, 23 Sep 2025 16:55:24 +0200
Subject: [PATCH 45/46] change random state to generator

---
 examples/plot_conditional_vs_marginal_xor_data.py     | 2 +-
 examples/plot_diabetes_variable_importance_example.py | 4 ++--
 examples/plot_model_agnostic_importance.py            | 2 +-
 examples/plot_pitfalls_permutation_importance.py      | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/plot_conditional_vs_marginal_xor_data.py b/examples/plot_conditional_vs_marginal_xor_data.py
index 1351c80c6..34fa662b9 100644
--- a/examples/plot_conditional_vs_marginal_xor_data.py
+++ b/examples/plot_conditional_vs_marginal_xor_data.py
@@ -21,7 +21,7 @@
 # %%
 # To solve the XOR problem, we will use a Support Vector Classier (SVC) with Radial Basis Function (RBF) kernel.
 #
-rng = np.random.RandomState(0)
+rng = np.random.default_rng(0)
 X = rng.randn(400, 2)
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int)
diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py
index b1dfd940f..603df59ec 100644
--- a/examples/plot_diabetes_variable_importance_example.py
+++ b/examples/plot_diabetes_variable_importance_example.py
@@ -89,7 +89,7 @@
 print(f"Fold {i}: {mse=}")
 
 # %%
-# Fit a baselien model on the diabetes dataset
+# Fit a baseline model on the diabetes dataset
 # --------------------------------------------
 # We use a Ridge regression model with a 10-fold cross-validation to fit the
 # diabetes dataset. 
@@ -97,7 +97,7 @@ n_folds = 10 regressor = RidgeCV(alphas=np.logspace(-3, 3, 10)) regressor_list = [clone(regressor) for _ in range(n_folds)] -kf = KFold(n_splits=n_folds, shuffle=True, random_state=0) +kf = KFold(n_splits=n_folds, shuffle=True, random_state=21) for i, (train_index, test_index) in enumerate(kf.split(X)): regressor_list[i].fit(X[train_index], y[train_index]) score = r2_score( diff --git a/examples/plot_model_agnostic_importance.py b/examples/plot_model_agnostic_importance.py index 140067eae..5bfdb90e9 100644 --- a/examples/plot_model_agnostic_importance.py +++ b/examples/plot_model_agnostic_importance.py @@ -40,7 +40,7 @@ # %% # Generate data where classes are not linearly separable # ------------------------------------------------------ -rng = np.random.RandomState(0) +rng = np.random.default_rng(0) X, y = make_circles(n_samples=500, noise=0.1, factor=0.6, random_state=rng) diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index 0eaf090b7..20b2cfaf0 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -29,7 +29,7 @@ from hidimstat.conditional_sampling import ConditionalSampler # Define the seeds for the reproducibility of the example -rng = np.random.RandomState(0) +rng = np.random.default_rng(0) # %% # Load the California housing dataset and add a spurious feature From c8734d1cda42b5a14271bb8afe48a30f8b7192f4 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 23 Sep 2025 17:03:53 +0200 Subject: [PATCH 46/46] fix example --- examples/plot_conditional_vs_marginal_xor_data.py | 2 +- examples/plot_knockoffs_wisconsin.py | 4 ++-- examples/plot_model_agnostic_importance.py | 7 ++++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/plot_conditional_vs_marginal_xor_data.py b/examples/plot_conditional_vs_marginal_xor_data.py index 34fa662b9..0c4f90cbb 100644 --- a/examples/plot_conditional_vs_marginal_xor_data.py +++ b/examples/plot_conditional_vs_marginal_xor_data.py @@ -22,7 +22,7 @@ # To solve the XOR problem, we will use a Support Vector Classier (SVC) with Radial Basis Function (RBF) kernel. 
# rng = np.random.default_rng(0) -X = rng.randn(400, 2) +X = rng.standard_normal((400, 2)) Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int) xx, yy = np.meshgrid( diff --git a/examples/plot_knockoffs_wisconsin.py b/examples/plot_knockoffs_wisconsin.py index 070196f64..6f03ecb71 100644 --- a/examples/plot_knockoffs_wisconsin.py +++ b/examples/plot_knockoffs_wisconsin.py @@ -84,8 +84,8 @@ for k in range(repeats_noise): X_train_c = X_train.copy() X_test_c = X_test.copy() - noises_train.append(X_train_c + 2 * rng.randn(n_train, p)) - noises_test.append(X_test_c + 2 * rng.randn(n_test, p)) + noises_train.append(X_train_c + 2 * rng.standard_normal((n_train, p))) + noises_test.append(X_test_c + 2 * rng.standard_normal((n_test, p))) feature_names_noise += [f"spurious #{k*p+i}" for i in range(p)] noisy_train = np.concatenate(noises_train, axis=1) diff --git a/examples/plot_model_agnostic_importance.py b/examples/plot_model_agnostic_importance.py index 5bfdb90e9..6ac08751e 100644 --- a/examples/plot_model_agnostic_importance.py +++ b/examples/plot_model_agnostic_importance.py @@ -41,7 +41,12 @@ # Generate data where classes are not linearly separable # ------------------------------------------------------ rng = np.random.default_rng(0) -X, y = make_circles(n_samples=500, noise=0.1, factor=0.6, random_state=rng) +X, y = make_circles( + n_samples=500, + noise=0.1, + factor=0.6, + random_state=np.random.RandomState(rng.bit_generator), +) fig, ax = plt.subplots()
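
For reference, two seeding patterns recur throughout this series. Patches 30-31
experiment with per-worker seeds derived from a root seed, following numpy's
parallel random-number guidelines, and patches 44-46 migrate the examples from
the legacy np.random.RandomState API to np.random.default_rng, bridging back to
a RandomState only where a downstream API still requires one. A minimal sketch
of both patterns (illustrative snippet only, not part of any commit above):

    import numpy as np

    # Per-worker seeding recommended by
    # https://numpy.org/doc/stable/reference/random/parallel.html
    # (the pattern that SeedGenerator in patch 30 approximates): each worker
    # gets an independent stream spawned from a single root seed.
    root = np.random.SeedSequence(42)
    workers = [np.random.default_rng(s) for s in root.spawn(4)]

    # RandomState -> Generator migration (patches 44-46):
    rng = np.random.default_rng(0)          # modern Generator API
    x = rng.standard_normal((400, 2))       # replaces RandomState.randn(400, 2)
    # Bridge for APIs that still expect a RandomState (e.g. scikit-learn's
    # random_state checks): wrap the Generator's bit generator so both objects
    # draw from the same underlying stream.
    legacy = np.random.RandomState(rng.bit_generator)

This keeps a single seed at the top of each example while remaining compatible
with estimators that have not yet adopted the Generator API.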