
Commit c276002

lionelkusch, bthirion, and jpaillard authored
Remove some randomization in the example and in the code base. (#344)
* Improve reproducibility in the example
* Improve reproducibility in the code base
* Add a test for reproducibility
* Fix error
* Homogenize management of seeds in the examples
* Fix randomization in plot_knockoff_aggregation
* Fix error in plot
* Fix path
* Fix seed for seaborn
* Improve seed setting
* Change seed in methods
* Fix seed in test and example
* Fix seeds
* Apply suggestions from code review (Co-authored-by: bthirion <[email protected]>)
* Remove some seeds
* Change seed management
* Remove rng
* Fix name
* Change seed
* Apply suggestions from code review (Co-authored-by: Joseph Paillard <[email protected]>)
* Fix bug for random_generator in perturbation cases
* Fix test
* Fix randomization
* Update the way seeds are set
* Update src/hidimstat/conditional_feature_importance.py (Co-authored-by: lionel kusch <[email protected]>)
* Add a new way to check randomness
* Apply suggestions from code review (Co-authored-by: Joseph Paillard <[email protected]>)
* Change predict signature
* Update docstring of check_random_state
* Improve seed handling
* Remove management of seed
* Remove some management of seeds
* Fix missing revert
* Fix clone
* Fix tests
* Fix tests
* Apply suggestions from code review (Co-authored-by: bthirion <[email protected]>)
* Apply suggestions from code review (Co-authored-by: bthirion <[email protected]>)
* Apply suggestions from code review (Co-authored-by: bthirion <[email protected]>)
* Format document
* Fix example
* Fix language
* Fix import order
* Apply suggestions from code review (Co-authored-by: Joseph Paillard <[email protected]>)
* Change random state to generator
* Fix example

---------

Co-authored-by: bthirion <[email protected]>
Co-authored-by: Joseph Paillard <[email protected]>
1 parent 6eb643c · commit c276002

17 files changed: +149 / -106 lines
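The change that repeats across these files is a move from NumPy's legacy `np.random.RandomState` to the `np.random.default_rng` Generator API, with every stochastic component receiving its own explicit seed. A minimal sketch of the two styles, in plain NumPy and independent of the commit itself:

import numpy as np

# Legacy API: a shared, mutable RandomState object (MT19937).
rs = np.random.RandomState(0)
x_legacy = rs.randn(400, 2)

# Preferred API: an explicit Generator (PCG64) built from a fixed seed.
rng = np.random.default_rng(0)
x_new = rng.standard_normal((400, 2))

# The two bit generators produce different streams; reproducibility
# comes from pinning the seed, not from matching the legacy values.
assert x_legacy.shape == x_new.shape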

examples/plot_2D_simulation_example.py

Lines changed: 4 additions & 7 deletions

@@ -80,7 +80,7 @@
 
 # generating the data
 X_init, y, beta, epsilon = multivariate_simulation_spatial(
-    n_samples, shape, roi_size, signal_noise_ratio, smooth_X, seed=1
+    n_samples, shape, roi_size, signal_noise_ratio, smooth_X, seed=0
 )
 
 # %%
@@ -188,6 +188,7 @@ def weight_map_2D_extended(shape, roi_size, delta):
     X_init,
     y,
     n_jobs=n_jobs,
+    seed=0,
 )
 pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = (
     desparsified_lasso_pvalue(
@@ -221,7 +222,7 @@ def weight_map_2D_extended(shape, roi_size, delta):
 
 # clustered desparsified lasso (CluDL)
 ward_, beta_hat, theta_hat, omega_diag = clustered_inference(
-    X_init, y, ward, n_clusters, scaler_sampling=StandardScaler()
+    X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=0
 )
 beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
     clustered_inference_pvalue(
@@ -253,11 +254,7 @@ def weight_map_2D_extended(shape, roi_size, delta):
 # ensemble of clustered desparsified lasso (EnCluDL)
 list_ward, list_beta_hat, list_theta_hat, list_omega_diag = (
     ensemble_clustered_inference(
-        X_init,
-        y,
-        ward,
-        n_clusters,
-        scaler_sampling=StandardScaler(),
+        X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=0
     )
 )
 beta_hat, selected_ecdl = ensemble_clustered_inference_pvalue(
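The `seed` arguments pinned above are what make repeated runs of the example bit-for-bit identical. A self-contained sketch of the property, with a hypothetical `simulate` helper standing in for hidimstat's simulation (not the library's actual implementation):

import numpy as np

def simulate(n_samples, n_features, seed=None):
    # The seed is consumed once, at the top; all randomness then
    # flows from the resulting Generator.
    rng = np.random.default_rng(seed)
    X = rng.standard_normal((n_samples, n_features))
    beta = rng.standard_normal(n_features)
    y = X @ beta + rng.standard_normal(n_samples)
    return X, y, beta

# Same seed, same data: the guarantee that `seed=0` buys above.
X_a, y_a, _ = simulate(100, 10, seed=0)
X_b, y_b, _ = simulate(100, 10, seed=0)
assert np.array_equal(X_a, X_b) and np.array_equal(y_a, y_b)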

examples/plot_conditional_vs_marginal_xor_data.py

Lines changed: 7 additions & 7 deletions

@@ -21,8 +21,8 @@
 # %%
 # To solve the XOR problem, we will use a Support Vector Classifier (SVC) with Radial Basis Function (RBF) kernel.
 #
-rng = np.random.RandomState(0)
-X = rng.randn(400, 2)
+rng = np.random.default_rng(0)
+X = rng.standard_normal((400, 2))
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int)
 
 xx, yy = np.meshgrid(
@@ -34,9 +34,9 @@
     X,
     Y,
     test_size=0.2,
-    random_state=0,
+    random_state=1,
 )
-model = SVC(kernel="rbf", random_state=0)
+model = SVC(kernel="rbf", random_state=2)
 model.fit(X_train, y_train)
 
 
@@ -88,8 +88,8 @@
 # features. Conditional importance, on the other hand, reveals that both features
 # are important (therefore rejecting the null hypothesis
 # :math:`Y \perp\!\!\!\perp X^1 | X^2`).
-cv = KFold(n_splits=5, shuffle=True, random_state=0)
-clf = SVC(kernel="rbf", random_state=0)
+cv = KFold(n_splits=5, shuffle=True, random_state=3)
+clf = SVC(kernel="rbf", random_state=4)
 
 # %%
 # Compute marginal importance using univariate models.
@@ -126,7 +126,7 @@
     loss=hinge_loss,
     imputation_model_continuous=RidgeCV(np.logspace(-3, 3, 10)),
     n_permutations=50,
-    random_state=0,
+    random_state=5,
 )
 vim.fit(X_train, y_train)
 importances.append(vim.importance(X_test, y_test)["importance"])
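Note the convention in this file: the data, the train/test split, each model, and the CV splitter now use distinct integers (0 through 5) rather than `random_state=0` everywhere. A short sketch of the idea under the same assumptions as the example (scikit-learn and NumPy only):

import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.svm import SVC

# One integer per consumer of randomness, so changing or removing one
# component never silently reseeds another.
rng = np.random.default_rng(0)                        # data generation
X = rng.standard_normal((400, 2))
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1               # split
)
model = SVC(kernel="rbf", random_state=2).fit(X_train, y_train)  # model
cv = KFold(n_splits=5, shuffle=True, random_state=3)             # CV splits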

examples/plot_dcrt_example.py

Lines changed: 5 additions & 5 deletions

@@ -25,7 +25,6 @@
 results_list = []
 for sim_ind in range(10):
     print(f"Processing: {sim_ind+1}")
-    np.random.seed(sim_ind)
 
     # Number of observations
     n = 100
@@ -55,7 +54,9 @@
 
     ## dcrt Lasso ##
     d0crt_lasso = D0CRT(
-        estimator=LassoCV(random_state=42, n_jobs=1), screening_threshold=None
+        estimator=LassoCV(random_state=sim_ind, n_jobs=1),
+        screening_threshold=None,
+        random_state=sim_ind,
     )
     d0crt_lasso.fit_importance(X, y)
     pvals_lasso = d0crt_lasso.pvalues_
@@ -71,11 +72,10 @@
     ## dcrt Random Forest ##
     d0crt_random_forest = D0CRT(
         estimator=RandomForestRegressor(
-            n_estimators=100,
-            random_state=42,
-            n_jobs=1,
+            n_estimators=100, random_state=sim_ind, n_jobs=1
         ),
         screening_threshold=None,
+        random_state=sim_ind,
    )
     d0crt_random_forest.fit_importance(X, y)
     pvals_forest = d0crt_random_forest.pvalues_
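The diff replaces one global `np.random.seed(sim_ind)` with explicit seeds on every stochastic object. A minimal sketch of why that is more robust (assuming only scikit-learn's `LassoCV`): the loop body stays reproducible even if it is reordered or parallelized, because nothing depends on hidden global state.

import numpy as np
from sklearn.linear_model import LassoCV

for sim_ind in range(3):
    data_rng = np.random.default_rng(sim_ind)            # per-run data stream
    X = data_rng.standard_normal((100, 10))
    y = X[:, 0] + 0.1 * data_rng.standard_normal(100)
    estimator = LassoCV(random_state=sim_ind, n_jobs=1)  # per-run estimator seed
    estimator.fit(X, y)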

examples/plot_diabetes_variable_importance_example.py

Lines changed: 14 additions & 9 deletions

@@ -70,9 +70,12 @@
 # diabetes dataset.
 
 n_folds = 5
-regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
+regressor = RidgeCV(
+    alphas=np.logspace(-3, 3, 10),
+    cv=KFold(shuffle=True, random_state=20),
+)
 regressor_list = [clone(regressor) for _ in range(n_folds)]
-kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     regressor_list[i].fit(X[train_index], y[train_index])
     score = r2_score(
@@ -86,15 +89,15 @@
     print(f"Fold {i}: {mse=}")
 
 # %%
-# Fit a baselien model on the diabetes dataset
+# Fit a baseline model on the diabetes dataset
 # --------------------------------------------
 # We use a Ridge regression model with a 10-fold cross-validation to fit the
 # diabetes dataset.
 
 n_folds = 10
 regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
 regressor_list = [clone(regressor) for _ in range(n_folds)]
-kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     regressor_list[i].fit(X[train_index], y[train_index])
     score = r2_score(
@@ -112,19 +115,21 @@
 # --------------------------------------------------------
 
 cfi_importance_list = []
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     print(f"Fold {i}")
     X_train, X_test = X[train_index], X[test_index]
     y_train, y_test = y[train_index], y[test_index]
     cfi = CFI(
         estimator=regressor_list[i],
-        imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
+        imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10), cv=KFold()),
         imputation_model_categorical=LogisticRegressionCV(
             Cs=np.logspace(-2, 2, 10),
+            cv=KFold(),
         ),
         # covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
         n_permutations=50,
-        random_state=0,
+        random_state=24,
         n_jobs=4,
     )
     cfi.fit(X_train, y_train)
@@ -136,7 +141,7 @@
 # ---------------------------------------------------------
 
 loco_importance_list = []
-
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     print(f"Fold {i}")
     X_train, X_test = X[train_index], X[test_index]
@@ -155,15 +160,15 @@
 # ----------------------------------------------------------------
 
 pfi_importance_list = []
-
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     print(f"Fold {i}")
     X_train, X_test = X[train_index], X[test_index]
     y_train, y_test = y[train_index], y[test_index]
     pfi = PFI(
         estimator=regressor_list[i],
         n_permutations=50,
-        random_state=0,
+        random_state=25,
         n_jobs=4,
     )
     pfi.fit(X_train, y_train)
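Rebuilding `kf` before each importance loop works because a `KFold` with a fixed `random_state` is deterministic: it yields the same splits every time, so each loop pairs fold i with the regressor fitted on fold i earlier. A quick self-contained check of that property:

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)
splits_a = list(KFold(n_splits=5, shuffle=True, random_state=21).split(X))
splits_b = list(KFold(n_splits=5, shuffle=True, random_state=21).split(X))
for (tr_a, te_a), (tr_b, te_b) in zip(splits_a, splits_b):
    # Identical train and test indices on every fold.
    assert np.array_equal(tr_a, tr_b) and np.array_equal(te_a, te_b)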

examples/plot_fmri_data_example.py

Lines changed: 13 additions & 5 deletions

@@ -64,7 +64,7 @@
     new_soft_limit = limit_5G if soft < 0 else min(limit_5G, soft)
     new_hard_limit = limit_5G if hard < 0 else min(limit_5G, hard)
     resource.setrlimit(resource.RLIMIT_AS, (new_soft_limit, new_hard_limit))
-n_job = 1
+n_jobs = 1
 
 
 # %%
@@ -149,7 +149,7 @@ def preprocess_haxby(subject=2, memory=None):
 #
 try:
     beta_hat, sigma_hat, precision_diagonal = desparsified_lasso(
-        X, y, noise_method="median", max_iteration=1000
+        X, y, noise_method="median", max_iteration=1000, seed=0, n_jobs=n_jobs
     )
     pval_dl, _, one_minus_pval_dl, _, cb_min, cb_max = desparsified_lasso_pvalue(
         X.shape[0], beta_hat, sigma_hat, precision_diagonal
@@ -163,7 +163,14 @@ def preprocess_haxby(subject=2, memory=None):
 # Now, the clustered inference algorithm which combines parcellation
 # and high-dimensional inference (c.f. References).
 ward_, beta_hat, theta_hat, omega_diag = clustered_inference(
-    X, y, ward, n_clusters, scaler_sampling=StandardScaler(), tolerance=1e-2
+    X,
+    y,
+    ward,
+    n_clusters,
+    scaler_sampling=StandardScaler(),
+    tolerance=1e-2,
+    seed=1,
+    n_jobs=n_jobs,
 )
 beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue(
     X.shape[0], None, ward_, beta_hat, theta_hat, omega_diag
@@ -176,7 +183,7 @@ def preprocess_haxby(subject=2, memory=None):
 # which means that 5 different parcellations are considered and
 # then 5 statistical maps are produced and aggregated into one.
 # However you might benefit from clustering randomization taking
-# `n_bootstraps=25` or `n_bootstraps=100`, also we set `n_jobs=2`.
+# `n_bootstraps=25` or `n_bootstraps=100`, also we set `n_jobs`.
 list_ward, list_beta_hat, list_theta_hat, list_omega_diag = (
     ensemble_clustered_inference(
         X,
@@ -188,7 +195,8 @@ def preprocess_haxby(subject=2, memory=None):
         n_bootstraps=5,
         max_iteration=6000,
         tolerance=1e-2,
-        n_jobs=2,
+        seed=2,
+        n_jobs=n_jobs,
     )
 )
 beta_hat, selected = ensemble_clustered_inference_pvalue(

examples/plot_importance_classification_iris.py

Lines changed: 19 additions & 7 deletions

@@ -36,6 +36,8 @@
 
 from hidimstat import CFI, PFI
 
+# Define the seeds for the reproducibility of the example
+rng = np.random.default_rng(0)
 # %%
 # Load the iris dataset and add a spurious feature
 # ------------------------------------------------
@@ -45,7 +47,6 @@
 # contrarily to `CFI`.
 
 dataset = load_iris()
-rng = np.random.RandomState(0)
 X, y = dataset.data, dataset.target
 spurious_feat = X[:, 2] + X[:, 3]
 spurious_feat += rng.normal(size=X.shape[0], scale=np.std(spurious_feat) / 2)
@@ -86,17 +87,19 @@ def run_one_fold(
     if vim_name == "CFI":
         vim = CFI(
             estimator=model_c,
-            imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
+            imputation_model_continuous=RidgeCV(
+                alphas=np.logspace(-3, 3, 10), cv=KFold(shuffle=True, random_state=1)
+            ),
             n_permutations=50,
-            random_state=0,
+            random_state=2,
             method=method,
             loss=loss,
         )
     elif vim_name == "PFI":
         vim = PFI(
             estimator=model_c,
             n_permutations=50,
-            random_state=0,
+            random_state=3,
             method=method,
             loss=loss,
         )
@@ -124,10 +127,19 @@ def run_one_fold(
 # combination, in parallel.
 
 models = [
-    LogisticRegressionCV(Cs=np.logspace(-3, 3, 10), tol=1e-3, max_iter=1000),
-    GridSearchCV(SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}),
+    LogisticRegressionCV(
+        Cs=np.logspace(-3, 3, 10),
+        tol=1e-3,
+        max_iter=1000,
+        cv=KFold(shuffle=True, random_state=4),
+    ),
+    GridSearchCV(
+        SVC(kernel="rbf"),
+        {"C": np.logspace(-3, 3, 10)},
+        cv=KFold(shuffle=True, random_state=5),
+    ),
 ]
-cv = KFold(n_splits=5, shuffle=True, random_state=0)
+cv = KFold(n_splits=5, shuffle=True, random_state=6)
 groups = {ft: [i] for i, ft in enumerate(dataset.feature_names)}
 out_list = Parallel(n_jobs=5)(
     delayed(run_one_fold)(
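Moving `rng = np.random.default_rng(0)` up next to the imports matters because a Generator is stateful: each draw advances its stream, so later cells stay reproducible as long as they execute in a fixed order after a single creation point. A small sketch of that behavior:

import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=3)   # first draw from the stream
b = rng.normal(size=3)   # second draw continues the stream, so it differs

# Re-creating the Generator with the same seed replays the stream.
assert np.array_equal(np.random.default_rng(0).normal(size=3), a)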

examples/plot_knockoff_aggregation.py

Lines changed: 9 additions & 11 deletions

@@ -18,7 +18,6 @@
 from joblib import Parallel, delayed
 from sklearn.linear_model import LassoCV
 from sklearn.model_selection import KFold
-from sklearn.utils import check_random_state
 
 from hidimstat._utils.scenario import multivariate_simulation
 from hidimstat.knockoffs import (
@@ -54,15 +53,12 @@
 signal_noise_ratio = 10
 # number of repetitions for the bootstraps
 n_bootstraps = 25
-# seed for the random generator
-seed = 45
 # number of jobs for repetition of the method
 n_jobs = 2
 # verbosity of the joblib
 joblib_verbose = 0
-
-rng = check_random_state(seed)
-seed_list = rng.randint(1, np.iinfo(np.int32).max, runs)
+# Define the seeds for the reproducibility of the example
+rng = np.random.default_rng(42)
 
 
 # %%
@@ -96,9 +92,10 @@ def single_run(
         estimator=LassoCV(
             n_jobs=1,
             cv=KFold(n_splits=5, shuffle=True, random_state=0),
+            random_state=1,
        ),
         n_bootstraps=1,
-        random_state=seed,
+        random_state=2,
     )
     mx_selection, _ = model_x_knockoff_pvalue(test_scores, fdr=fdr)
     fdp_mx, power_mx = fdp_power(mx_selection, non_zero_index)
@@ -109,11 +106,12 @@ def single_run(
         y,
         estimator=LassoCV(
             n_jobs=1,
-            cv=KFold(n_splits=5, shuffle=True, random_state=0),
+            cv=KFold(n_splits=5, shuffle=True, random_state=3),
+            random_state=4,
         ),
         n_bootstraps=n_bootstraps,
         n_jobs=1,
-        random_state=seed,
+        random_state=5,
     )
 
     # Use p-values aggregation [2]
@@ -141,7 +139,7 @@ def plot_results(bounds, fdr, n_samples, n_features, power=False):
     for nb in range(len(bounds)):
         for i in range(len(bounds[nb])):
             y = bounds[nb][i]
-            x = np.random.normal(nb + 1, 0.05)
+            x = rng.normal(nb + 1, 0.05)
             plt.scatter(x, y, alpha=0.65, c="blue")
 
     plt.boxplot(bounds, sym="")
@@ -184,7 +182,7 @@ def effect_number_samples(n_samples):
            n_bootstraps,
            seed=seed,
        )
-        for seed in seed_list
+        for seed in range(runs)
    )
 
    fdps_mx = []
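The removed `check_random_state(seed)` / `rng.randint(...)` pair generated the per-run seeds themselves at random; consecutive integers are just as valid as seeds and easier to audit. A sketch of the before/after (the `runs = 20` value is illustrative, not taken from the example):

import numpy as np

runs = 20
# Before: a list of random integers drawn from a seeded RandomState.
old_seeds = np.random.RandomState(45).randint(1, np.iinfo(np.int32).max, runs)
# After: deterministic, self-documenting run indices.
new_seeds = range(runs)
print(old_seeds[:5], list(new_seeds)[:5])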
