Merged

56 commits
72094ff
Improve reproducibility example
lionelkusch Aug 22, 2025
fb8acdb
Improve reproducibility in code base
lionelkusch Aug 22, 2025
93fd34f
Add tests for reproducibility
lionelkusch Aug 22, 2025
4261f07
fix error
lionelkusch Aug 22, 2025
7ea339f
Merge branch 'main' into PR_remove_randomize
lionelkusch Aug 25, 2025
46e9961
homogenize management of seeds in examples
lionelkusch Aug 25, 2025
b5a4b08
Merge branch 'main' into PR_remove_randomize
lionelkusch Aug 25, 2025
8085706
fix randomize in plot_knockoff_aggregation
lionelkusch Aug 25, 2025
e68a9a3
fix error in plot
lionelkusch Aug 25, 2025
0b90779
fix path
lionelkusch Aug 25, 2025
8f4011f
fix seed for seaborn
lionelkusch Aug 25, 2025
fc0d21f
improve seed setting
lionelkusch Aug 26, 2025
99ed1e6
change seed in methods
lionelkusch Aug 26, 2025
6f98698
Merge branch 'main' into PR_remove_randomize
lionelkusch Aug 26, 2025
9ac2059
Fix seed in test and example
lionelkusch Aug 26, 2025
370f8d2
Merge branch 'main' into PR_remove_randomize
lionelkusch Aug 27, 2025
5d5a6d8
Fix seeds
lionelkusch Aug 27, 2025
25f0b3e
Apply suggestions from code review
lionelkusch Aug 28, 2025
7782e6f
remove some seed
lionelkusch Aug 28, 2025
f21edd7
change seed management
lionelkusch Aug 28, 2025
1dfcb63
remove rng
lionelkusch Aug 28, 2025
54fe2e9
fix name
lionelkusch Aug 28, 2025
072d319
change seed
lionelkusch Sep 2, 2025
814c24b
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 2, 2025
469857f
Apply suggestions from code review
lionelkusch Sep 3, 2025
02133c8
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 3, 2025
1dd12cc
fix bug for random_generator in perturbation cases
lionelkusch Sep 3, 2025
d6b8d1e
fix test
lionelkusch Sep 3, 2025
6718931
fix randomization
lionelkusch Sep 3, 2025
729b28a
update the way to set seeds
lionelkusch Sep 4, 2025
02a4c64
Update src/hidimstat/conditional_feature_importance.py
jpaillard Sep 8, 2025
d6314ca
add a new way to check random
lionelkusch Sep 9, 2025
930c24e
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 9, 2025
f9df301
Apply suggestions from code review
lionelkusch Sep 9, 2025
347a6af
change predict signature
lionelkusch Sep 9, 2025
05ca2b8
update docstring of check_random_state
lionelkusch Sep 9, 2025
b13f7b2
improve seed management
lionelkusch Sep 9, 2025
fad5127
remove management of seed
lionelkusch Sep 10, 2025
8fd630c
remove some management of seeds
lionelkusch Sep 10, 2025
f1eec67
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 10, 2025
7b50103
fix missing revert
lionelkusch Sep 10, 2025
bf4a909
fix clone
lionelkusch Sep 10, 2025
999cf8f
fix tests
lionelkusch Sep 10, 2025
3531d06
fix tests
lionelkusch Sep 10, 2025
f1faa3c
Apply suggestions from code review
lionelkusch Sep 11, 2025
8ff07ea
Apply suggestions from code review
lionelkusch Sep 11, 2025
dfc1c9b
Apply suggestions from code review
lionelkusch Sep 11, 2025
8a0b9b8
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 11, 2025
a232fa6
format document
lionelkusch Sep 11, 2025
6b5f521
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 23, 2025
31ba5f5
fix example
lionelkusch Sep 23, 2025
08fa0d4
fix language
lionelkusch Sep 23, 2025
aba9798
fix order import
lionelkusch Sep 23, 2025
ce15242
Apply suggestions from code review
lionelkusch Sep 23, 2025
368c0d8
change random state to generator
lionelkusch Sep 23, 2025
c8734d1
fix example
lionelkusch Sep 23, 2025
51 changes: 21 additions & 30 deletions examples/plot_diabetes_variable_importance_example.py
@@ -54,9 +54,12 @@
from sklearn.linear_model import LogisticRegressionCV, RidgeCV
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import KFold
from sklearn.utils import check_random_state

from hidimstat import CPI, LOCO, PFI

seeds = check_random_state(42).randint(1, np.iinfo(np.int32).max, 7)

#############################################################################
# Load the diabetes dataset
# -------------------------
@@ -71,30 +74,12 @@
# diabetes dataset.

n_folds = 5
regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
regressor_list = [clone(regressor) for _ in range(n_folds)]
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
for i, (train_index, test_index) in enumerate(kf.split(X)):
regressor_list[i].fit(X[train_index], y[train_index])
score = r2_score(
y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
)
mse = root_mean_squared_error(
y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
)

print(f"Fold {i}: {score}")
print(f"Fold {i}: {mse}")
#############################################################################
# Fit a baseline model on the diabetes dataset
# --------------------------------------------
# We use a Ridge regression model with a 10-fold cross-validation to fit the
# diabetes dataset.

n_folds = 10
regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
regressor = RidgeCV(
alphas=np.logspace(-3, 3, 10),
cv=KFold(shuffle=True, random_state=seeds[0]),
)
regressor_list = [clone(regressor) for _ in range(n_folds)]
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
regressor_list[i].fit(X[train_index], y[train_index])
score = r2_score(
@@ -112,17 +97,23 @@
# --------------------------------------------------------

cpi_importance_list = []
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
print(f"Fold {i}")
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
cpi = CPI(
estimator=regressor_list[i],
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
imputation_model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
# covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
imputation_model_continuous=RidgeCV(
alphas=np.logspace(-3, 3, 10),
cv=KFold(shuffle=True, random_state=seeds[3]),
),
imputation_model_categorical=LogisticRegressionCV(
Cs=np.logspace(-2, 2, 10),
cv=KFold(shuffle=True, random_state=seeds[4]),
),
n_permutations=50,
random_state=0,
random_state=seeds[5],
n_jobs=4,
)
cpi.fit(X_train, y_train)
@@ -134,7 +125,7 @@
# ---------------------------------------------------------

loco_importance_list = []

kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
print(f"Fold {i}")
X_train, X_test = X[train_index], X[test_index]
@@ -153,15 +144,15 @@
# ----------------------------------------------------------------

pfi_importance_list = []

kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
print(f"Fold {i}")
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
pfi = PFI(
estimator=regressor_list[i],
n_permutations=50,
random_state=0,
random_state=seeds[6],
n_jobs=4,
)
pfi.fit(X_train, y_train)
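The recurring pattern in this diff is to expand one master seed into independent child seeds with scikit-learn's `check_random_state`, then hand each stochastic component (the CV inside `RidgeCV`, the outer fold split, the permutation scheme) its own child seed. A minimal sketch of that pattern; the names `inner_cv` and `outer_cv` are illustrative and not part of the PR:

```python
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from sklearn.utils import check_random_state

# One master seed, expanded into independent child seeds (as in the example above).
seeds = check_random_state(42).randint(1, np.iinfo(np.int32).max, 7)

# Each stochastic component consumes its own child seed, so changing how one
# component draws random numbers does not shift the stream seen by the others.
inner_cv = KFold(shuffle=True, random_state=seeds[0])               # CV inside RidgeCV
outer_cv = KFold(n_splits=10, shuffle=True, random_state=seeds[1])  # outer fold split
regressor = RidgeCV(alphas=np.logspace(-3, 3, 10), cv=inner_cv)
```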
19 changes: 15 additions & 4 deletions examples/plot_importance_classification_iris.py
@@ -77,7 +77,9 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No
if vim_name == "CPI":
vim = CPI(
estimator=model_c,
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
imputation_model_continuous=RidgeCV(
alphas=np.logspace(-3, 3, 10), cv=KFold(shuffle=True, random_state=1)
),
n_permutations=50,
random_state=0,
method=method,
@@ -112,10 +114,19 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No
# combination, in parallel.

models = [
LogisticRegressionCV(Cs=np.logspace(-3, 3, 10), tol=1e-3, max_iter=1000),
GridSearchCV(SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}),
LogisticRegressionCV(
Cs=np.logspace(-3, 3, 10),
tol=1e-3,
max_iter=1000,
cv=KFold(shuffle=True, random_state=2),
),
GridSearchCV(
SVC(kernel="rbf"),
{"C": np.logspace(-3, 3, 10)},
cv=KFold(shuffle=True, random_state=3),
),
]
cv = KFold(n_splits=5, shuffle=True, random_state=0)
cv = KFold(n_splits=5, shuffle=True, random_state=4)
groups = {ft: i for i, ft in enumerate(dataset.feature_names)}
out_list = Parallel(n_jobs=5)(
delayed(run_one_fold)(
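These changes rely on a seeded `KFold(shuffle=True, ...)` split being bit-for-bit identical across runs. A quick self-contained check of that property (my sketch, not part of the PR):

```python
import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)

def test_folds():
    # Collect the test indices of each fold under a fixed seed.
    return [test for _, test in KFold(n_splits=5, shuffle=True, random_state=4).split(X)]

# The same seed must yield identical folds on every call.
assert all(np.array_equal(a, b) for a, b in zip(test_folds(), test_folds()))
```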
22 changes: 15 additions & 7 deletions examples/plot_knockoff_aggregation.py
@@ -73,7 +73,8 @@
#######################################################################
# Define the function for running the three procedures on the same data
# ---------------------------------------------------------------------
def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=None):
def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0):
seeds = check_random_state(seed).randint(1, np.iinfo(np.int32).max, 4)
# Generate data
X, y, _, non_zero_index = multivariate_1D_simulation_AR(
n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr
@@ -85,10 +86,10 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see
y,
estimator=LassoCV(
n_jobs=1,
cv=KFold(n_splits=5, shuffle=True, random_state=0),
cv=KFold(n_splits=5, shuffle=True, random_state=seeds[0]),
),
n_bootstraps=1,
random_state=seed,
random_state=seeds[1],
)
mx_selection, _ = model_x_knockoff_pvalue(test_scores, fdr=fdr)
fdp_mx, power_mx = fdp_power(mx_selection, non_zero_index)
@@ -99,11 +100,11 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see
y,
estimator=LassoCV(
n_jobs=1,
cv=KFold(n_splits=5, shuffle=True, random_state=0),
cv=KFold(n_splits=5, shuffle=True, random_state=seeds[2]),
),
n_bootstraps=n_bootstraps,
n_jobs=1,
random_state=seed,
random_state=seeds[3],
)

# Use p-values aggregation [2]
@@ -131,7 +132,7 @@ def plot_results(bounds, fdr, n_samples, n_features, power=False):
for nb in range(len(bounds)):
for i in range(len(bounds[nb])):
y = bounds[nb][i]
x = np.random.normal(nb + 1, 0.05)
x = rng.normal(nb + 1, 0.05)
plt.scatter(x, y, alpha=0.65, c="blue")

plt.boxplot(bounds, sym="")
@@ -165,7 +166,14 @@ def effect_number_samples(n_samples):
parallel = Parallel(n_jobs, verbose=joblib_verbose)
results = parallel(
delayed(single_run)(
n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=seed
n_samples,
n_features,
rho,
sparsity,
snr,
fdr,
n_bootstraps,
seed=seed,
)
for seed in seed_list
)
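The `plot_results` change swaps the process-global `np.random.normal` for a call on an explicit `rng` (presumably a module-level `RandomState` defined elsewhere in the example): with a shared global stream, any extra draw anywhere in the process silently shifts the jitter positions. A minimal sketch of the idea:

```python
import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(0)  # explicit, local source of randomness
# was: x = np.random.normal(nb + 1, 0.05) -- depends on hidden global state
x = rng.normal(loc=1.0, scale=0.05)  # reproducible given the seed above
```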
32 changes: 22 additions & 10 deletions examples/plot_pitfalls_permutation_importance.py
@@ -24,11 +24,13 @@
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state

from hidimstat import CPI, PFI
from hidimstat.conditional_sampling import ConditionalSampler

rng = np.random.RandomState(0)
rng = check_random_state(42)
seeds = rng.randint(1, np.iinfo(np.int32).max, 9)

#############################################################################
# Load the California housing dataset and add a spurious feature
@@ -40,7 +42,9 @@
dataset = fetch_california_housing()
X_, y_ = dataset.data, dataset.target
# only use 2/3 of samples to speed up the example
X, _, y, _ = train_test_split(X_, y_, test_size=0.6667, random_state=0, shuffle=True)
X, _, y, _ = train_test_split(
X_, y_, test_size=0.6667, random_state=seeds[0], shuffle=True
)

redundant_coef = rng.choice(np.arange(X.shape[1]), size=(3,), replace=False)
X_spurious = X[:, redundant_coef].sum(axis=1)
@@ -85,7 +89,7 @@
regressor=make_pipeline(
StandardScaler(),
MLPRegressor(
random_state=0,
random_state=seeds[1],
hidden_layer_sizes=(32, 16, 8),
early_stopping=True,
learning_rate_init=0.01,
Expand All @@ -96,7 +100,7 @@
)


kf = KFold(n_splits=5, shuffle=True, random_state=0)
kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2])
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
@@ -118,6 +122,7 @@
# testing conditional importance, as it identifies the spurious feature as important.
permutation_importances = []
conditional_permutation_importances = []
kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2])
for i, (train_index, test_index) in enumerate(kf.split(X)):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
@@ -128,7 +133,8 @@
pfi = PFI(
model_c,
n_permutations=50,
random_state=0,
n_jobs=5,
random_state=seeds[3],
)
pfi.fit(X_test, y_test)

@@ -185,6 +191,7 @@
# explained by the other features unchanged. This method is valid for testing conditional
# importance. As shown below, it does not identify the spurious feature as important.
conditional_importances = []
kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2])
for i, (train_index, test_index) in enumerate(kf.split(X)):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
@@ -194,8 +201,11 @@
# Compute conditional permutation feature importance
cpi = CPI(
model_c,
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 5)),
random_state=0,
imputation_model_continuous=RidgeCV(
alphas=np.logspace(-3, 3, 5),
cv=KFold(shuffle=True, random_state=seeds[4]),
),
random_state=seeds[5],
n_jobs=5,
)
cpi.fit(X_test, y_test)
@@ -251,12 +261,14 @@
X_train, X_test = train_test_split(
X,
test_size=0.3,
random_state=0,
random_state=seeds[6],
)

conditional_sampler = ConditionalSampler(
model_regression=RidgeCV(alphas=np.logspace(-3, 3, 5)),
random_state=0,
model_regression=RidgeCV(
alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=seeds[7])
),
random_state=seeds[8],
)


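This example seeds `rng.choice` so that the "spurious" feature — an exact deterministic function of three existing columns, carrying no information conditional on the others — is built from the same columns on every run. A standalone sketch of that setup; the synthetic `X` is a stand-in for the California housing features:

```python
import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(42)
X = rng.normal(size=(100, 8))  # stand-in for the real feature matrix

# Sum three randomly chosen existing columns: a feature that is marginally
# predictive but conditionally uninformative given the columns it copies.
redundant_coef = rng.choice(np.arange(X.shape[1]), size=(3,), replace=False)
X_spurious = X[:, redundant_coef].sum(axis=1)
X = np.hstack([X, X_spurious[:, np.newaxis]])  # spurious feature appended last
```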
21 changes: 16 additions & 5 deletions src/hidimstat/base_perturbation.py
@@ -49,6 +49,7 @@ def __init__(
self.n_jobs = n_jobs
self.n_permutations = n_permutations
self.n_groups = None
self.random_state = None

def fit(self, X, y=None, groups=None):
"""Base fit method for perturbation-based methods. Identifies the groups.
@@ -105,9 +106,17 @@ def predict(self, X):
X_ = np.asarray(X)

# Parallelize the computation of the importance scores for each group
if self.random_state is None:
list_seed = [None for i in range(self.n_groups)]
else:
list_seed = self.random_state.randint(
1, np.iinfo(np.int32).max, self.n_groups
)
out_list = Parallel(n_jobs=self.n_jobs)(
delayed(self._joblib_predict_one_group)(X_, group_id, group_key)
for group_id, group_key in enumerate(self.groups.keys())
delayed(self._joblib_predict_one_group)(X_, group_id, group_key, seed)
for group_id, (group_key, seed) in enumerate(
zip(self.groups.keys(), list_seed)
)
)
return np.stack(out_list, axis=0)

@@ -168,7 +177,7 @@ def _check_fit(self):
" call fit with groups=None"
)

def _joblib_predict_one_group(self, X, group_id, group_key):
def _joblib_predict_one_group(self, X, group_id, group_key, seed):
"""
Compute the predictions after perturbation of the data for a given
group of variables. This function is parallelized.
@@ -181,14 +190,16 @@ def _joblib_predict_one_group(self, X, group_id, group_key):
The index of the group of variables.
group_key: str, int
The key of the group of variables. (parameter use for debugging)
seed: int, optional
Random seed for reproducibility.
"""
group_ids = self._groups_ids[group_id]
non_group_ids = np.delete(np.arange(X.shape[1]), group_ids)
# Create an array X_perm_j of shape (n_permutations, n_samples, n_features)
# where the j-th group of covariates is permuted
X_perm = np.empty((self.n_permutations, X.shape[0], X.shape[1]))
X_perm[:, :, non_group_ids] = np.delete(X, group_ids, axis=1)
X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id)
X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id, seed=seed)
# Reshape X_perm to allow for batch prediction
X_perm_batch = X_perm.reshape(-1, X.shape[1])
y_pred_perm = getattr(self.estimator, self.method)(X_perm_batch)
@@ -202,6 +213,6 @@
)
return y_pred_perm

def _permutation(self, X, group_id):
def _permutation(self, X, group_id, seed):
"""Method for creating the permuted data for the j-th group of covariates."""
raise NotImplementedError
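The `predict` change above spawns one child seed per group from `self.random_state` and ships it into each joblib worker, so per-group permutations are reproducible regardless of backend or scheduling order. A toy sketch of the mechanism — this is not hidimstat's actual class, and the concrete `_permutation` behavior lives in the subclasses:

```python
import numpy as np

class ToyPerturbation:
    """Toy illustration of per-group seed spawning; not the hidimstat class."""

    def __init__(self, n_permutations=50, random_state=None):
        self.n_permutations = n_permutations
        self.random_state = (
            None if random_state is None else np.random.RandomState(random_state)
        )

    def _spawn_seeds(self, n_groups):
        # Mirrors the predict() change: None propagates non-determinism,
        # otherwise draw one independent child seed per group.
        if self.random_state is None:
            return [None] * n_groups
        return self.random_state.randint(1, np.iinfo(np.int32).max, n_groups)

    def _permutation(self, X, group_ids, seed):
        # Each worker rebuilds its own RandomState from the child seed, so the
        # output is identical for any joblib backend or execution order.
        rng = np.random.RandomState(seed)
        X_group = np.stack([X[:, group_ids]] * self.n_permutations)
        for k in range(self.n_permutations):
            rng.shuffle(X_group[k])  # permute rows of the k-th copy in place
        return X_group
```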