
Commit c276002

lionelkusch, bthirion, and jpaillard authored
Remove some randomization in the example and in the code base. (#344)
* Improve reproducibility in the example
* Improve reproducibility in the code base
* Add a test for reproducibility
* Fix error
* Homogenize management of seeds in the examples
* Fix randomization in plot_knockoff_aggregation
* Fix error in plot
* Fix path
* Fix seed for seaborn
* Improve seed setting
* Change seed in methods
* Fix seed in test and example
* Fix seeds
* Apply suggestions from code review (Co-authored-by: bthirion <[email protected]>)
* Remove some seeds
* Change seed management
* Remove rng
* Fix name
* Change seed
* Apply suggestions from code review (Co-authored-by: Joseph Paillard <[email protected]>)
* Fix bug for random_generator in perturbation cases
* Fix test
* Fix randomization
* Update the way seeds are set
* Update src/hidimstat/conditional_feature_importance.py (Co-authored-by: lionel kusch <[email protected]>)
* Add a new way to check randomness
* Apply suggestions from code review (Co-authored-by: Joseph Paillard <[email protected]>)
* Change predict signature
* Update docstring of check_random_state
* Improve seed handling
* Remove management of seed
* Remove some management of seeds
* Fix missing revert
* Fix clone
* Fix tests
* Fix tests
* Apply suggestions from code review (Co-authored-by: bthirion <[email protected]>)
* Apply suggestions from code review (Co-authored-by: bthirion <[email protected]>)
* Apply suggestions from code review (Co-authored-by: bthirion <[email protected]>)
* Format document
* Fix example
* Fix language
* Fix import order
* Apply suggestions from code review (Co-authored-by: Joseph Paillard <[email protected]>)
* Change random state to generator
* Fix example

---------

Co-authored-by: bthirion <[email protected]>
Co-authored-by: Joseph Paillard <[email protected]>
1 parent 6eb643c · commit c276002

17 files changed: +149 / -106 lines
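The change that repeats across these files is a move from NumPy's legacy `np.random.RandomState` to the `np.random.default_rng` Generator API, with every stochastic component receiving its own explicit seed. A minimal sketch of the two styles, in plain NumPy and independent of the commit itself:

import numpy as np

# Legacy API: a shared, mutable RandomState object (MT19937).
rs = np.random.RandomState(0)
x_legacy = rs.randn(400, 2)

# Preferred API: an explicit Generator (PCG64) built from a fixed seed.
rng = np.random.default_rng(0)
x_new = rng.standard_normal((400, 2))

# The two bit generators produce different streams; reproducibility
# comes from pinning the seed, not from matching the legacy values.
assert x_legacy.shape == x_new.shape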

examples/plot_2D_simulation_example.py

Lines changed: 4 additions & 7 deletions

@@ -80,7 +80,7 @@
 
 # generating the data
 X_init, y, beta, epsilon = multivariate_simulation_spatial(
-    n_samples, shape, roi_size, signal_noise_ratio, smooth_X, seed=1
+    n_samples, shape, roi_size, signal_noise_ratio, smooth_X, seed=0
 )
 
 # %%
@@ -188,6 +188,7 @@ def weight_map_2D_extended(shape, roi_size, delta):
     X_init,
     y,
     n_jobs=n_jobs,
+    seed=0,
 )
 pval, pval_corr, one_minus_pval, one_minus_pval_corr, cb_min, cb_max = (
     desparsified_lasso_pvalue(
@@ -221,7 +222,7 @@ def weight_map_2D_extended(shape, roi_size, delta):
 
 # clustered desparsified lasso (CluDL)
 ward_, beta_hat, theta_hat, omega_diag = clustered_inference(
-    X_init, y, ward, n_clusters, scaler_sampling=StandardScaler()
+    X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=0
 )
 beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = (
     clustered_inference_pvalue(
@@ -253,11 +254,7 @@ def weight_map_2D_extended(shape, roi_size, delta):
 # ensemble of clustered desparsified lasso (EnCluDL)
 list_ward, list_beta_hat, list_theta_hat, list_omega_diag = (
     ensemble_clustered_inference(
-        X_init,
-        y,
-        ward,
-        n_clusters,
-        scaler_sampling=StandardScaler(),
+        X_init, y, ward, n_clusters, scaler_sampling=StandardScaler(), seed=0
     )
 )
 beta_hat, selected_ecdl = ensemble_clustered_inference_pvalue(
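The `seed` arguments pinned above are what make repeated runs of the example bit-for-bit identical. A self-contained sketch of the property, with a hypothetical `simulate` helper standing in for hidimstat's simulation (not the library's actual implementation):

import numpy as np

def simulate(n_samples, n_features, seed=None):
    # The seed is consumed once, at the top; all randomness then
    # flows from the resulting Generator.
    rng = np.random.default_rng(seed)
    X = rng.standard_normal((n_samples, n_features))
    beta = rng.standard_normal(n_features)
    y = X @ beta + rng.standard_normal(n_samples)
    return X, y, beta

# Same seed, same data: the guarantee that `seed=0` buys above.
X_a, y_a, _ = simulate(100, 10, seed=0)
X_b, y_b, _ = simulate(100, 10, seed=0)
assert np.array_equal(X_a, X_b) and np.array_equal(y_a, y_b)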

examples/plot_conditional_vs_marginal_xor_data.py

Lines changed: 7 additions & 7 deletions

@@ -21,8 +21,8 @@
 # %%
 # To solve the XOR problem, we will use a Support Vector Classifier (SVC) with Radial Basis Function (RBF) kernel.
 #
-rng = np.random.RandomState(0)
-X = rng.randn(400, 2)
+rng = np.random.default_rng(0)
+X = rng.standard_normal((400, 2))
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int)
 
 xx, yy = np.meshgrid(
@@ -34,9 +34,9 @@
     X,
     Y,
     test_size=0.2,
-    random_state=0,
+    random_state=1,
 )
-model = SVC(kernel="rbf", random_state=0)
+model = SVC(kernel="rbf", random_state=2)
 model.fit(X_train, y_train)
 
 
@@ -88,8 +88,8 @@
 # features. Conditional importance, on the other hand, reveals that both features
 # are important (therefore rejecting the null hypothesis
 # :math:`Y \perp\!\!\!\perp X^1 | X^2`).
-cv = KFold(n_splits=5, shuffle=True, random_state=0)
-clf = SVC(kernel="rbf", random_state=0)
+cv = KFold(n_splits=5, shuffle=True, random_state=3)
+clf = SVC(kernel="rbf", random_state=4)
 
 # %%
 # Compute marginal importance using univariate models.
@@ -126,7 +126,7 @@
     loss=hinge_loss,
     imputation_model_continuous=RidgeCV(np.logspace(-3, 3, 10)),
     n_permutations=50,
-    random_state=0,
+    random_state=5,
 )
 vim.fit(X_train, y_train)
 importances.append(vim.importance(X_test, y_test)["importance"])
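Note the convention in this file: the data, the train/test split, each model, and the CV splitter now use distinct integers (0 through 5) rather than `random_state=0` everywhere. A short sketch of the idea under the same assumptions as the example (scikit-learn and NumPy only):

import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.svm import SVC

# One integer per consumer of randomness, so changing or removing one
# component never silently reseeds another.
rng = np.random.default_rng(0)                        # data generation
X = rng.standard_normal((400, 2))
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1               # split
)
model = SVC(kernel="rbf", random_state=2).fit(X_train, y_train)  # model
cv = KFold(n_splits=5, shuffle=True, random_state=3)             # CV splits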

examples/plot_dcrt_example.py

Lines changed: 5 additions & 5 deletions

@@ -25,7 +25,6 @@
 results_list = []
 for sim_ind in range(10):
     print(f"Processing: {sim_ind+1}")
-    np.random.seed(sim_ind)
 
     # Number of observations
     n = 100
@@ -55,7 +54,9 @@
 
     ## dcrt Lasso ##
     d0crt_lasso = D0CRT(
-        estimator=LassoCV(random_state=42, n_jobs=1), screening_threshold=None
+        estimator=LassoCV(random_state=sim_ind, n_jobs=1),
+        screening_threshold=None,
+        random_state=sim_ind,
     )
     d0crt_lasso.fit_importance(X, y)
     pvals_lasso = d0crt_lasso.pvalues_
@@ -71,11 +72,10 @@
     ## dcrt Random Forest ##
     d0crt_random_forest = D0CRT(
         estimator=RandomForestRegressor(
-            n_estimators=100,
-            random_state=42,
-            n_jobs=1,
+            n_estimators=100, random_state=sim_ind, n_jobs=1
         ),
         screening_threshold=None,
+        random_state=sim_ind,
    )
     d0crt_random_forest.fit_importance(X, y)
     pvals_forest = d0crt_random_forest.pvalues_
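The diff replaces one global `np.random.seed(sim_ind)` with explicit seeds on every stochastic object. A minimal sketch of why that is more robust (assuming only scikit-learn's `LassoCV`): the loop body stays reproducible even if it is reordered or parallelized, because nothing depends on hidden global state.

import numpy as np
from sklearn.linear_model import LassoCV

for sim_ind in range(3):
    data_rng = np.random.default_rng(sim_ind)            # per-run data stream
    X = data_rng.standard_normal((100, 10))
    y = X[:, 0] + 0.1 * data_rng.standard_normal(100)
    estimator = LassoCV(random_state=sim_ind, n_jobs=1)  # per-run estimator seed
    estimator.fit(X, y)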

examples/plot_diabetes_variable_importance_example.py

Lines changed: 14 additions & 9 deletions

@@ -70,9 +70,12 @@
 # diabetes dataset.
 
 n_folds = 5
-regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
+regressor = RidgeCV(
+    alphas=np.logspace(-3, 3, 10),
+    cv=KFold(shuffle=True, random_state=20),
+)
 regressor_list = [clone(regressor) for _ in range(n_folds)]
-kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     regressor_list[i].fit(X[train_index], y[train_index])
     score = r2_score(
@@ -86,15 +89,15 @@
     print(f"Fold {i}: {mse=}")
 
 # %%
-# Fit a baselien model on the diabetes dataset
+# Fit a baseline model on the diabetes dataset
 # --------------------------------------------
 # We use a Ridge regression model with a 10-fold cross-validation to fit the
 # diabetes dataset.
 
 n_folds = 10
 regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
 regressor_list = [clone(regressor) for _ in range(n_folds)]
-kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     regressor_list[i].fit(X[train_index], y[train_index])
     score = r2_score(
@@ -112,19 +115,21 @@
 # --------------------------------------------------------
 
 cfi_importance_list = []
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     print(f"Fold {i}")
     X_train, X_test = X[train_index], X[test_index]
     y_train, y_test = y[train_index], y[test_index]
     cfi = CFI(
         estimator=regressor_list[i],
-        imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
+        imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10), cv=KFold()),
         imputation_model_categorical=LogisticRegressionCV(
             Cs=np.logspace(-2, 2, 10),
+            cv=KFold(),
         ),
         # covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
         n_permutations=50,
-        random_state=0,
+        random_state=24,
         n_jobs=4,
     )
     cfi.fit(X_train, y_train)
@@ -136,7 +141,7 @@
 # ---------------------------------------------------------
 
 loco_importance_list = []
-
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     print(f"Fold {i}")
     X_train, X_test = X[train_index], X[test_index]
@@ -155,15 +160,15 @@
 # ----------------------------------------------------------------
 
 pfi_importance_list = []
-
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=21)
 for i, (train_index, test_index) in enumerate(kf.split(X)):
     print(f"Fold {i}")
     X_train, X_test = X[train_index], X[test_index]
     y_train, y_test = y[train_index], y[test_index]
     pfi = PFI(
         estimator=regressor_list[i],
         n_permutations=50,
-        random_state=0,
+        random_state=25,
         n_jobs=4,
     )
     pfi.fit(X_train, y_train)
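Rebuilding `kf` before each importance loop works because a `KFold` with a fixed `random_state` is deterministic: it yields the same splits every time, so each loop pairs fold i with the regressor fitted on fold i earlier. A quick self-contained check of that property:

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)
splits_a = list(KFold(n_splits=5, shuffle=True, random_state=21).split(X))
splits_b = list(KFold(n_splits=5, shuffle=True, random_state=21).split(X))
for (tr_a, te_a), (tr_b, te_b) in zip(splits_a, splits_b):
    # Identical train and test indices on every fold.
    assert np.array_equal(tr_a, tr_b) and np.array_equal(te_a, te_b)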

examples/plot_fmri_data_example.py

Lines changed: 13 additions & 5 deletions

@@ -64,7 +64,7 @@
     new_soft_limit = limit_5G if soft < 0 else min(limit_5G, soft)
     new_hard_limit = limit_5G if hard < 0 else min(limit_5G, hard)
     resource.setrlimit(resource.RLIMIT_AS, (new_soft_limit, new_hard_limit))
-n_job = 1
+n_jobs = 1
 
 
 # %%
@@ -149,7 +149,7 @@ def preprocess_haxby(subject=2, memory=None):
 #
 try:
     beta_hat, sigma_hat, precision_diagonal = desparsified_lasso(
-        X, y, noise_method="median", max_iteration=1000
+        X, y, noise_method="median", max_iteration=1000, seed=0, n_jobs=n_jobs
     )
     pval_dl, _, one_minus_pval_dl, _, cb_min, cb_max = desparsified_lasso_pvalue(
         X.shape[0], beta_hat, sigma_hat, precision_diagonal
@@ -163,7 +163,14 @@ def preprocess_haxby(subject=2, memory=None):
 # Now, the clustered inference algorithm which combines parcellation
 # and high-dimensional inference (c.f. References).
 ward_, beta_hat, theta_hat, omega_diag = clustered_inference(
-    X, y, ward, n_clusters, scaler_sampling=StandardScaler(), tolerance=1e-2
+    X,
+    y,
+    ward,
+    n_clusters,
+    scaler_sampling=StandardScaler(),
+    tolerance=1e-2,
+    seed=1,
+    n_jobs=n_jobs,
 )
 beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = clustered_inference_pvalue(
     X.shape[0], None, ward_, beta_hat, theta_hat, omega_diag
@@ -176,7 +183,7 @@ def preprocess_haxby(subject=2, memory=None):
 # which means that 5 different parcellations are considered and
 # then 5 statistical maps are produced and aggregated into one.
 # However you might benefit from clustering randomization taking
-# `n_bootstraps=25` or `n_bootstraps=100`, also we set `n_jobs=2`.
+# `n_bootstraps=25` or `n_bootstraps=100`, also we set `n_jobs`.
 list_ward, list_beta_hat, list_theta_hat, list_omega_diag = (
     ensemble_clustered_inference(
         X,
@@ -188,7 +195,8 @@ def preprocess_haxby(subject=2, memory=None):
         n_bootstraps=5,
         max_iteration=6000,
         tolerance=1e-2,
-        n_jobs=2,
+        seed=2,
+        n_jobs=n_jobs,
     )
 )
 beta_hat, selected = ensemble_clustered_inference_pvalue(

examples/plot_importance_classification_iris.py

Lines changed: 19 additions & 7 deletions

@@ -36,6 +36,8 @@
 
 from hidimstat import CFI, PFI
 
+# Define the seeds for the reproducibility of the example
+rng = np.random.default_rng(0)
 # %%
 # Load the iris dataset and add a spurious feature
 # ------------------------------------------------
@@ -45,7 +47,6 @@
 # contrarily to `CFI`.
 
 dataset = load_iris()
-rng = np.random.RandomState(0)
 X, y = dataset.data, dataset.target
 spurious_feat = X[:, 2] + X[:, 3]
 spurious_feat += rng.normal(size=X.shape[0], scale=np.std(spurious_feat) / 2)
@@ -86,17 +87,19 @@ def run_one_fold(
     if vim_name == "CFI":
         vim = CFI(
             estimator=model_c,
-            imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
+            imputation_model_continuous=RidgeCV(
+                alphas=np.logspace(-3, 3, 10), cv=KFold(shuffle=True, random_state=1)
+            ),
             n_permutations=50,
-            random_state=0,
+            random_state=2,
             method=method,
             loss=loss,
         )
     elif vim_name == "PFI":
         vim = PFI(
             estimator=model_c,
             n_permutations=50,
-            random_state=0,
+            random_state=3,
             method=method,
             loss=loss,
         )
@@ -124,10 +127,19 @@ def run_one_fold(
 # combination, in parallel.
 
 models = [
-    LogisticRegressionCV(Cs=np.logspace(-3, 3, 10), tol=1e-3, max_iter=1000),
-    GridSearchCV(SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}),
+    LogisticRegressionCV(
+        Cs=np.logspace(-3, 3, 10),
+        tol=1e-3,
+        max_iter=1000,
+        cv=KFold(shuffle=True, random_state=4),
+    ),
+    GridSearchCV(
+        SVC(kernel="rbf"),
+        {"C": np.logspace(-3, 3, 10)},
+        cv=KFold(shuffle=True, random_state=5),
+    ),
 ]
-cv = KFold(n_splits=5, shuffle=True, random_state=0)
+cv = KFold(n_splits=5, shuffle=True, random_state=6)
 groups = {ft: [i] for i, ft in enumerate(dataset.feature_names)}
 out_list = Parallel(n_jobs=5)(
     delayed(run_one_fold)(
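Moving `rng = np.random.default_rng(0)` up next to the imports matters because a Generator is stateful: each draw advances its stream, so later cells stay reproducible as long as they execute in a fixed order after a single creation point. A small sketch of that behavior:

import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=3)   # first draw from the stream
b = rng.normal(size=3)   # second draw continues the stream, so it differs

# Re-creating the Generator with the same seed replays the stream.
assert np.array_equal(np.random.default_rng(0).normal(size=3), a)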

examples/plot_knockoff_aggregation.py

Lines changed: 9 additions & 11 deletions

@@ -18,7 +18,6 @@
 from joblib import Parallel, delayed
 from sklearn.linear_model import LassoCV
 from sklearn.model_selection import KFold
-from sklearn.utils import check_random_state
 
 from hidimstat._utils.scenario import multivariate_simulation
 from hidimstat.knockoffs import (
@@ -54,15 +53,12 @@
 signal_noise_ratio = 10
 # number of repetitions for the bootstraps
 n_bootstraps = 25
-# seed for the random generator
-seed = 45
 # number of jobs for repetition of the method
 n_jobs = 2
 # verbosity of the joblib
 joblib_verbose = 0
-
-rng = check_random_state(seed)
-seed_list = rng.randint(1, np.iinfo(np.int32).max, runs)
+# Define the seeds for the reproducibility of the example
+rng = np.random.default_rng(42)
 
 
 # %%
@@ -96,9 +92,10 @@ def single_run(
         estimator=LassoCV(
             n_jobs=1,
             cv=KFold(n_splits=5, shuffle=True, random_state=0),
+            random_state=1,
        ),
         n_bootstraps=1,
-        random_state=seed,
+        random_state=2,
     )
     mx_selection, _ = model_x_knockoff_pvalue(test_scores, fdr=fdr)
     fdp_mx, power_mx = fdp_power(mx_selection, non_zero_index)
@@ -109,11 +106,12 @@ def single_run(
         y,
         estimator=LassoCV(
             n_jobs=1,
-            cv=KFold(n_splits=5, shuffle=True, random_state=0),
+            cv=KFold(n_splits=5, shuffle=True, random_state=3),
+            random_state=4,
         ),
         n_bootstraps=n_bootstraps,
         n_jobs=1,
-        random_state=seed,
+        random_state=5,
     )
 
     # Use p-values aggregation [2]
@@ -141,7 +139,7 @@ def plot_results(bounds, fdr, n_samples, n_features, power=False):
     for nb in range(len(bounds)):
         for i in range(len(bounds[nb])):
             y = bounds[nb][i]
-            x = np.random.normal(nb + 1, 0.05)
+            x = rng.normal(nb + 1, 0.05)
             plt.scatter(x, y, alpha=0.65, c="blue")
 
     plt.boxplot(bounds, sym="")
@@ -184,7 +182,7 @@ def effect_number_samples(n_samples):
            n_bootstraps,
            seed=seed,
        )
-        for seed in seed_list
+        for seed in range(runs)
    )
 
    fdps_mx = []
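The removed `check_random_state(seed)` / `rng.randint(...)` pair generated the per-run seeds themselves at random; consecutive integers are just as valid as seeds and easier to audit. A sketch of the before/after (the `runs = 20` value is illustrative, not taken from the example):

import numpy as np

runs = 20
# Before: a list of random integers drawn from a seeded RandomState.
old_seeds = np.random.RandomState(45).randint(1, np.iinfo(np.int32).max, runs)
# After: deterministic, self-documenting run indices.
new_seeds = range(runs)
print(old_seeds[:5], list(new_seeds)[:5])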
