Merged

56 commits
72094ff
Improve reproducibility example
lionelkusch Aug 22, 2025
fb8acdb
Improve reproducibility in code base
lionelkusch Aug 22, 2025
93fd34f
Add tests for reproducibility
lionelkusch Aug 22, 2025
4261f07
fix error
lionelkusch Aug 22, 2025
7ea339f
Merge branch 'main' into PR_remove_randomize
lionelkusch Aug 25, 2025
46e9961
homogenize management of seeds in examples
lionelkusch Aug 25, 2025
b5a4b08
Merge branch 'main' into PR_remove_randomize
lionelkusch Aug 25, 2025
8085706
fix randomize in plot_knockoff_aggregation
lionelkusch Aug 25, 2025
e68a9a3
fix error in plot
lionelkusch Aug 25, 2025
0b90779
fix path
lionelkusch Aug 25, 2025
8f4011f
fix seed for seaborn
lionelkusch Aug 25, 2025
fc0d21f
improve seed setting
lionelkusch Aug 26, 2025
99ed1e6
change seed in methods
lionelkusch Aug 26, 2025
6f98698
Merge branch 'main' into PR_remove_randomize
lionelkusch Aug 26, 2025
9ac2059
Fix seed in test and example
lionelkusch Aug 26, 2025
370f8d2
Merge branch 'main' into PR_remove_randomize
lionelkusch Aug 27, 2025
5d5a6d8
Fix seeds
lionelkusch Aug 27, 2025
25f0b3e
Apply suggestions from code review
lionelkusch Aug 28, 2025
7782e6f
remove some seed
lionelkusch Aug 28, 2025
f21edd7
change seed management
lionelkusch Aug 28, 2025
1dfcb63
remove rng
lionelkusch Aug 28, 2025
54fe2e9
fix name
lionelkusch Aug 28, 2025
072d319
change seed
lionelkusch Sep 2, 2025
814c24b
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 2, 2025
469857f
Apply suggestions from code review
lionelkusch Sep 3, 2025
02133c8
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 3, 2025
1dd12cc
fix bug for random_generator in perturbation cases
lionelkusch Sep 3, 2025
d6b8d1e
fix test
lionelkusch Sep 3, 2025
6718931
fix randomization
lionelkusch Sep 3, 2025
729b28a
update the way to set seeds
lionelkusch Sep 4, 2025
02a4c64
Update src/hidimstat/conditional_feature_importance.py
jpaillard Sep 8, 2025
d6314ca
add a new way to check random
lionelkusch Sep 9, 2025
930c24e
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 9, 2025
f9df301
Apply suggestions from code review
lionelkusch Sep 9, 2025
347a6af
change predict signature
lionelkusch Sep 9, 2025
05ca2b8
update docstring of check_random_state
lionelkusch Sep 9, 2025
b13f7b2
improve seed management
lionelkusch Sep 9, 2025
fad5127
remove management of seed
lionelkusch Sep 10, 2025
8fd630c
remove some management of seeds
lionelkusch Sep 10, 2025
f1eec67
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 10, 2025
7b50103
fix missing revert
lionelkusch Sep 10, 2025
bf4a909
fix clone
lionelkusch Sep 10, 2025
999cf8f
fix tests
lionelkusch Sep 10, 2025
3531d06
fix tests
lionelkusch Sep 10, 2025
f1faa3c
Apply suggestions from code review
lionelkusch Sep 11, 2025
8ff07ea
Apply suggestions from code review
lionelkusch Sep 11, 2025
dfc1c9b
Apply suggestions from code review
lionelkusch Sep 11, 2025
8a0b9b8
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 11, 2025
a232fa6
format document
lionelkusch Sep 11, 2025
6b5f521
Merge branch 'main' into PR_remove_randomize
lionelkusch Sep 23, 2025
31ba5f5
fix example
lionelkusch Sep 23, 2025
08fa0d4
fix language
lionelkusch Sep 23, 2025
aba9798
fix order import
lionelkusch Sep 23, 2025
ce15242
Apply suggestions from code review
lionelkusch Sep 23, 2025
368c0d8
change random state to generator
lionelkusch Sep 23, 2025
c8734d1
fix example
lionelkusch Sep 23, 2025
51 changes: 21 additions & 30 deletions examples/plot_diabetes_variable_importance_example.py
@@ -54,9 +54,12 @@
from sklearn.linear_model import LogisticRegressionCV, RidgeCV
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import KFold
from sklearn.utils import check_random_state

from hidimstat import CPI, LOCO, PFI

seeds = check_random_state(42).randint(1, np.iinfo(np.int32).max, 7)

#############################################################################
# Load the diabetes dataset
# -------------------------
@@ -71,30 +74,12 @@
# diabetes dataset.

n_folds = 5
regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
regressor_list = [clone(regressor) for _ in range(n_folds)]
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
for i, (train_index, test_index) in enumerate(kf.split(X)):
regressor_list[i].fit(X[train_index], y[train_index])
score = r2_score(
y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
)
mse = root_mean_squared_error(
y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
)

print(f"Fold {i}: {score}")
print(f"Fold {i}: {mse}")
#############################################################################
# Fit a baseline model on the diabetes dataset
# --------------------------------------------
# We use a Ridge regression model with a 10-fold cross-validation to fit the
# diabetes dataset.

n_folds = 10
regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
regressor = RidgeCV(
alphas=np.logspace(-3, 3, 10),
cv=KFold(shuffle=True, random_state=seeds[0]),
)
regressor_list = [clone(regressor) for _ in range(n_folds)]
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
regressor_list[i].fit(X[train_index], y[train_index])
score = r2_score(
@@ -112,17 +97,23 @@
# --------------------------------------------------------

cpi_importance_list = []
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
print(f"Fold {i}")
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
cpi = CPI(
estimator=regressor_list[i],
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
imputation_model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
# covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
imputation_model_continuous=RidgeCV(
alphas=np.logspace(-3, 3, 10),
cv=KFold(shuffle=True, random_state=seeds[3]),
),
imputation_model_categorical=LogisticRegressionCV(
Cs=np.logspace(-2, 2, 10),
cv=KFold(shuffle=True, random_state=seeds[4]),
),
n_permutations=50,
random_state=0,
random_state=seeds[5],
n_jobs=4,
)
cpi.fit(X_train, y_train)
@@ -134,7 +125,7 @@
# ---------------------------------------------------------

loco_importance_list = []

kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
print(f"Fold {i}")
X_train, X_test = X[train_index], X[test_index]
@@ -153,15 +144,15 @@
# ----------------------------------------------------------------

pfi_importance_list = []

kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
print(f"Fold {i}")
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
pfi = PFI(
estimator=regressor_list[i],
n_permutations=50,
random_state=0,
random_state=seeds[6],
n_jobs=4,
)
pfi.fit(X_train, y_train)
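The recurring pattern in this diff is to expand one master seed into independent child seeds with scikit-learn's `check_random_state`, then hand each stochastic component (the CV inside `RidgeCV`, the outer fold split, the permutation scheme) its own child seed. A minimal sketch of that pattern; the names `inner_cv` and `outer_cv` are illustrative and not part of the PR:

```python
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
from sklearn.utils import check_random_state

# One master seed, expanded into independent child seeds (as in the example above).
seeds = check_random_state(42).randint(1, np.iinfo(np.int32).max, 7)

# Each stochastic component consumes its own child seed, so changing how one
# component draws random numbers does not shift the stream seen by the others.
inner_cv = KFold(shuffle=True, random_state=seeds[0])               # CV inside RidgeCV
outer_cv = KFold(n_splits=10, shuffle=True, random_state=seeds[1])  # outer fold split
regressor = RidgeCV(alphas=np.logspace(-3, 3, 10), cv=inner_cv)
```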
19 changes: 15 additions & 4 deletions examples/plot_importance_classification_iris.py
@@ -77,7 +77,9 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No
if vim_name == "CPI":
vim = CPI(
estimator=model_c,
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
imputation_model_continuous=RidgeCV(
alphas=np.logspace(-3, 3, 10), cv=KFold(shuffle=True, random_state=1)
),
n_permutations=50,
random_state=0,
method=method,
@@ -112,10 +114,19 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No
# combination, in parallel.

models = [
LogisticRegressionCV(Cs=np.logspace(-3, 3, 10), tol=1e-3, max_iter=1000),
GridSearchCV(SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}),
LogisticRegressionCV(
Cs=np.logspace(-3, 3, 10),
tol=1e-3,
max_iter=1000,
cv=KFold(shuffle=True, random_state=2),
),
GridSearchCV(
SVC(kernel="rbf"),
{"C": np.logspace(-3, 3, 10)},
cv=KFold(shuffle=True, random_state=3),
),
]
cv = KFold(n_splits=5, shuffle=True, random_state=0)
cv = KFold(n_splits=5, shuffle=True, random_state=4)
groups = {ft: i for i, ft in enumerate(dataset.feature_names)}
out_list = Parallel(n_jobs=5)(
delayed(run_one_fold)(
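These changes rely on a seeded `KFold(shuffle=True, ...)` split being bit-for-bit identical across runs. A quick self-contained check of that property (my sketch, not part of the PR):

```python
import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)

def test_folds():
    # Collect the test indices of each fold under a fixed seed.
    return [test for _, test in KFold(n_splits=5, shuffle=True, random_state=4).split(X)]

# The same seed must yield identical folds on every call.
assert all(np.array_equal(a, b) for a, b in zip(test_folds(), test_folds()))
```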
22 changes: 15 additions & 7 deletions examples/plot_knockoff_aggregation.py
@@ -73,7 +73,8 @@
#######################################################################
# Define the function for running the three procedures on the same data
# ---------------------------------------------------------------------
def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=None):
def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0):
seeds = check_random_state(seed).randint(1, np.iinfo(np.int32).max, 4)
# Generate data
X, y, _, non_zero_index = multivariate_1D_simulation_AR(
n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr
@@ -85,10 +86,10 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see
y,
estimator=LassoCV(
n_jobs=1,
cv=KFold(n_splits=5, shuffle=True, random_state=0),
cv=KFold(n_splits=5, shuffle=True, random_state=seeds[0]),
),
n_bootstraps=1,
random_state=seed,
random_state=seeds[1],
)
mx_selection, _ = model_x_knockoff_pvalue(test_scores, fdr=fdr)
fdp_mx, power_mx = fdp_power(mx_selection, non_zero_index)
@@ -99,11 +100,11 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see
y,
estimator=LassoCV(
n_jobs=1,
cv=KFold(n_splits=5, shuffle=True, random_state=0),
cv=KFold(n_splits=5, shuffle=True, random_state=seeds[2]),
),
n_bootstraps=n_bootstraps,
n_jobs=1,
random_state=seed,
random_state=seeds[3],
)

# Use p-values aggregation [2]
@@ -131,7 +132,7 @@ def plot_results(bounds, fdr, n_samples, n_features, power=False):
for nb in range(len(bounds)):
for i in range(len(bounds[nb])):
y = bounds[nb][i]
x = np.random.normal(nb + 1, 0.05)
x = rng.normal(nb + 1, 0.05)
plt.scatter(x, y, alpha=0.65, c="blue")

plt.boxplot(bounds, sym="")
@@ -165,7 +166,14 @@ def effect_number_samples(n_samples):
parallel = Parallel(n_jobs, verbose=joblib_verbose)
results = parallel(
delayed(single_run)(
n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=seed
n_samples,
n_features,
rho,
sparsity,
snr,
fdr,
n_bootstraps,
seed=seed,
)
for seed in seed_list
)
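The `plot_results` change swaps the process-global `np.random.normal` for a call on an explicit `rng` (presumably a module-level `RandomState` defined elsewhere in the example): with a shared global stream, any extra draw anywhere in the process silently shifts the jitter positions. A minimal sketch of the idea:

```python
import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(0)  # explicit, local source of randomness
# was: x = np.random.normal(nb + 1, 0.05) -- depends on hidden global state
x = rng.normal(loc=1.0, scale=0.05)  # reproducible given the seed above
```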
32 changes: 22 additions & 10 deletions examples/plot_pitfalls_permutation_importance.py
@@ -24,11 +24,13 @@
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state

from hidimstat import CPI, PFI
from hidimstat.conditional_sampling import ConditionalSampler

rng = np.random.RandomState(0)
rng = check_random_state(42)
seeds = rng.randint(1, np.iinfo(np.int32).max, 9)

#############################################################################
# Load the California housing dataset and add a spurious feature
@@ -40,7 +42,9 @@
dataset = fetch_california_housing()
X_, y_ = dataset.data, dataset.target
# only use 2/3 of samples to speed up the example
X, _, y, _ = train_test_split(X_, y_, test_size=0.6667, random_state=0, shuffle=True)
X, _, y, _ = train_test_split(
X_, y_, test_size=0.6667, random_state=seeds[0], shuffle=True
)

redundant_coef = rng.choice(np.arange(X.shape[1]), size=(3,), replace=False)
X_spurious = X[:, redundant_coef].sum(axis=1)
@@ -85,7 +89,7 @@
regressor=make_pipeline(
StandardScaler(),
MLPRegressor(
random_state=0,
random_state=seeds[1],
hidden_layer_sizes=(32, 16, 8),
early_stopping=True,
learning_rate_init=0.01,
Expand All @@ -96,7 +100,7 @@
)


kf = KFold(n_splits=5, shuffle=True, random_state=0)
kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2])
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
@@ -118,6 +122,7 @@
# testing conditional importance, as it identifies the spurious feature as important.
permutation_importances = []
conditional_permutation_importances = []
kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2])
for i, (train_index, test_index) in enumerate(kf.split(X)):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
@@ -128,7 +133,8 @@
pfi = PFI(
model_c,
n_permutations=50,
random_state=0,
n_jobs=5,
random_state=seeds[3],
)
pfi.fit(X_test, y_test)

@@ -185,6 +191,7 @@
# explained by the other features unchanged. This method is valid for testing conditional
# importance. As shown below, it does not identify the spurious feature as important.
conditional_importances = []
kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2])
for i, (train_index, test_index) in enumerate(kf.split(X)):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
@@ -194,8 +201,11 @@
# Compute conditional permutation feature importance
cpi = CPI(
model_c,
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 5)),
random_state=0,
imputation_model_continuous=RidgeCV(
alphas=np.logspace(-3, 3, 5),
cv=KFold(shuffle=True, random_state=seeds[4]),
),
random_state=seeds[5],
n_jobs=5,
)
cpi.fit(X_test, y_test)
@@ -251,12 +261,14 @@
X_train, X_test = train_test_split(
X,
test_size=0.3,
random_state=0,
random_state=seeds[6],
)

conditional_sampler = ConditionalSampler(
model_regression=RidgeCV(alphas=np.logspace(-3, 3, 5)),
random_state=0,
model_regression=RidgeCV(
alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=seeds[7])
),
random_state=seeds[8],
)


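This example seeds `rng.choice` so that the "spurious" feature — an exact deterministic function of three existing columns, carrying no information conditional on the others — is built from the same columns on every run. A standalone sketch of that setup; the synthetic `X` is a stand-in for the California housing features:

```python
import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(42)
X = rng.normal(size=(100, 8))  # stand-in for the real feature matrix

# Sum three randomly chosen existing columns: a feature that is marginally
# predictive but conditionally uninformative given the columns it copies.
redundant_coef = rng.choice(np.arange(X.shape[1]), size=(3,), replace=False)
X_spurious = X[:, redundant_coef].sum(axis=1)
X = np.hstack([X, X_spurious[:, np.newaxis]])  # spurious feature appended last
```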
21 changes: 16 additions & 5 deletions src/hidimstat/base_perturbation.py
@@ -49,6 +49,7 @@ def __init__(
self.n_jobs = n_jobs
self.n_permutations = n_permutations
self.n_groups = None
self.random_state = None

def fit(self, X, y=None, groups=None):
"""Base fit method for perturbation-based methods. Identifies the groups.
@@ -105,9 +106,17 @@ def predict(self, X):
X_ = np.asarray(X)

# Parallelize the computation of the importance scores for each group
if self.random_state is None:
list_seed = [None for i in range(self.n_groups)]
else:
list_seed = self.random_state.randint(
1, np.iinfo(np.int32).max, self.n_groups
)
out_list = Parallel(n_jobs=self.n_jobs)(
delayed(self._joblib_predict_one_group)(X_, group_id, group_key)
for group_id, group_key in enumerate(self.groups.keys())
delayed(self._joblib_predict_one_group)(X_, group_id, group_key, seed)
for group_id, (group_key, seed) in enumerate(
zip(self.groups.keys(), list_seed)
)
)
return np.stack(out_list, axis=0)

@@ -168,7 +177,7 @@ def _check_fit(self):
" call fit with groups=None"
)

def _joblib_predict_one_group(self, X, group_id, group_key):
def _joblib_predict_one_group(self, X, group_id, group_key, seed):
"""
Compute the predictions after perturbation of the data for a given
group of variables. This function is parallelized.
@@ -181,14 +190,16 @@ def _joblib_predict_one_group(self, X, group_id, group_key):
The index of the group of variables.
group_key: str, int
The key of the group of variables. (parameter use for debugging)
seed: int, optional
Random seed for reproducibility.
"""
group_ids = self._groups_ids[group_id]
non_group_ids = np.delete(np.arange(X.shape[1]), group_ids)
# Create an array X_perm_j of shape (n_permutations, n_samples, n_features)
# where the j-th group of covariates is permuted
X_perm = np.empty((self.n_permutations, X.shape[0], X.shape[1]))
X_perm[:, :, non_group_ids] = np.delete(X, group_ids, axis=1)
X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id)
X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id, seed=seed)
# Reshape X_perm to allow for batch prediction
X_perm_batch = X_perm.reshape(-1, X.shape[1])
y_pred_perm = getattr(self.estimator, self.method)(X_perm_batch)
@@ -202,6 +213,6 @@
)
return y_pred_perm

def _permutation(self, X, group_id):
def _permutation(self, X, group_id, seed):
"""Method for creating the permuted data for the j-th group of covariates."""
raise NotImplementedError
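The `predict` change above spawns one child seed per group from `self.random_state` and ships it into each joblib worker, so per-group permutations are reproducible regardless of backend or scheduling order. A toy sketch of the mechanism — this is not hidimstat's actual class, and the concrete `_permutation` behavior lives in the subclasses:

```python
import numpy as np

class ToyPerturbation:
    """Toy illustration of per-group seed spawning; not the hidimstat class."""

    def __init__(self, n_permutations=50, random_state=None):
        self.n_permutations = n_permutations
        self.random_state = (
            None if random_state is None else np.random.RandomState(random_state)
        )

    def _spawn_seeds(self, n_groups):
        # Mirrors the predict() change: None propagates non-determinism,
        # otherwise draw one independent child seed per group.
        if self.random_state is None:
            return [None] * n_groups
        return self.random_state.randint(1, np.iinfo(np.int32).max, n_groups)

    def _permutation(self, X, group_ids, seed):
        # Each worker rebuilds its own RandomState from the child seed, so the
        # output is identical for any joblib backend or execution order.
        rng = np.random.RandomState(seed)
        X_group = np.stack([X[:, group_ids]] * self.n_permutations)
        for k in range(self.n_permutations):
            rng.shuffle(X_group[k])  # permute rows of the k-th copy in place
        return X_group
```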