34 changes: 34 additions & 0 deletions .circleci/config.yml
@@ -66,6 +66,40 @@ jobs:
paths:
- /home/circleci/nilearn_data
- /home/circleci/sklearn_data

test_example_same:
docker:
- image: cimg/python:3.13.2
environment:
- PYTHON_VERSION: "3.13"
steps:
- checkout
- run:
name:
'Check out the PR merge commit'
command:
bash ./tools/documentation/circleci/checkout_merge_commit.sh
- restore_cache:
key: saved-cache
- run:
name: "Create the environment for building the documentation"
command:
bash ./tools/documentation/circleci/setup_virtual_environment.sh
- run:
name: "Build documentation"
command:
bash ./tools/examples/circleci/test_example.sh
no_output_timeout: 40m
# store the comparison results so they can be inspected from the PR
- store_artifacts:
path: tools/examples/circleci/result_test
destination: compare_example
- save_cache:
# cache the downloaded example datasets
key: saved-cache
paths:
- /home/circleci/nilearn_data
- /home/circleci/sklearn_data

get-tests-reports:
docker:
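A side note on the caching in this job: save_cache and restore_cache share the static key saved-cache, so the dataset cache is reused indefinitely and never invalidated. If invalidation ever becomes necessary, CircleCI supports checksum-based keys; a minimal sketch (the v1-example-data prefix and keying on pyproject.toml are hypothetical choices, not part of this PR):

    - restore_cache:
        keys:
          - v1-example-data-{{ checksum "pyproject.toml" }}
          - v1-example-data-  # fall back to the most recent partial match
    # ... run steps ...
    - save_cache:
        key: v1-example-data-{{ checksum "pyproject.toml" }}
        paths:
          - /home/circleci/nilearn_data
          - /home/circleci/sklearn_data

The bare trailing prefix lets a job whose checksum has changed still restore the latest cache before saving a fresh one.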
1 change: 1 addition & 0 deletions .gitignore
@@ -15,6 +15,7 @@ __pycache__
docs/_build/
docs/src/generated/
docs/src/ssg_execution_times.rst
tools/examples/circleci/baseline_images/*png

# file generated by hatch
src/hidimstat/_version.py
51 changes: 21 additions & 30 deletions examples/plot_diabetes_variable_importance_example.py
@@ -54,9 +54,12 @@
from sklearn.linear_model import LogisticRegressionCV, RidgeCV
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import KFold
from sklearn.utils import check_random_state

from hidimstat import CPI, LOCO, PFI

seeds = check_random_state(42).randint(1, np.iinfo(np.int32).max, 7)

#############################################################################
# Load the diabetes dataset
# -------------------------
@@ -71,30 +74,12 @@
# diabetes dataset.

n_folds = 5
regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
regressor_list = [clone(regressor) for _ in range(n_folds)]
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
for i, (train_index, test_index) in enumerate(kf.split(X)):
Collaborator: Why are you removing that?
Collaborator (author): I removed it because it duplicated the next line; I don't know why the duplicate was there.
regressor_list[i].fit(X[train_index], y[train_index])
score = r2_score(
y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
)
mse = root_mean_squared_error(
y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
)

print(f"Fold {i}: {score}")
print(f"Fold {i}: {mse}")
#############################################################################
# Fit a baseline model on the diabetes dataset
# --------------------------------------------
# We use a Ridge regression model with 10-fold cross-validation to fit the
# diabetes dataset.

n_folds = 10
regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
regressor = RidgeCV(
alphas=np.logspace(-3, 3, 10),
cv=KFold(shuffle=True, random_state=seeds[0]),
)
regressor_list = [clone(regressor) for _ in range(n_folds)]
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
regressor_list[i].fit(X[train_index], y[train_index])
score = r2_score(
@@ -112,17 +97,23 @@
# --------------------------------------------------------

cpi_importance_list = []
kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
print(f"Fold {i}")
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
cpi = CPI(
estimator=regressor_list[i],
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
imputation_model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)),
# covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
imputation_model_continuous=RidgeCV(
alphas=np.logspace(-3, 3, 10),
cv=KFold(shuffle=True, random_state=seeds[3]),
),
imputation_model_categorical=LogisticRegressionCV(
Cs=np.logspace(-2, 2, 10),
cv=KFold(shuffle=True, random_state=seeds[4]),
),
n_permutations=50,
random_state=0,
random_state=seeds[5],
n_jobs=4,
)
cpi.fit(X_train, y_train)
@@ -134,7 +125,7 @@
# ---------------------------------------------------------

loco_importance_list = []

kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
print(f"Fold {i}")
X_train, X_test = X[train_index], X[test_index]
@@ -153,15 +144,15 @@
# ----------------------------------------------------------------

pfi_importance_list = []

kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1])
for i, (train_index, test_index) in enumerate(kf.split(X)):
print(f"Fold {i}")
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
pfi = PFI(
estimator=regressor_list[i],
n_permutations=50,
random_state=0,
random_state=seeds[6],
n_jobs=4,
)
pfi.fit(X_train, y_train)
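A note on the pattern introduced throughout this example: a single master check_random_state call fans out into one integer seed per consumer, so every CV splitter, imputation model, and permutation test runs on its own reproducible stream. A self-contained sketch of the idea (the names and the number of seeds are illustrative):

    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.utils import check_random_state

    MASTER_SEED = 42
    # Draw independent 32-bit seeds from one master generator; each
    # consumer of randomness gets its own entry.
    seeds = check_random_state(MASTER_SEED).randint(1, np.iinfo(np.int32).max, 3)
    kf = KFold(n_splits=5, shuffle=True, random_state=seeds[0])  # pinned folds

Re-running the script replays exactly the same folds and permutations, while the distinct sub-seeds keep the different sources of randomness uncorrelated.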
19 changes: 15 additions & 4 deletions examples/plot_importance_classification_iris.py
@@ -77,7 +77,9 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No
if vim_name == "CPI":
vim = CPI(
estimator=model_c,
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)),
imputation_model_continuous=RidgeCV(
alphas=np.logspace(-3, 3, 10), cv=KFold(shuffle=True, random_state=1)
),
n_permutations=50,
random_state=0,
method=method,
@@ -112,10 +114,19 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No
# combination, in parallel.

models = [
LogisticRegressionCV(Cs=np.logspace(-3, 3, 10), tol=1e-3, max_iter=1000),
GridSearchCV(SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}),
LogisticRegressionCV(
Cs=np.logspace(-3, 3, 10),
tol=1e-3,
max_iter=1000,
cv=KFold(shuffle=True, random_state=2),
),
GridSearchCV(
SVC(kernel="rbf"),
{"C": np.logspace(-3, 3, 10)},
cv=KFold(shuffle=True, random_state=3),
),
]
cv = KFold(n_splits=5, shuffle=True, random_state=0)
cv = KFold(n_splits=5, shuffle=True, random_state=4)
groups = {ft: i for i, ft in enumerate(dataset.feature_names)}
out_list = Parallel(n_jobs=5)(
delayed(run_one_fold)(
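The explicit cv=KFold(shuffle=True, random_state=...) arguments matter because LogisticRegressionCV and GridSearchCV otherwise fall back to unshuffled (stratified) k-fold: deterministic, but sensitive to the ordering of the data. Shuffling with a fixed seed gives order-independent yet reproducible splits. A minimal sketch on synthetic data (make_classification stands in for the iris data of the example):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.model_selection import KFold

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    clf = LogisticRegressionCV(
        Cs=np.logspace(-3, 3, 10),
        cv=KFold(n_splits=5, shuffle=True, random_state=2),  # seeded, shuffled folds
        max_iter=1000,
    ).fit(X, y)
    print(clf.C_)  # the selected regularization is identical on every run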
22 changes: 15 additions & 7 deletions examples/plot_knockoff_aggregation.py
@@ -73,7 +73,8 @@
#######################################################################
# Define the function for running the three procedures on the same data
# ---------------------------------------------------------------------
def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=None):
def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0):
seeds = check_random_state(seed).randint(1, np.iinfo(np.int32).max, 4)
# Generate data
X, y, _, non_zero_index = multivariate_1D_simulation_AR(
n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr
@@ -85,10 +86,10 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see
y,
estimator=LassoCV(
n_jobs=1,
cv=KFold(n_splits=5, shuffle=True, random_state=0),
cv=KFold(n_splits=5, shuffle=True, random_state=seeds[0]),
),
n_bootstraps=1,
random_state=seed,
random_state=seeds[1],
)
mx_selection, _ = model_x_knockoff_pvalue(test_scores, fdr=fdr)
fdp_mx, power_mx = fdp_power(mx_selection, non_zero_index)
@@ -99,11 +100,11 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see
y,
estimator=LassoCV(
n_jobs=1,
cv=KFold(n_splits=5, shuffle=True, random_state=0),
cv=KFold(n_splits=5, shuffle=True, random_state=seeds[2]),
),
n_bootstraps=n_bootstraps,
n_jobs=1,
random_state=seed,
random_state=seeds[3],
)

# Use p-values aggregation [2]
@@ -131,7 +132,7 @@ def plot_results(bounds, fdr, n_samples, n_features, power=False):
for nb in range(len(bounds)):
for i in range(len(bounds[nb])):
y = bounds[nb][i]
x = np.random.normal(nb + 1, 0.05)
x = rng.normal(nb + 1, 0.05)
plt.scatter(x, y, alpha=0.65, c="blue")

plt.boxplot(bounds, sym="")
@@ -165,7 +166,14 @@ def effect_number_samples(n_samples):
parallel = Parallel(n_jobs, verbose=joblib_verbose)
results = parallel(
delayed(single_run)(
n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=seed
n_samples,
n_features,
rho,
sparsity,
snr,
fdr,
n_bootstraps,
seed=seed,
)
for seed in seed_list
)
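In this file the fan-out happens per call: single_run derives fresh sub-seeds from its seed argument, so each entry of seed_list yields a different simulation while any individual run replays exactly. A condensed sketch of that contract (the body is elided down to the seeding logic):

    import numpy as np
    from sklearn.utils import check_random_state

    def single_run_sketch(seed=0):
        # Everything random below is a pure function of `seed`: the data
        # generator and both knockoff estimators receive disjoint sub-seeds.
        return check_random_state(seed).randint(1, np.iinfo(np.int32).max, 4)

    # Different seeds give different runs; the same seed replays identically.
    assert (single_run_sketch(3) == single_run_sketch(3)).all()
    assert (single_run_sketch(3) != single_run_sketch(4)).any()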
32 changes: 22 additions & 10 deletions examples/plot_pitfalls_permutation_importance.py
@@ -24,11 +24,13 @@
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state

from hidimstat import CPI, PFI
from hidimstat.conditional_sampling import ConditionalSampler

rng = np.random.RandomState(0)
rng = check_random_state(42)
seeds = rng.randint(1, np.iinfo(np.int32).max, 9)

#############################################################################
# Load the California housing dataset and add a spurious feature
@@ -40,7 +42,9 @@
dataset = fetch_california_housing()
X_, y_ = dataset.data, dataset.target
# only use 2/3 of samples to speed up the example
X, _, y, _ = train_test_split(X_, y_, test_size=0.6667, random_state=0, shuffle=True)
X, _, y, _ = train_test_split(
X_, y_, test_size=0.6667, random_state=seeds[0], shuffle=True
)

redundant_coef = rng.choice(np.arange(X.shape[1]), size=(3,), replace=False)
X_spurious = X[:, redundant_coef].sum(axis=1)
@@ -85,7 +89,7 @@
regressor=make_pipeline(
StandardScaler(),
MLPRegressor(
random_state=0,
random_state=seeds[1],
hidden_layer_sizes=(32, 16, 8),
early_stopping=True,
learning_rate_init=0.01,
Expand All @@ -96,7 +100,7 @@
)


kf = KFold(n_splits=5, shuffle=True, random_state=0)
kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2])
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
@@ -118,6 +122,7 @@
# testing conditional importance, as it identifies the spurious feature as important.
permutation_importances = []
conditional_permutation_importances = []
kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2])
for i, (train_index, test_index) in enumerate(kf.split(X)):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
@@ -128,7 +133,8 @@
pfi = PFI(
model_c,
n_permutations=50,
random_state=0,
n_jobs=5,
random_state=seeds[3],
)
pfi.fit(X_test, y_test)

@@ -185,6 +191,7 @@
# explained by the other features unchanged. This method is valid for testing conditional
# importance. As shown below, it does not identify the spurious feature as important.
conditional_importances = []
kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2])
for i, (train_index, test_index) in enumerate(kf.split(X)):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
@@ -194,8 +201,11 @@
# Compute conditional permutation feature importance
cpi = CPI(
model_c,
imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 5)),
random_state=0,
imputation_model_continuous=RidgeCV(
alphas=np.logspace(-3, 3, 5),
cv=KFold(shuffle=True, random_state=seeds[4]),
),
random_state=seeds[5],
n_jobs=5,
)
cpi.fit(X_test, y_test)
@@ -251,12 +261,14 @@
X_train, X_test = train_test_split(
X,
test_size=0.3,
random_state=0,
random_state=seeds[6],
)

conditional_sampler = ConditionalSampler(
model_regression=RidgeCV(alphas=np.logspace(-3, 3, 5)),
random_state=0,
model_regression=RidgeCV(
alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=seeds[7])
),
random_state=seeds[8],
)


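One detail worth noting in this file: kf is rebuilt before each of the three importance loops with the same seeds[2], which works because two KFold instances constructed with identical parameters generate identical folds. That keeps PFI, the conditional-permutation check, and CPI aligned on the same splits. A quick sketch of that guarantee:

    import numpy as np
    from sklearn.model_selection import KFold

    X = np.arange(20).reshape(10, 2)
    a = list(KFold(n_splits=5, shuffle=True, random_state=7).split(X))
    b = list(KFold(n_splits=5, shuffle=True, random_state=7).split(X))
    # Re-instantiating with the same random_state reproduces the folds exactly.
    assert all(
        (tr_a == tr_b).all() and (te_a == te_b).all()
        for (tr_a, te_a), (tr_b, te_b) in zip(a, b)
    )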