diff --git a/.circleci/config.yml b/.circleci/config.yml index faad1bcc3..eb457e00a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -66,6 +66,40 @@ jobs: paths: - /home/circleci/nilearn_data - /home/circleci/sklearn_data + + test_example_same: + docker: + - image: cimg/python:3.13.2 + environment: + - PYTHON_VERSION: "3.13" + steps: + - checkout + - run: + name: + 'Checkout to PR commit' + command: + bash ./tools/documentation/circleci/checkout_merge_commit.sh + - restore_cache: + key: saved-cache + - run: + name: "Create the environment for building the documentation" + command: + bash ./tools/documentation/circleci/setup_virtual_environment.sh + - run: + name: "Build documentation" + command: + bash ./tools/examples/circleci/test_example.sh + no_output_timeout: 40m + # store the comparison for see it in a PR + - store_artifacts: + path: tools/examples/circleci/result_test + destination: compare_example + - save_cache: + # cache some library + key: saved-cache + paths: + - /home/circleci/nilearn_data + - /home/circleci/sklearn_data get-tests-reports: docker: diff --git a/.gitignore b/.gitignore index 0ac771869..fdac7c15f 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ __pycache__ docs/_build/ docs/src/generated/ docs/src/ssg_execution_times.rst +tools/examples/circleci/baseline_images/*png # file generated by hatch src/hidimstat/_version.py diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index c9c1d0a6d..53f7407fa 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -54,9 +54,12 @@ from sklearn.linear_model import LogisticRegressionCV, RidgeCV from sklearn.metrics import r2_score, root_mean_squared_error from sklearn.model_selection import KFold +from sklearn.utils import check_random_state from hidimstat import CPI, LOCO, PFI +seeds = check_random_state(42).randint(1, 
np.iinfo(np.int32).max, 7) + ############################################################################# # Load the diabetes dataset # ------------------------- @@ -71,30 +74,12 @@ # diabetes dataset. n_folds = 5 -regressor = RidgeCV(alphas=np.logspace(-3, 3, 10)) -regressor_list = [clone(regressor) for _ in range(n_folds)] -kf = KFold(n_splits=n_folds, shuffle=True, random_state=0) -for i, (train_index, test_index) in enumerate(kf.split(X)): - regressor_list[i].fit(X[train_index], y[train_index]) - score = r2_score( - y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index]) - ) - mse = root_mean_squared_error( - y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index]) - ) - - print(f"Fold {i}: {score}") - print(f"Fold {i}: {mse}") -############################################################################# -# Fit a baselien model on the diabetes dataset -# -------------------------------------------- -# We use a Ridge regression model with a 10-fold cross-validation to fit the -# diabetes dataset. 
- -n_folds = 10 -regressor = RidgeCV(alphas=np.logspace(-3, 3, 10)) +regressor = RidgeCV( + alphas=np.logspace(-3, 3, 10), + cv=KFold(shuffle=True, random_state=seeds[0]), +) regressor_list = [clone(regressor) for _ in range(n_folds)] -kf = KFold(n_splits=n_folds, shuffle=True, random_state=0) +kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) for i, (train_index, test_index) in enumerate(kf.split(X)): regressor_list[i].fit(X[train_index], y[train_index]) score = r2_score( @@ -112,17 +97,23 @@ # -------------------------------------------------------- cpi_importance_list = [] +kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) for i, (train_index, test_index) in enumerate(kf.split(X)): print(f"Fold {i}") X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] cpi = CPI( estimator=regressor_list[i], - imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)), - imputation_model_categorical=LogisticRegressionCV(Cs=np.logspace(-2, 2, 10)), - # covariate_estimator=HistGradientBoostingRegressor(random_state=0,), + imputation_model_continuous=RidgeCV( + alphas=np.logspace(-3, 3, 10), + cv=KFold(shuffle=True, random_state=seeds[3]), + ), + imputation_model_categorical=LogisticRegressionCV( + Cs=np.logspace(-2, 2, 10), + cv=KFold(shuffle=True, random_state=seeds[4]), + ), n_permutations=50, - random_state=0, + random_state=seeds[5], n_jobs=4, ) cpi.fit(X_train, y_train) @@ -134,7 +125,7 @@ # --------------------------------------------------------- loco_importance_list = [] - +kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) for i, (train_index, test_index) in enumerate(kf.split(X)): print(f"Fold {i}") X_train, X_test = X[train_index], X[test_index] @@ -153,7 +144,7 @@ # ---------------------------------------------------------------- pfi_importance_list = [] - +kf = KFold(n_splits=n_folds, shuffle=True, random_state=seeds[1]) for i, (train_index, test_index) in 
enumerate(kf.split(X)): print(f"Fold {i}") X_train, X_test = X[train_index], X[test_index] @@ -161,7 +152,7 @@ pfi = PFI( estimator=regressor_list[i], n_permutations=50, - random_state=0, + random_state=seeds[6], n_jobs=4, ) pfi.fit(X_train, y_train) diff --git a/examples/plot_importance_classification_iris.py b/examples/plot_importance_classification_iris.py index 6216ce044..a3c166a92 100644 --- a/examples/plot_importance_classification_iris.py +++ b/examples/plot_importance_classification_iris.py @@ -77,7 +77,9 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No if vim_name == "CPI": vim = CPI( estimator=model_c, - imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 10)), + imputation_model_continuous=RidgeCV( + alphas=np.logspace(-3, 3, 10), cv=KFold(shuffle=True, random_state=1) + ), n_permutations=50, random_state=0, method=method, @@ -112,10 +114,19 @@ def run_one_fold(X, y, model, train_index, test_index, vim_name="CPI", groups=No # combination, in parallel. 
models = [ - LogisticRegressionCV(Cs=np.logspace(-3, 3, 10), tol=1e-3, max_iter=1000), - GridSearchCV(SVC(kernel="rbf"), {"C": np.logspace(-3, 3, 10)}), + LogisticRegressionCV( + Cs=np.logspace(-3, 3, 10), + tol=1e-3, + max_iter=1000, + cv=KFold(shuffle=True, random_state=2), + ), + GridSearchCV( + SVC(kernel="rbf"), + {"C": np.logspace(-3, 3, 10)}, + cv=KFold(shuffle=True, random_state=3), + ), ] -cv = KFold(n_splits=5, shuffle=True, random_state=0) +cv = KFold(n_splits=5, shuffle=True, random_state=4) groups = {ft: i for i, ft in enumerate(dataset.feature_names)} out_list = Parallel(n_jobs=5)( delayed(run_one_fold)( diff --git a/examples/plot_knockoff_aggregation.py b/examples/plot_knockoff_aggregation.py index b4ba66af5..14ea794a7 100644 --- a/examples/plot_knockoff_aggregation.py +++ b/examples/plot_knockoff_aggregation.py @@ -73,7 +73,8 @@ ####################################################################### # Define the function for running the three procedures on the same data # --------------------------------------------------------------------- -def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=None): +def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=0): + seeds = check_random_state(seed).randint(1, np.iinfo(np.int32).max, 4) # Generate data X, y, _, non_zero_index = multivariate_1D_simulation_AR( n_samples, n_features, rho=rho, sparsity=sparsity, seed=seed, snr=snr @@ -85,10 +86,10 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, see y, estimator=LassoCV( n_jobs=1, - cv=KFold(n_splits=5, shuffle=True, random_state=0), + cv=KFold(n_splits=5, shuffle=True, random_state=seeds[0]), ), n_bootstraps=1, - random_state=seed, + random_state=seeds[1], ) mx_selection, _ = model_x_knockoff_pvalue(test_scores, fdr=fdr) fdp_mx, power_mx = fdp_power(mx_selection, non_zero_index) @@ -99,11 +100,11 @@ def single_run(n_samples, n_features, rho, sparsity, snr, fdr, 
n_bootstraps, see y, estimator=LassoCV( n_jobs=1, - cv=KFold(n_splits=5, shuffle=True, random_state=0), + cv=KFold(n_splits=5, shuffle=True, random_state=seeds[2]), ), n_bootstraps=n_bootstraps, n_jobs=1, - random_state=seed, + random_state=seeds[3], ) # Use p-values aggregation [2] @@ -131,7 +132,7 @@ def plot_results(bounds, fdr, n_samples, n_features, power=False): for nb in range(len(bounds)): for i in range(len(bounds[nb])): y = bounds[nb][i] - x = np.random.normal(nb + 1, 0.05) + x = rng.normal(nb + 1, 0.05) plt.scatter(x, y, alpha=0.65, c="blue") plt.boxplot(bounds, sym="") @@ -165,7 +166,14 @@ def effect_number_samples(n_samples): parallel = Parallel(n_jobs, verbose=joblib_verbose) results = parallel( delayed(single_run)( - n_samples, n_features, rho, sparsity, snr, fdr, n_bootstraps, seed=seed + n_samples, + n_features, + rho, + sparsity, + snr, + fdr, + n_bootstraps, + seed=seed, ) for seed in seed_list ) diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index e456349be..2592345c4 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -24,11 +24,13 @@ from sklearn.neural_network import MLPRegressor from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler +from sklearn.utils import check_random_state from hidimstat import CPI, PFI from hidimstat.conditional_sampling import ConditionalSampler -rng = np.random.RandomState(0) +rng = check_random_state(42) +seeds = rng.randint(1, np.iinfo(np.int32).max, 9) ############################################################################# # Load the California housing dataset and add a spurious feature @@ -40,7 +42,9 @@ dataset = fetch_california_housing() X_, y_ = dataset.data, dataset.target # only use 2/3 of samples to speed up the example -X, _, y, _ = train_test_split(X_, y_, test_size=0.6667, random_state=0, shuffle=True) +X, _, y, _ = 
train_test_split( + X_, y_, test_size=0.6667, random_state=seeds[0], shuffle=True +) redundant_coef = rng.choice(np.arange(X.shape[1]), size=(3,), replace=False) X_spurious = X[:, redundant_coef].sum(axis=1) @@ -85,7 +89,7 @@ regressor=make_pipeline( StandardScaler(), MLPRegressor( - random_state=0, + random_state=seeds[1], hidden_layer_sizes=(32, 16, 8), early_stopping=True, learning_rate_init=0.01, @@ -96,7 +100,7 @@ ) -kf = KFold(n_splits=5, shuffle=True, random_state=0) +kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) for train_index, test_index in kf.split(X): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -118,6 +122,7 @@ # testing conditional importance, as it identifies the spurious feature as important. permutation_importances = [] conditional_permutation_importances = [] +kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) for i, (train_index, test_index) in enumerate(kf.split(X)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -128,7 +133,8 @@ pfi = PFI( model_c, n_permutations=50, - random_state=0, + n_jobs=5, + random_state=seeds[3], ) pfi.fit(X_test, y_test) @@ -185,6 +191,7 @@ # explained by the other features unchanged. This method is valid for testing conditional # importance. As shown below, it does not identify the spurious feature as important. 
conditional_importances = [] +kf = KFold(n_splits=5, shuffle=True, random_state=seeds[2]) for i, (train_index, test_index) in enumerate(kf.split(X)): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] @@ -194,8 +201,11 @@ # Compute conditional permutation feature importance cpi = CPI( model_c, - imputation_model_continuous=RidgeCV(alphas=np.logspace(-3, 3, 5)), - random_state=0, + imputation_model_continuous=RidgeCV( + alphas=np.logspace(-3, 3, 5), + cv=KFold(shuffle=True, random_state=seeds[4]), + ), + random_state=seeds[5], n_jobs=5, ) cpi.fit(X_test, y_test) @@ -251,12 +261,14 @@ X_train, X_test = train_test_split( X, test_size=0.3, - random_state=0, + random_state=seeds[6], ) conditional_sampler = ConditionalSampler( - model_regression=RidgeCV(alphas=np.logspace(-3, 3, 5)), - random_state=0, + model_regression=RidgeCV( + alphas=np.logspace(-3, 3, 5), cv=KFold(shuffle=True, random_state=seeds[7]) + ), + random_state=seeds[8], ) diff --git a/src/hidimstat/base_perturbation.py b/src/hidimstat/base_perturbation.py index 437e1aeee..5332fa07a 100644 --- a/src/hidimstat/base_perturbation.py +++ b/src/hidimstat/base_perturbation.py @@ -49,6 +49,7 @@ def __init__( self.n_jobs = n_jobs self.n_permutations = n_permutations self.n_groups = None + self.random_state = None def fit(self, X, y=None, groups=None): """Base fit method for perturbation-based methods. Identifies the groups. 
@@ -105,9 +106,17 @@ def predict(self, X): X_ = np.asarray(X) # Parallelize the computation of the importance scores for each group + if self.random_state is None: + list_seed = [None for i in range(self.n_groups)] + else: + list_seed = self.random_state.randint( + 1, np.iinfo(np.int32).max, self.n_groups + ) out_list = Parallel(n_jobs=self.n_jobs)( - delayed(self._joblib_predict_one_group)(X_, group_id, group_key) - for group_id, group_key in enumerate(self.groups.keys()) + delayed(self._joblib_predict_one_group)(X_, group_id, group_key, seed) + for group_id, (group_key, seed) in enumerate( + zip(self.groups.keys(), list_seed) + ) ) return np.stack(out_list, axis=0) @@ -168,7 +177,7 @@ def _check_fit(self): " call fit with groups=None" ) - def _joblib_predict_one_group(self, X, group_id, group_key): + def _joblib_predict_one_group(self, X, group_id, group_key, seed): """ Compute the predictions after perturbation of the data for a given group of variables. This function is parallelized. @@ -181,6 +190,8 @@ def _joblib_predict_one_group(self, X, group_id, group_key): The index of the group of variables. group_key: str, int The key of the group of variables. (parameter use for debugging) + seed: int, optional + Random seed for reproducibility. 
""" group_ids = self._groups_ids[group_id] non_group_ids = np.delete(np.arange(X.shape[1]), group_ids) @@ -188,7 +199,7 @@ def _joblib_predict_one_group(self, X, group_id, group_key): # where the j-th group of covariates is permuted X_perm = np.empty((self.n_permutations, X.shape[0], X.shape[1])) X_perm[:, :, non_group_ids] = np.delete(X, group_ids, axis=1) - X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id) + X_perm[:, :, group_ids] = self._permutation(X, group_id=group_id, seed=seed) # Reshape X_perm to allow for batch prediction X_perm_batch = X_perm.reshape(-1, X.shape[1]) y_pred_perm = getattr(self.estimator, self.method)(X_perm_batch) @@ -202,6 +213,6 @@ def _joblib_predict_one_group(self, X, group_id, group_key): ) return y_pred_perm - def _permutation(self, X, group_id): + def _permutation(self, X, group_id, seed): """Method for creating the permuted data for the j-th group of covariates.""" raise NotImplementedError diff --git a/src/hidimstat/conditional_permutation_importance.py b/src/hidimstat/conditional_permutation_importance.py index e0a71e064..746ef873d 100644 --- a/src/hidimstat/conditional_permutation_importance.py +++ b/src/hidimstat/conditional_permutation_importance.py @@ -72,7 +72,7 @@ def __init__( self.categorical_max_cardinality = categorical_max_cardinality self.imputation_model_categorical = imputation_model_categorical self.imputation_model_continuous = imputation_model_continuous - self.random_state = random_state + self.random_state = check_random_state(random_state) def fit(self, X, y=None, groups=None, var_type="auto"): """Fit the imputation models. @@ -96,7 +96,6 @@ def fit(self, X, y=None, groups=None, var_type="auto"): self : object Returns the instance itself. 
""" - self.random_state = check_random_state(self.random_state) super().fit(X, None, groups=groups) if isinstance(var_type, str): self.var_type = [var_type for _ in range(self.n_groups)] @@ -105,7 +104,7 @@ def fit(self, X, y=None, groups=None, var_type="auto"): self._list_imputation_models = [ ConditionalSampler( - data_type=self.var_type[groupd_id], + data_type=self.var_type[group_id], model_regression=( None if self.imputation_model_continuous is None @@ -116,10 +115,13 @@ def fit(self, X, y=None, groups=None, var_type="auto"): if self.imputation_model_categorical is None else clone(self.imputation_model_categorical) ), - random_state=self.random_state, + random_state=seed, categorical_max_cardinality=self.categorical_max_cardinality, ) - for groupd_id in range(self.n_groups) + for group_id, seed in zip( + range(self.n_groups), + self.random_state.randint(0, np.iinfo(np.int32).max, self.n_groups), + ) ] # Parallelize the fitting of the covariate estimators @@ -149,7 +151,7 @@ def _check_fit(self): for m in self._list_imputation_models: check_is_fitted(m.model) - def _permutation(self, X, group_id): + def _permutation(self, X, group_id, seed): """Sample from the conditional distribution using a permutation of the residuals.""" X_j = X[:, self._groups_ids[group_id]].copy() diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 7dab07aeb..b01605051 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -6,6 +6,7 @@ from sklearn.preprocessing import StandardScaler from sklearn.utils import check_random_state from sklearn.utils.validation import check_memory +from sklearn.base import clone from hidimstat.gaussian_knockoff import ( gaussian_knockoff_generation, @@ -188,13 +189,9 @@ def model_x_knockoff( parallel = Parallel(n_jobs, verbose=joblib_verbose) # get the seed for the different run - if isinstance(random_state, (int, np.int32, np.int64)): - rng = check_random_state(random_state) - elif random_state is None: - rng = 
check_random_state(0) - else: - raise TypeError("Wrong type for random_state") - seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps) + seed_list = check_random_state(random_state).randint( + 0, np.iinfo(np.int32).max, n_bootstraps + ) if centered: X = StandardScaler().fit_transform(X) @@ -225,7 +222,7 @@ def model_x_knockoff( results = parallel( delayed(memory.cache(_stat_coefficient_diff))( - X, X_tildes[i], y, estimator, fdr, preconfigure_estimator + X, X_tildes[i], y, clone(estimator), fdr, preconfigure_estimator ) for i in range(n_bootstraps) ) diff --git a/src/hidimstat/leave_one_covariate_out.py b/src/hidimstat/leave_one_covariate_out.py index f1ea71894..6330e04a9 100644 --- a/src/hidimstat/leave_one_covariate_out.py +++ b/src/hidimstat/leave_one_covariate_out.py @@ -93,7 +93,7 @@ def _joblib_fit_one_group(self, estimator, X, y, key_groups): estimator.fit(X_minus_j, y) return estimator - def _joblib_predict_one_group(self, X, group_id, key_groups): + def _joblib_predict_one_group(self, X, group_id, key_groups, seed): """Predict the target variable after removing a group of covariates. 
Used in parallel.""" X_minus_j = np.delete(X, self._groups_ids[group_id], axis=1) diff --git a/src/hidimstat/permutation_feature_importance.py b/src/hidimstat/permutation_feature_importance.py index 29d007656..485eecd65 100644 --- a/src/hidimstat/permutation_feature_importance.py +++ b/src/hidimstat/permutation_feature_importance.py @@ -53,14 +53,14 @@ def __init__( n_jobs=n_jobs, n_permutations=n_permutations, ) - self.random_state = random_state + self.random_state = check_random_state(random_state) - def _permutation(self, X, group_id): + def _permutation(self, X, group_id, seed): """Create the permuted data for the j-th group of covariates""" - self.random_state = check_random_state(self.random_state) + rgn = np.random.RandomState(seed) X_perm_j = np.array( [ - self.random_state.permutation(X[:, self._groups_ids[group_id]].copy()) + rgn.permutation(X[:, self._groups_ids[group_id]].copy()) for _ in range(self.n_permutations) ] ) diff --git a/test/test_base_perturbation.py b/test/test_base_perturbation.py index dd3ff6d6c..835c3f12e 100644 --- a/test/test_base_perturbation.py +++ b/test/test_base_perturbation.py @@ -11,4 +11,4 @@ def test_no_implemented_methods(): estimator.fit(X[:, 0], X[:, 1]) basic_class = BasePerturbation(estimator=estimator) with pytest.raises(NotImplementedError): - basic_class._permutation(X, group_id=None) + basic_class._permutation(X, group_id=None, seed=0) diff --git a/tools/examples/circleci/baseline_images/.gitkeep b/tools/examples/circleci/baseline_images/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/tools/examples/circleci/test_example.sh b/tools/examples/circleci/test_example.sh new file mode 100755 index 000000000..5cce86158 --- /dev/null +++ b/tools/examples/circleci/test_example.sh @@ -0,0 +1,141 @@ +#!/usr/bin/env bash + +# display all the command and associet argument when they are executed +set -x +# exit immediately if a command exits with a non-zero status. 
+set -e + +# activate the environment for the creation of the documentation +# see the file setup_virtual_environment +source .venv/bin/activate + +# Decide what kind of documentation build to run, and run it. +# +# If the last commit message has a "[doc skip]" marker, do not build +# the doc. On the contrary if a "[doc build]" marker is found, build the doc +# instead of relying on the subsequent rules. +# +# We always build the documentation for jobs that are not related to a specific +# PR (e.g. a merge to master or a maintenance branch). +# +# If this is a PR, do a full build if there are some files in this PR that are +# under the "doc/" or "examples/" folders, otherwise perform a quick build. +# +# If the inspection of the current commit fails for any reason, the default +# behavior is to quick build the documentation. + +# check that the workflow is done on a Pull Request +if [ $(echo $CIRCLE_BRANCH | cut -d'/' -f 1) == 'pull' ] +then CI_PULL_REQUEST="PR" +fi + +get_build_type() { + # Full build if it is not in a PR + if [ -z "$CI_PULL_REQUEST" ] + then + echo BUILD: not a pull request + return + fi + # get the hash of the last commit of the PR + if [ -z "$CIRCLE_SHA1" ] + then + echo SKIP: undefined CIRCLE_SHA1 + return + fi + # get the log of commit and detect marker: [doc skip], [doc quick], [doc changed] + commit_msg=$(git log --format=%B -n 1 $CIRCLE_SHA1) + if [ -z "$commit_msg" ] + then + echo QUICK BUILD: failed to inspect commit $CIRCLE_SHA1 + return + fi + if [[ "$commit_msg" =~ \[doc\ skip\] ]] + then + echo SKIP: [doc skip] marker found + return + fi + if [[ "$commit_msg" =~ \[doc\ quick\] ]] + then + echo QUICK: [doc quick] marker found + return + fi + if [[ "$commit_msg" =~ \[doc\ change\] ]] + then + # get the example difference between main and the actual commit + git_range="origin/main...$CIRCLE_SHA1" + git fetch origin main >&2 || (echo QUICK BUILD: failed to get changed filenames for $git_range; return) + filenames=$(git diff --name-only 
$git_range) + # case where there is no modifed file + if [ -z "$filenames" ] + then + echo QUICK BUILD: no changed filenames for $git_range + return + fi + # get the examples which have been modified + changed_examples=$(echo "$filenames" | grep -e ^examples/) + if [[ -n "$changed_examples" ]] + then + echo BUILD: detected examples/ filename modified in $git_range: $changed_examples + pattern=$(echo "$changed_examples" | paste -sd '|') + # pattern for examples to run is the last line of output + echo "$pattern" + return + fi + # case where there is no modified example + echo QUICK BUILD: no examples/ filename modified in $git_range: + echo "$filenames" + return + fi + echo BUILD: build all the example by default +} + +build_type=$(get_build_type) +# Skip examples +if [[ "$build_type" =~ ^SKIP || "$build_type" =~ ^QUICK || "$build_type" =~ ^'BUILD: detected examples']] +then + exit 0 +fi + +pip install .[test] +# generate all the example if it's push on main or on a previous version +if [[ "$CIRCLE_BRANCH" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]] +then + url="$CIRCLE_BRANCH" +else + url="main" +fi + +# The pipefail is requested to propagate exit code +set -o pipefail && cd tools/examples/circleci && pytest --mpl --mpl-generate-summary=basic-html --mpl-results-path=./result_test +cd - + +set +o pipefail + + +affected_doc_paths() { + # generate a list fo the file modified in the PR + files=$(git diff --name-only origin/main...$CIRCLE_SHA1) + # list of the modified documentation files + echo "$files" | grep ^docs/src/.*\.rst | sed 's/^docs\/src\/\(.*\)\.rst$/\1.html/' + # list of the modified examples + echo "$files" | grep ^examples/.*.py | sed 's/^\(.*\)\.py$/auto_\1.html/' + # list of the modifed source file + project_files=$(echo "$files" | grep 'src/hidimstat/') + if [ -n "$project_files" ] + then + grep -hlR -f<(echo "$project_files" | sed 's/src\/hidimstat\//hidimstat\./') docs/_build/html/generated | cut -d/ -f4- + fi +} + +# generate a html page which list the 
modified files +if [ -n "$CI_PULL_REQUEST" ] +then + echo "The following documentation files may have been changed by PR #$CI_PULL_REQUEST:" + affected=$(affected_doc_paths) + echo "$affected" + ( + echo '
General: Home | API Reference | Examples
' + ) > 'docs/_build/html/_changed.html' +fi diff --git a/tools/examples/circleci/test_replicability_examples.py b/tools/examples/circleci/test_replicability_examples.py new file mode 100644 index 000000000..7ce2df5df --- /dev/null +++ b/tools/examples/circleci/test_replicability_examples.py @@ -0,0 +1,53 @@ +import os + +path_file = os.path.dirname(os.path.abspath(__file__)) + +import pytest +import matplotlib.pyplot as plt +import matplotlib.image as mpimg + +import urllib.request +import json + + +def get_list_of_images(): + name_figure = [] + url = "https://api.github.com/repos/hidimstat/hidimstat.github.io/contents/dev/_images" + headers = {"Accept": "application/vnd.github.v3+json"} + + req = urllib.request.Request(url, headers=headers) + with urllib.request.urlopen(req) as response: + data = json.loads(response.read().decode()) + for item in data: + if not ("thumb" in item["name"]): + name_figure.append(item["name"]) + print(name_figure) + return name_figure + + +@pytest.mark.parametrize( + "name_figure", + get_list_of_images(), +) +@pytest.mark.mpl_image_compare( + style="default", + baseline_dir="baseline_images", + tolerance=5, # tolerance should be reduce to 0 +) +def test_example_figure_generated(name_figure): + # Download the baseline image from the specified URL + baseline_url = ( + "https://github.com/hidimstat/hidimstat.github.io/raw/main/dev/_images/" + + name_figure + ) + baseline_path = os.path.join( + path_file + "/baseline_images/", "test_example_figure_generated_" + name_figure + ) + urllib.request.urlretrieve(baseline_url, baseline_path) + + img = mpimg.imread(path_file + "/../../../docs/_build/html/_images/" + name_figure) + fig = plt.figure(figsize=(img.shape[1] / 100, img.shape[0] / 100), dpi=100) + plt.imshow(img) + plt.axis("off") + plt.subplots_adjust(top=1.0, bottom=0.0, left=0.0, right=1.0) + return fig diff --git a/tools/examples/debugger_script/try_reproducibility.py b/tools/examples/debugger_script/try_reproducibility.py new file 
mode 100644 index 000000000..d30aeeab2 --- /dev/null +++ b/tools/examples/debugger_script/try_reproducibility.py @@ -0,0 +1,17 @@ +import os +import sys + + +from joblib import Parallel, delayed + +# add the examples directory to sys.path so the example modules can be imported; NOTE(review): from tools/examples/debugger_script this appears to resolve to tools/examples, not the top-level examples/ - confirm +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../examples") + + +def run_joblib(i): + import plot_knockoff_aggregation # include the example to test + + +# run the same example several times in parallel to compare the results +parrallel = Parallel(n_jobs=4) +parrallel(delayed(run_joblib)(i) for i in range(6))