From 190300dbfea2ea34caf1598a17e9b08213da2963 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 13:16:37 +0200 Subject: [PATCH 01/44] gaussian distribution reformat --- src/hidimstat/gaussian_knockoff.py | 191 ----------------- .../gaussian_distribution.py | 194 ++++++++++++++++++ .../test_gaussian_distribution.py | 85 ++++++++ 3 files changed, 279 insertions(+), 191 deletions(-) delete mode 100644 src/hidimstat/gaussian_knockoff.py create mode 100644 src/hidimstat/statistical_tools/gaussian_distribution.py create mode 100644 test/statistical_tools/test_gaussian_distribution.py diff --git a/src/hidimstat/gaussian_knockoff.py b/src/hidimstat/gaussian_knockoff.py deleted file mode 100644 index 886312c7e..000000000 --- a/src/hidimstat/gaussian_knockoff.py +++ /dev/null @@ -1,191 +0,0 @@ -import warnings - -import numpy as np - - -def gaussian_knockoff_generation(X, mu, sigma, seed=None, tol=1e-14): - """ - Generate second-order knockoff variables using the equi-correlated method. - - This function generates knockoff variables for a given design matrix X, - using the equi-correlated method described in :footcite:t:`barber2015controlling` - and :footcite:t:`candes2018panning`. The function takes as input the design matrix - X, the vector of empirical mean values mu, and the empirical covariance - matrix sigma. It returns the knockoff variables X_tilde. - - The original implementation can be found at - https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_gaussian.R - - Parameters - ---------- - X: 2D ndarray (n_samples, n_features) - The original design matrix. - - mu : 1D ndarray (n_features, ) - A vector of empirical mean values. - - sigma : 2D ndarray (n_samples, n_features) - The empirical covariance matrix. - - seed : int, optional - A random seed for generating the uniform noise used to create - the knockoff variables. - - tol : float, default=1.e-14 - A tolerance value used for numerical stability in the calculation - of the Cholesky decomposition. - - Returns - ------- - X_tilde : 2D ndarray (n_samples, n_features) - The knockoff variables. - - mu_tilde : 2D ndarray (n_samples, n_features) - The mean matrix used for generating knockoffs. - - sigma_tilde_decompose : 2D ndarray (n_features, n_features) - The Cholesky decomposition of the covariance matrix. - - References - ---------- - .. footbibliography:: - """ - n_samples, n_features = X.shape - - # create a uniform noise for all the data - rng = np.random.RandomState(seed) - u_tilde = rng.randn(n_samples, n_features) - - diag_s = np.diag(_s_equi(sigma, tol=tol)) - - sigma_inv_s = np.linalg.solve(sigma, diag_s) - - # First part on the RHS of equation 1.4 in barber2015controlling - mu_tilde = X - np.dot(X - mu, sigma_inv_s) - # To calculate the Cholesky decomposition later on - sigma_tilde = 2 * diag_s - diag_s.dot(sigma_inv_s) - # test is the matrix is positive definite - while not np.all(np.linalg.eigvalsh(sigma_tilde) > tol): - sigma_tilde += 1e-10 * np.eye(n_features) - warnings.warn( - "The conditional covariance matrix for knockoffs is not positive " - "definite. Adding minor positive value to the matrix.", - UserWarning, - ) - - # Equation 1.4 in barber2015controlling - sigma_tilde_decompose = np.linalg.cholesky(sigma_tilde) - X_tilde = mu_tilde + np.dot(u_tilde, sigma_tilde_decompose) - - return X_tilde, mu_tilde, sigma_tilde_decompose - - -def repeat_gaussian_knockoff_generation(mu_tilde, sigma_tilde_decompose, seed): - """ - Generate additional knockoff variables using pre-computed values. - - This function generates additional knockoff variables using pre-computed - values returned by the gaussian_knockoff_generation function - with repeat=True. It takes as input mu_tilde and sigma_tilde_decompose, - which were returned by gaussian_knockoff_generation, and a random seed. - It returns the new knockoff variables X_tilde. - - Parameters - ---------- - mu_tilde : 2D ndarray (n_samples, n_features) - The matrix of means used to generate the knockoff variables, - returned by gaussian_knockoff_generation. - - sigma_tilde_decompose : 2D ndarray (n_features, n_features) - The Cholesky decomposition of the covariance matrix used - to generate the knockoff variables,returned by - gaussian_knockoff_generation. - - seed : int - A random seed for generating the uniform noise used to create - the knockoff variables. - - Returns - ------- - X_tilde : 2D ndarray (n_samples, n_features) - The knockoff variables. - """ - n_samples, n_features = mu_tilde.shape - - # create a uniform noise for all the data - rng = np.random.RandomState(seed) - u_tilde = rng.randn(n_samples, n_features) - - X_tilde = mu_tilde + np.dot(u_tilde, sigma_tilde_decompose) - return X_tilde - - -def _s_equi(sigma, tol=1e-14): - """ - Estimate the diagonal matrix of correlation between real - and knockoff variables using the equi-correlated equation. - - This function estimates the diagonal matrix of correlation - between real and knockoff variables using the equi-correlated - equation described in :footcite:t:`barber2015controlling` and - :footcite:t:`candes2018panning`. It takes as input the empirical - covariance matrix sigma and a tolerance value tol, - and returns a vector of diagonal values of the estimated - matrix diag{s}. - - Parameters - ---------- - sigma : 2D ndarray (n_features, n_features) - The empirical covariance matrix calculated from - the original design matrix. - - tol : float, optional - A tolerance value used for numerical stability in the calculation - of the eigenvalues of the correlation matrix. - - Returns - ------- - 1D ndarray (n_features, ) - A vector of diagonal values of the estimated matrix diag{s}. - - Raises - ------ - Exception - If the covariance matrix is not positive-definite. - """ - n_features = sigma.shape[0] - - # Convert covariance matrix to correlation matrix - # as example see cov2corr from statsmodels - features_std = np.sqrt(np.diag(sigma)) - scale = np.outer(features_std, features_std) - corr_matrix = sigma / scale - - eig_value = np.linalg.eigvalsh(corr_matrix) - lambda_min = np.min(eig_value[0]) - # check if the matrix is positive-defined - if lambda_min <= 0: - raise Exception("The covariance matrix is not positive-definite.") - - s = np.ones(n_features) * min(2 * lambda_min, 1) - - psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(s)) > tol) - s_eps = 0 - while not psd: - if s_eps == 0: - s_eps = np.finfo(type(s[0])).eps # avoid zeros - else: - s_eps *= 10 - # if all eigval > 0 then the matrix is positive define - psd = np.all( - np.linalg.eigvalsh(2 * corr_matrix - np.diag(s * (1 - s_eps))) > tol - ) - warnings.warn( - "The equi-correlated matrix for knockoffs is not positive " - f"definite. Reduce the value of distance by {s_eps}.", - UserWarning, - ) - - s = s * (1 - s_eps) - - return s * np.diag(sigma) diff --git a/src/hidimstat/statistical_tools/gaussian_distribution.py b/src/hidimstat/statistical_tools/gaussian_distribution.py new file mode 100644 index 000000000..207f988be --- /dev/null +++ b/src/hidimstat/statistical_tools/gaussian_distribution.py @@ -0,0 +1,194 @@ +import warnings + +import numpy as np +from sklearn.preprocessing import StandardScaler +from sklearn.utils import check_random_state + + +class GaussianDistribution: + """ + Generator for second-order Gaussian variables using the equi-correlated method. + Creates synthetic variables that preserve the covariance structure of the original + variables while ensuring conditional independence between the original and synthetic data. + Parameters + ---------- + cov_estimator : object + Estimator for computing the covariance matrix. Must implement fit and + have a covariance_ attribute after fitting. + random_state : int or None, default=None + Random seed for generating synthetic data. + centered : bool, default=False + Whether to center and scale the input data before generating synthetic variables. + tol : float, default=1e-14 + Tolerance for numerical stability in matrix computations. + Attributes + ---------- + mu_tilde_ : ndarray of shape (n_samples, n_features) + Mean matrix for generating synthetic variables. + sigma_tilde_decompose_ : ndarray of shape (n_features, n_features) + Cholesky decomposition of the synthetic covariance matrix. + References + ---------- + .. footbibliography:: + """ + + def __init__(self, cov_estimator, random_state=None, centered=False, tol=1e-14): + self.cov_estimator = cov_estimator + self.centered = centered + self.tol = tol + self.rng = check_random_state(random_state) + + def fit(self, X): + """ + Fit the Gaussian synthetic variable generator. + This method estimates the parameters needed to generate Gaussian synthetic variables + based on the input data. It follows a methodology for creating second-order + synthetic variables that preserve the covariance structure. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples used to estimate the parameters for synthetic variable generation. + The data is assumed to follow a Gaussian distribution. + Returns + ------- + self : object + Returns the instance itself. + Notes + ----- + The method implements the following steps: + 1. Centers and scales the data if specified + 2. Estimates mean and covariance of input data + 3. Computes parameters for synthetic variable generation + """ + _, n_features = X.shape + if self.centered: + X_ = StandardScaler().fit_transform(X) + else: + X_ = X + + # estimation of X distribution + # original implementation: + # https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_second_order.R + mu = X_.mean(axis=0) + sigma = self.cov_estimator.fit(X_).covariance_ + + diag_s = np.diag(_s_equi(sigma, tol=self.tol)) + + sigma_inv_s = np.linalg.solve(sigma, diag_s) + + # First part on the RHS of equation 1.4 in barber2015controlling + self.mu_tilde_ = X - np.dot(X - mu, sigma_inv_s) + # To calculate the Cholesky decomposition later on + sigma_tilde = 2 * diag_s - diag_s.dot(sigma_inv_s) + # test is the matrix is positive definite + while not np.all(np.linalg.eigvalsh(sigma_tilde) > self.tol): + sigma_tilde += 1e-10 * np.eye(n_features) + warnings.warn( + "The conditional covariance matrix for knockoffs is not positive " + "definite. Adding minor positive value to the matrix.", + UserWarning, + ) + + self.sigma_tilde_decompose_ = np.linalg.cholesky(sigma_tilde) + + return self + + def _check_fit(self): + """ + Check if the model has been fit before performing analysis. + Raises + ------ + ValueError + If any of the required attributes are missing, indicating the model + hasn't been fit before generating synthetic variables. + """ + if not hasattr(self, "mu_tilde_") or not hasattr( + self, "sigma_tilde_decompose_" + ): + raise ValueError("The GaussianGenerator requires to be fit before simulate") + + def sample(self): + """ + Generate synthetic variables. + This function generates synthetic variables that preserve the covariance structure + of the original data while ensuring conditional independence. + Returns + ------- + X_tilde : 2D ndarray (n_samples, n_features) + The synthetic variables. + """ + self._check_fit() + n_samples, n_features = self.mu_tilde_.shape + + # create a uniform noise for all the data + u_tilde = self.rng.randn(n_samples, n_features) + + # Equation 1.4 in barber2015controlling + X_tilde = self.mu_tilde_ + np.dot(u_tilde, self.sigma_tilde_decompose_) + return X_tilde + + +def _s_equi(sigma, tol=1e-14): + """ + Estimate the diagonal matrix of correlation between real + and knockoff variables using the equi-correlated equation. + This function estimates the diagonal matrix of correlation + between real and knockoff variables using the equi-correlated + equation described in :footcite:t:`barber2015controlling` and + :footcite:t:`candes2018panning`. It takes as input the empirical + covariance matrix sigma and a tolerance value tol, + and returns a vector of diagonal values of the estimated + matrix diag{s}. + Parameters + ---------- + sigma : 2D ndarray (n_features, n_features) + The empirical covariance matrix calculated from + the original design matrix. + tol : float, optional + A tolerance value used for numerical stability in the calculation + of the eigenvalues of the correlation matrix. + Returns + ------- + 1D ndarray (n_features, ) + A vector of diagonal values of the estimated matrix diag{s}. + Raises + ------ + Exception + If the covariance matrix is not positive-definite. + """ + n_features = sigma.shape[0] + + # Convert covariance matrix to correlation matrix + # as example see cov2corr from statsmodels + features_std = np.sqrt(np.diag(sigma)) + scale = np.outer(features_std, features_std) + corr_matrix = sigma / scale + + eig_value = np.linalg.eigvalsh(corr_matrix) + lambda_min = np.min(eig_value[0]) + # check if the matrix is positive-defined + if lambda_min <= 0: + raise Exception("The covariance matrix is not positive-definite.") + + s = np.ones(n_features) * min(2 * lambda_min, 1) + + psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(s)) > tol) + s_eps = 0 + while not psd: + if s_eps == 0: + s_eps = np.finfo(type(s[0])).eps # avoid zeros + else: + s_eps *= 10 + # if all eigval > 0 then the matrix is positive define + psd = np.all( + np.linalg.eigvalsh(2 * corr_matrix - np.diag(s * (1 - s_eps))) > tol + ) + warnings.warn( + "The equi-correlated matrix for knockoffs is not positive " + f"definite. Reduce the value of distance by {s_eps}.", + UserWarning, + ) + + s = s * (1 - s_eps) + + return s * np.diag(sigma) diff --git a/test/statistical_tools/test_gaussian_distribution.py b/test/statistical_tools/test_gaussian_distribution.py new file mode 100644 index 000000000..28c138114 --- /dev/null +++ b/test/statistical_tools/test_gaussian_distribution.py @@ -0,0 +1,85 @@ +import numpy as np +import pytest +from hidimstat.statistical_tools.gaussian_distribution import ( + _s_equi, + GaussianDistribution, +) +from hidimstat._utils.scenario import multivariate_simulation +from sklearn.covariance import LedoitWolf + + +def test_gaussian_equi(): + """test function of gaussian""" + seed = 42 + n = 100 + p = 50 + X, y, beta, noise = multivariate_simulation(n, p, seed=seed) + generator = GaussianDistribution( + cov_estimator=LedoitWolf(assume_centered=True), + random_state=seed * 2, + centered=False, + ) + generator.fit(X=X) + X_tilde = generator.sample() + assert X_tilde.shape == (n, p) + + +def test_gaussian_center(): + """test function of gaussian""" + seed = 42 + n = 100 + p = 50 + X, y, beta, noise = multivariate_simulation(n, p, seed=seed) + generator = GaussianDistribution( + cov_estimator=LedoitWolf(assume_centered=True), + random_state=seed * 2, + centered=True, + ) + generator.fit(X=X) + X_tilde = generator.sample() + assert X_tilde.shape == (n, p) + + +def test_gaussian_error(): + """test function error""" + seed = 42 + n = 100 + p = 50 + X, y, beta, noise = multivariate_simulation(n, p, seed=seed) + generator = GaussianDistribution( + cov_estimator=LedoitWolf(assume_centered=True), + random_state=seed * 2, + centered=True, + ) + with pytest.raises( + ValueError, match="The GaussianGenerator requires to be fit before simulate" + ): + generator.sample() + + +def test_s_equi_not_define_positive(): + """test the warning and error of s_equi function""" + n = 10 + tol = 1e-7 + seed = 42 + + # random positive matrix + rgn = np.random.RandomState(seed) + a = rgn.randn(n, n) + a -= np.min(a) + with pytest.raises( + Exception, match="The covariance matrix is not positive-definite." + ): + _s_equi(a) + + # matrix with positive eigenvalues, positive diagonal + while not np.all(np.linalg.eigvalsh(a) > tol): + a += 0.1 * np.eye(n) + with pytest.warns(UserWarning, match="The equi-correlated matrix"): + _s_equi(a) + + # positive definite matrix + u, s, vh = np.linalg.svd(a) + d = np.eye(n) + sigma = u * d * u.T + _s_equi(sigma) From b576522e33ba743c794ed35d6c35c5f1e352516f Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 13:30:42 +0200 Subject: [PATCH 02/44] fix knockoff --- src/hidimstat/knockoffs.py | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 7dab07aeb..7d7609b44 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -7,10 +7,7 @@ from sklearn.utils import check_random_state from sklearn.utils.validation import check_memory -from hidimstat.gaussian_knockoff import ( - gaussian_knockoff_generation, - repeat_gaussian_knockoff_generation, -) +from hidimstat.statistical_tools.gaussian_distribution import GaussianDistribution from hidimstat.statistical_tools.multiple_testing import fdr_threshold from hidimstat.statistical_tools.aggregation import quantile_aggregation @@ -196,32 +193,12 @@ def model_x_knockoff( raise TypeError("Wrong type for random_state") seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps) - if centered: - X = StandardScaler().fit_transform(X) - - # estimation of X distribution - # original implementation: - # https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_second_order.R - mu = X.mean(axis=0) - sigma = cov_estimator.fit(X).covariance_ - # Create knockoff variables - X_tilde, mu_tilde, sigma_tilde_decompose = memory.cache( - gaussian_knockoff_generation - )(X, mu, sigma, seed=seed_list[0], tol=tol_gauss) - - if n_bootstraps == 1: - X_tildes = [X_tilde] - else: - X_tildes = parallel( - delayed(repeat_gaussian_knockoff_generation)( - mu_tilde, - sigma_tilde_decompose, - seed=seed, - ) - for seed in seed_list[1:] - ) - X_tildes.insert(0, X_tilde) + conditionnal_sampler = GaussianDistribution( + cov_estimator, random_state=seed_list[0], centered=centered, tol=tol_gauss + ) + conditionnal_sampler.fit(X) + X_tildes = [conditionnal_sampler.sample() for i in range(n_bootstraps)] results = parallel( delayed(memory.cache(_stat_coefficient_diff))( From 8f4e03236600b255e28cee36557e6f7bc1b028c6 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 13:33:29 +0200 Subject: [PATCH 03/44] move conditional sampler in statistical tools --- examples/plot_pitfalls_permutation_importance.py | 2 +- src/hidimstat/conditional_feature_importance.py | 2 +- src/hidimstat/{ => statistical_tools}/conditional_sampling.py | 0 test/test_conditional_sampling.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename src/hidimstat/{ => statistical_tools}/conditional_sampling.py (100%) diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index af4deb83e..adc144a62 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -26,7 +26,7 @@ from sklearn.preprocessing import StandardScaler from hidimstat import CFI, PFI -from hidimstat.conditional_sampling import ConditionalSampler +from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler rng = np.random.RandomState(0) diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 9b0e7905f..8c9686334 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -5,7 +5,7 @@ from sklearn.utils.validation import check_random_state from hidimstat.base_perturbation import BasePerturbation -from hidimstat.conditional_sampling import ConditionalSampler +from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler class CFI(BasePerturbation): diff --git a/src/hidimstat/conditional_sampling.py b/src/hidimstat/statistical_tools/conditional_sampling.py similarity index 100% rename from src/hidimstat/conditional_sampling.py rename to src/hidimstat/statistical_tools/conditional_sampling.py diff --git a/test/test_conditional_sampling.py b/test/test_conditional_sampling.py index d42f1f11c..7c347ff50 100644 --- a/test/test_conditional_sampling.py +++ b/test/test_conditional_sampling.py @@ -11,7 +11,7 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.preprocessing import StandardScaler -from hidimstat.conditional_sampling import ConditionalSampler +from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler def test_continuous_case(): From 942941893329553504f5af81cf31e2e1c3bbce51 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 13:41:06 +0200 Subject: [PATCH 04/44] move test from knockoff --- test/test_knockoff.py | 70 ------------------------------------------- 1 file changed, 70 deletions(-) diff --git a/test/test_knockoff.py b/test/test_knockoff.py index 3b0e86aa1..8ec615772 100644 --- a/test/test_knockoff.py +++ b/test/test_knockoff.py @@ -4,7 +4,6 @@ model_x_knockoff_bootstrap_e_value, model_x_knockoff_bootstrap_quantile, ) -from hidimstat.gaussian_knockoff import gaussian_knockoff_generation, _s_equi from hidimstat._utils.scenario import multivariate_simulation from hidimstat.statistical_tools.multiple_testing import fdp_power import numpy as np @@ -235,72 +234,3 @@ def test_estimate_distribution(): ) for i in selected: assert np.any(i == non_zero) - - -def test_gaussian_knockoff_equi(): - """test function of gaussian knockoff""" - seed = 42 - n = 100 - p = 50 - X, y, beta, noise = multivariate_simulation(n, p, seed=seed) - non_zero = np.where(beta)[0] - mu = X.mean(axis=0) - sigma = LedoitWolf(assume_centered=True).fit(X).covariance_ - - X_tilde, mu_tilde, sigma_tilde_decompose = gaussian_knockoff_generation( - X, mu, sigma, seed=seed * 2 - ) - - assert X_tilde.shape == (n, p) - - -def test_gaussian_knockoff_equi_warning(): - "test warning in guassian knockoff" - seed = 42 - n = 100 - p = 50 - tol = 1e-14 - rgn = np.random.RandomState(seed) - X = rgn.randn(n, p) - mu = X.mean(axis=0) - # create a positive definite matrix - u, s, vh = np.linalg.svd(rgn.randn(p, p)) - d = np.eye(p) * tol / 10 - sigma = u * d * u.T - with pytest.warns( - UserWarning, - match="The conditional covariance matrix for knockoffs is not positive", - ): - X_tilde, mu_tilde, sigma_tilde_decompose = gaussian_knockoff_generation( - X, mu, sigma, seed=seed * 2, tol=tol - ) - - assert X_tilde.shape == (n, p) - - -def test_s_equi_not_define_positive(): - """test the warning and error of s_equi function""" - n = 10 - tol = 1e-7 - seed = 42 - - # random positive matrix - rgn = np.random.RandomState(seed) - a = rgn.randn(n, n) - a -= np.min(a) - with pytest.raises( - Exception, match="The covariance matrix is not positive-definite." - ): - _s_equi(a) - - # matrix with positive eigenvalues, positive diagonal - while not np.all(np.linalg.eigvalsh(a) > tol): - a += 0.1 * np.eye(n) - with pytest.warns(UserWarning, match="The equi-correlated matrix"): - _s_equi(a) - - # positive definite matrix - u, s, vh = np.linalg.svd(a) - d = np.eye(n) - sigma = u * d * u.T - _s_equi(sigma) From 81f2fb06d70f5e8f0b836b7b97bb20c0d524d522 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 13:43:51 +0200 Subject: [PATCH 05/44] move test in right folder --- test/{ => statistical_tools}/test_conditional_sampling.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/{ => statistical_tools}/test_conditional_sampling.py (100%) diff --git a/test/test_conditional_sampling.py b/test/statistical_tools/test_conditional_sampling.py similarity index 100% rename from test/test_conditional_sampling.py rename to test/statistical_tools/test_conditional_sampling.py From 17eabee45e0795c731abfdd0de84d5a01d60fee4 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 18:51:21 +0200 Subject: [PATCH 06/44] remove gaussian --- .../statistical_tools/gaussian_distribution.py | 13 +++---------- .../statistical_tools/test_gaussian_distribution.py | 3 --- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/src/hidimstat/statistical_tools/gaussian_distribution.py b/src/hidimstat/statistical_tools/gaussian_distribution.py index 207f988be..7aad0ba68 100644 --- a/src/hidimstat/statistical_tools/gaussian_distribution.py +++ b/src/hidimstat/statistical_tools/gaussian_distribution.py @@ -17,8 +17,6 @@ class GaussianDistribution: have a covariance_ attribute after fitting. random_state : int or None, default=None Random seed for generating synthetic data. - centered : bool, default=False - Whether to center and scale the input data before generating synthetic variables. tol : float, default=1e-14 Tolerance for numerical stability in matrix computations. Attributes @@ -32,9 +30,8 @@ class GaussianDistribution: .. footbibliography:: """ - def __init__(self, cov_estimator, random_state=None, centered=False, tol=1e-14): + def __init__(self, cov_estimator, random_state=None, tol=1e-14): self.cov_estimator = cov_estimator - self.centered = centered self.tol = tol self.rng = check_random_state(random_state) @@ -61,16 +58,12 @@ def fit(self, X): 3. Computes parameters for synthetic variable generation """ _, n_features = X.shape - if self.centered: - X_ = StandardScaler().fit_transform(X) - else: - X_ = X # estimation of X distribution # original implementation: # https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_second_order.R - mu = X_.mean(axis=0) - sigma = self.cov_estimator.fit(X_).covariance_ + mu = X.mean(axis=0) + sigma = self.cov_estimator.fit(X).covariance_ diag_s = np.diag(_s_equi(sigma, tol=self.tol)) diff --git a/test/statistical_tools/test_gaussian_distribution.py b/test/statistical_tools/test_gaussian_distribution.py index 28c138114..ba586dc1b 100644 --- a/test/statistical_tools/test_gaussian_distribution.py +++ b/test/statistical_tools/test_gaussian_distribution.py @@ -17,7 +17,6 @@ def test_gaussian_equi(): generator = GaussianDistribution( cov_estimator=LedoitWolf(assume_centered=True), random_state=seed * 2, - centered=False, ) generator.fit(X=X) X_tilde = generator.sample() @@ -33,7 +32,6 @@ def test_gaussian_center(): generator = GaussianDistribution( cov_estimator=LedoitWolf(assume_centered=True), random_state=seed * 2, - centered=True, ) generator.fit(X=X) X_tilde = generator.sample() @@ -49,7 +47,6 @@ def test_gaussian_error(): generator = GaussianDistribution( cov_estimator=LedoitWolf(assume_centered=True), random_state=seed * 2, - centered=True, ) with pytest.raises( ValueError, match="The GaussianGenerator requires to be fit before simulate" From 0bac34a15f89b356536e27a3758514c74b61a19a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 18:53:43 +0200 Subject: [PATCH 07/44] remove unsed import --- src/hidimstat/statistical_tools/gaussian_distribution.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hidimstat/statistical_tools/gaussian_distribution.py b/src/hidimstat/statistical_tools/gaussian_distribution.py index 7aad0ba68..e0fafeb76 100644 --- a/src/hidimstat/statistical_tools/gaussian_distribution.py +++ b/src/hidimstat/statistical_tools/gaussian_distribution.py @@ -1,7 +1,6 @@ import warnings import numpy as np -from sklearn.preprocessing import StandardScaler from sklearn.utils import check_random_state From b965f13ee086d6600dfbb5255f040c02f5ec2e79 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 18:56:41 +0200 Subject: [PATCH 08/44] fix center to knockoff --- src/hidimstat/knockoffs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 7d7609b44..e7461e8b9 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -193,9 +193,12 @@ def model_x_knockoff( raise TypeError("Wrong type for random_state") seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps) + if centered: + X = StandardScaler().fit_transform(X) + # Create knockoff variables conditionnal_sampler = GaussianDistribution( - cov_estimator, random_state=seed_list[0], centered=centered, tol=tol_gauss + cov_estimator, random_state=seed_list[0], tol=tol_gauss ) conditionnal_sampler.fit(X) X_tildes = [conditionnal_sampler.sample() for i in range(n_bootstraps)] From 7e29c970c0c6bb4895701c313d09140969ab2724 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 1 Sep 2025 13:33:11 +0200 Subject: [PATCH 09/44] Change name of GaussianDistribution --- src/hidimstat/knockoffs.py | 4 ++-- ...{gaussian_distribution.py => gaussian_knockoffs.py} | 2 +- ...sian_distribution.py => test_gaussian_knockoffs.py} | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) rename src/hidimstat/statistical_tools/{gaussian_distribution.py => gaussian_knockoffs.py} (99%) rename test/statistical_tools/{test_gaussian_distribution.py => test_gaussian_knockoffs.py} (90%) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index e7461e8b9..ecf6e4ba3 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -7,7 +7,7 @@ from sklearn.utils import check_random_state from sklearn.utils.validation import check_memory -from hidimstat.statistical_tools.gaussian_distribution import GaussianDistribution +from hidimstat.statistical_tools.gaussian_knockoffs import GaussianKnockoffs from hidimstat.statistical_tools.multiple_testing import fdr_threshold from hidimstat.statistical_tools.aggregation import quantile_aggregation @@ -197,7 +197,7 @@ def model_x_knockoff( X = StandardScaler().fit_transform(X) # Create knockoff variables - conditionnal_sampler = GaussianDistribution( + conditionnal_sampler = GaussianKnockoffs( cov_estimator, random_state=seed_list[0], tol=tol_gauss ) conditionnal_sampler.fit(X) diff --git a/src/hidimstat/statistical_tools/gaussian_distribution.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py similarity index 99% rename from src/hidimstat/statistical_tools/gaussian_distribution.py rename to src/hidimstat/statistical_tools/gaussian_knockoffs.py index e0fafeb76..7f0387b03 100644 --- a/src/hidimstat/statistical_tools/gaussian_distribution.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -4,7 +4,7 @@ from sklearn.utils import check_random_state -class GaussianDistribution: +class GaussianKnockoffs: """ Generator for second-order Gaussian variables using the equi-correlated method. Creates synthetic variables that preserve the covariance structure of the original diff --git a/test/statistical_tools/test_gaussian_distribution.py b/test/statistical_tools/test_gaussian_knockoffs.py similarity index 90% rename from test/statistical_tools/test_gaussian_distribution.py rename to test/statistical_tools/test_gaussian_knockoffs.py index ba586dc1b..936d19c53 100644 --- a/test/statistical_tools/test_gaussian_distribution.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from hidimstat.statistical_tools.gaussian_distribution import ( +from hidimstat.statistical_tools.gaussian_knockoffs import ( _s_equi, - GaussianDistribution, + GaussianKnockoffs, ) from hidimstat._utils.scenario import multivariate_simulation from sklearn.covariance import LedoitWolf @@ -14,7 +14,7 @@ def test_gaussian_equi(): n = 100 p = 50 X, y, beta, noise = multivariate_simulation(n, p, seed=seed) - generator = GaussianDistribution( + generator = GaussianKnockoffs( cov_estimator=LedoitWolf(assume_centered=True), random_state=seed * 2, ) @@ -29,7 +29,7 @@ def test_gaussian_center(): n = 100 p = 50 X, y, beta, noise = multivariate_simulation(n, p, seed=seed) - generator = GaussianDistribution( + generator = GaussianKnockoffs( cov_estimator=LedoitWolf(assume_centered=True), random_state=seed * 2, ) @@ -44,7 +44,7 @@ def test_gaussian_error(): n = 100 p = 50 X, y, beta, noise = multivariate_simulation(n, p, seed=seed) - generator = GaussianDistribution( + generator = GaussianKnockoffs( cov_estimator=LedoitWolf(assume_centered=True), random_state=seed * 2, ) From 86542699d60122225e7c7af8612962b8bbd2e854 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 1 Sep 2025 13:35:57 +0200 Subject: [PATCH 10/44] remove parameter assumed_centerd of tests --- test/statistical_tools/test_gaussian_knockoffs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index 936d19c53..ce3b269a3 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -15,7 +15,7 @@ def test_gaussian_equi(): p = 50 X, y, beta, noise = multivariate_simulation(n, p, seed=seed) generator = GaussianKnockoffs( - cov_estimator=LedoitWolf(assume_centered=True), + cov_estimator=LedoitWolf(), random_state=seed * 2, ) generator.fit(X=X) @@ -30,7 +30,7 @@ def test_gaussian_center(): p = 50 X, y, beta, noise = multivariate_simulation(n, p, seed=seed) generator = GaussianKnockoffs( - cov_estimator=LedoitWolf(assume_centered=True), + cov_estimator=LedoitWolf(), random_state=seed * 2, ) generator.fit(X=X) @@ -45,7 +45,7 @@ def test_gaussian_error(): p = 50 X, y, beta, noise = multivariate_simulation(n, p, seed=seed) generator = GaussianKnockoffs( - cov_estimator=LedoitWolf(assume_centered=True), + cov_estimator=LedoitWolf(), random_state=seed * 2, ) with pytest.raises( From afe9915e06f609fff0821648899ad33c1960b057 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 3 Sep 2025 17:07:19 +0200 Subject: [PATCH 11/44] change the way of using the randomgenerator --- src/hidimstat/knockoffs.py | 14 ++++---------- .../statistical_tools/gaussian_knockoffs.py | 5 +++-- test/statistical_tools/test_gaussian_knockoffs.py | 15 --------------- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index ecf6e4ba3..86ae4d133 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -180,25 +180,19 @@ def model_x_knockoff( """ assert n_bootstraps > 0, "the number of bootstraps should at least higher than 1" memory = check_memory(memory) + rng = check_random_state(random_state) # unnecessary to have n_jobs > number of bootstraps n_jobs = min(n_bootstraps, n_jobs) parallel = Parallel(n_jobs, verbose=joblib_verbose) - # get the seed for the different run - if isinstance(random_state, (int, np.int32, np.int64)): - rng = check_random_state(random_state) - elif random_state is None: - rng = check_random_state(0) - else: - raise TypeError("Wrong type for random_state") - seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps) - if centered: X = StandardScaler().fit_transform(X) # Create knockoff variables conditionnal_sampler = GaussianKnockoffs( - cov_estimator, random_state=seed_list[0], tol=tol_gauss + cov_estimator, + random_state=rng, + tol=tol_gauss, ) conditionnal_sampler.fit(X) X_tildes = [conditionnal_sampler.sample() for i in range(n_bootstraps)] diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index 7f0387b03..8b88f6463 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -32,7 +32,7 @@ class GaussianKnockoffs: def __init__(self, cov_estimator, random_state=None, tol=1e-14): self.cov_estimator = cov_estimator self.tol = tol - self.rng = check_random_state(random_state) + self.random_state = random_state def fit(self, X): """ @@ -110,10 +110,11 @@ def sample(self): The synthetic variables. """ self._check_fit() + rng = check_random_state(self.random_state) n_samples, n_features = self.mu_tilde_.shape # create a uniform noise for all the data - u_tilde = self.rng.randn(n_samples, n_features) + u_tilde = rng.randn(n_samples, n_features) # Equation 1.4 in barber2015controlling X_tilde = self.mu_tilde_ + np.dot(u_tilde, self.sigma_tilde_decompose_) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index ce3b269a3..d3801872f 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -23,21 +23,6 @@ def test_gaussian_equi(): assert X_tilde.shape == (n, p) -def test_gaussian_center(): - """test function of gaussian""" - seed = 42 - n = 100 - p = 50 - X, y, beta, noise = multivariate_simulation(n, p, seed=seed) - generator = GaussianKnockoffs( - cov_estimator=LedoitWolf(), - random_state=seed * 2, - ) - generator.fit(X=X) - X_tilde = generator.sample() - assert X_tilde.shape == (n, p) - - def test_gaussian_error(): """test function error""" seed = 42 From bd3b77af428391fa8590c68d29b6e1065dfe99ce Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 4 Sep 2025 11:23:53 +0200 Subject: [PATCH 12/44] Apply suggestions from code review Co-authored-by: Joseph Paillard --- src/hidimstat/statistical_tools/gaussian_knockoffs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index 8b88f6463..6eae8a3ad 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -9,6 +9,7 @@ class GaussianKnockoffs: Generator for second-order Gaussian variables using the equi-correlated method. Creates synthetic variables that preserve the covariance structure of the original variables while ensuring conditional independence between the original and synthetic data. + Parameters ---------- cov_estimator : object @@ -40,6 +41,7 @@ def fit(self, X): This method estimates the parameters needed to generate Gaussian synthetic variables based on the input data. It follows a methodology for creating second-order synthetic variables that preserve the covariance structure. + Parameters ---------- X : array-like of shape (n_samples, n_features) From d8e9f5c81ac100a5a9c5e1769ee8c2af524d3e84 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 4 Sep 2025 11:25:04 +0200 Subject: [PATCH 13/44] reformat docstring --- .../statistical_tools/gaussian_knockoffs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index 6eae8a3ad..9d4932cc2 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -9,7 +9,7 @@ class GaussianKnockoffs: Generator for second-order Gaussian variables using the equi-correlated method. Creates synthetic variables that preserve the covariance structure of the original variables while ensuring conditional independence between the original and synthetic data. - + Parameters ---------- cov_estimator : object @@ -19,12 +19,14 @@ class GaussianKnockoffs: Random seed for generating synthetic data. tol : float, default=1e-14 Tolerance for numerical stability in matrix computations. + Attributes ---------- mu_tilde_ : ndarray of shape (n_samples, n_features) Mean matrix for generating synthetic variables. sigma_tilde_decompose_ : ndarray of shape (n_features, n_features) Cholesky decomposition of the synthetic covariance matrix. + References ---------- .. footbibliography:: @@ -41,16 +43,18 @@ def fit(self, X): This method estimates the parameters needed to generate Gaussian synthetic variables based on the input data. It follows a methodology for creating second-order synthetic variables that preserve the covariance structure. - + Parameters ---------- X : array-like of shape (n_samples, n_features) The input samples used to estimate the parameters for synthetic variable generation. The data is assumed to follow a Gaussian distribution. + Returns ------- self : object Returns the instance itself. + Notes ----- The method implements the following steps: @@ -90,6 +94,7 @@ def fit(self, X): def _check_fit(self): """ Check if the model has been fit before performing analysis. + Raises ------ ValueError @@ -106,6 +111,7 @@ def sample(self): Generate synthetic variables. This function generates synthetic variables that preserve the covariance structure of the original data while ensuring conditional independence. + Returns ------- X_tilde : 2D ndarray (n_samples, n_features) @@ -134,6 +140,7 @@ def _s_equi(sigma, tol=1e-14): covariance matrix sigma and a tolerance value tol, and returns a vector of diagonal values of the estimated matrix diag{s}. + Parameters ---------- sigma : 2D ndarray (n_features, n_features) @@ -142,10 +149,12 @@ def _s_equi(sigma, tol=1e-14): tol : float, optional A tolerance value used for numerical stability in the calculation of the eigenvalues of the correlation matrix. + Returns ------- 1D ndarray (n_features, ) A vector of diagonal values of the estimated matrix diag{s}. + Raises ------ Exception From 013bd636f127da32749e23719694975e76774463 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Thu, 4 Sep 2025 11:27:11 +0200 Subject: [PATCH 14/44] Update test/statistical_tools/test_gaussian_knockoffs.py Co-authored-by: Joseph Paillard --- .../test_gaussian_knockoffs.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index d3801872f..e59811f8b 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -65,3 +65,29 @@ def test_s_equi_not_define_positive(): d = np.eye(n) sigma = u * d * u.T _s_equi(sigma) + + +def test_reproducibility_sample(): + X, _, _, _ = multivariate_simulation(100, 10, seed=0) + gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf(), random_state=0) + gaussian_sampler.fit(X=X) + X_tilde_1 = gaussian_sampler.sample() + X_tilde_2 = gaussian_sampler.sample() + assert np.array_equal(X_tilde_1, X_tilde_2) + + +def test_randomness_sample(): + X, _, _, _ = multivariate_simulation(100, 10, seed=0) + gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf(), random_state=None) + gaussian_sampler.fit(X=X) + X_tilde_1 = gaussian_sampler.sample() + X_tilde_2 = gaussian_sampler.sample() + assert not np.array_equal(X_tilde_1, X_tilde_2) + + gaussian_sampler_rng = GaussianKnockoffs( + cov_estimator=LedoitWolf(), random_state=np.random.RandomState(0) + ) + gaussian_sampler_rng.fit(X=X) + X_tilde_1 = gaussian_sampler_rng.sample() + X_tilde_2 = gaussian_sampler_rng.sample() + assert not np.array_equal(X_tilde_1, X_tilde_2) From 140dc6ecbb697a8a5974f8c8f6b04d593bdb7554 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 4 Sep 2025 11:29:48 +0200 Subject: [PATCH 15/44] improve tests --- test/statistical_tools/test_gaussian_knockoffs.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index e59811f8b..f0500d25f 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -68,6 +68,7 @@ def test_s_equi_not_define_positive(): def test_reproducibility_sample(): + """Test the repeatability of the samples""" X, _, _, _ = multivariate_simulation(100, 10, seed=0) gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf(), random_state=0) gaussian_sampler.fit(X=X) @@ -76,7 +77,8 @@ def test_reproducibility_sample(): assert np.array_equal(X_tilde_1, X_tilde_2) -def test_randomness_sample(): +def test_randomness_sample_no_seed(): + """Test the non repeatability of the samples when no seed""" X, _, _, _ = multivariate_simulation(100, 10, seed=0) gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf(), random_state=None) gaussian_sampler.fit(X=X) @@ -84,6 +86,10 @@ def test_randomness_sample(): X_tilde_2 = gaussian_sampler.sample() assert not np.array_equal(X_tilde_1, X_tilde_2) + +def test_randomness_sample_rgn(): + """Test the non repeatability of the samples when the usage of random generator""" + X, _, _, _ = multivariate_simulation(100, 10, seed=0) gaussian_sampler_rng = GaussianKnockoffs( cov_estimator=LedoitWolf(), random_state=np.random.RandomState(0) ) From d26ce917c6e44d757592603376b93e683967bdb9 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 5 Sep 2025 11:16:22 +0200 Subject: [PATCH 16/44] Apply suggestions from code review Co-authored-by: bthirion --- src/hidimstat/statistical_tools/gaussian_knockoffs.py | 4 ++-- test/statistical_tools/test_gaussian_knockoffs.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index 9d4932cc2..74b5ffa61 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -93,7 +93,7 @@ def fit(self, X): def _check_fit(self): """ - Check if the model has been fit before performing analysis. + Check if the model has been fit before sampling knockoffs. Raises ------ @@ -104,7 +104,7 @@ def _check_fit(self): if not hasattr(self, "mu_tilde_") or not hasattr( self, "sigma_tilde_decompose_" ): - raise ValueError("The GaussianGenerator requires to be fit before simulate") + raise ValueError("The GaussianGenerator requires to be fit before sampling") def sample(self): """ diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index f0500d25f..4dfddf163 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -13,7 +13,7 @@ def test_gaussian_equi(): seed = 42 n = 100 p = 50 - X, y, beta, noise = multivariate_simulation(n, p, seed=seed) + X, _, _, _ = multivariate_simulation(n, p, seed=seed) generator = GaussianKnockoffs( cov_estimator=LedoitWolf(), random_state=seed * 2, From 87c4e6efb91af77a869b2e4bf66a8426a128318d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 5 Sep 2025 11:24:43 +0200 Subject: [PATCH 17/44] add assert --- test/statistical_tools/test_gaussian_knockoffs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index 4dfddf163..593c661de 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -64,7 +64,8 @@ def test_s_equi_not_define_positive(): u, s, vh = np.linalg.svd(a) d = np.eye(n) sigma = u * d * u.T - _s_equi(sigma) + res = _s_equi(sigma) + np.testing.assert_equal(res / np.diag(sigma), np.ones_like(res)) def test_reproducibility_sample(): From 90f8455e851e4f3e6974e649cd45c7a04a457d06 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 5 Sep 2025 11:31:53 +0200 Subject: [PATCH 18/44] fix tests --- test/statistical_tools/test_gaussian_knockoffs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index 593c661de..0ee606dae 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -34,7 +34,7 @@ def test_gaussian_error(): random_state=seed * 2, ) with pytest.raises( - ValueError, match="The GaussianGenerator requires to be fit before simulate" + ValueError, match="The GaussianGenerator requires to be fit before sampling" ): generator.sample() From 785eebc109f9c427ade2c748b26a7a400bffeb4c Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 8 Sep 2025 11:21:39 +0200 Subject: [PATCH 19/44] Improve test --- .../test_gaussian_knockoffs.py | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index 0ee606dae..93b217958 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -13,7 +13,8 @@ def test_gaussian_equi(): seed = 42 n = 100 p = 50 - X, _, _, _ = multivariate_simulation(n, p, seed=seed) + rho = 1.0 + X, _, _, _ = multivariate_simulation(n, p, rho=rho, seed=seed) generator = GaussianKnockoffs( cov_estimator=LedoitWolf(), random_state=seed * 2, @@ -21,6 +22,41 @@ def test_gaussian_equi(): generator.fit(X=X) X_tilde = generator.sample() assert X_tilde.shape == (n, p) + assert np.linalg.norm(X - X_tilde) < np.linalg.norm(X) + + +def test_gaussian_equi_2(): + """test function of gaussian""" + seed = 42 + n = 100 + p = 50 + rho = 0.5 + X, _, _, _ = multivariate_simulation(n, p, rho=rho, seed=seed) + generator = GaussianKnockoffs( + cov_estimator=LedoitWolf(), + random_state=seed * 2, + ) + generator.fit(X=X) + X_tilde = generator.sample() + assert X_tilde.shape == (n, p) + assert np.linalg.norm(X - X_tilde) < np.sqrt(2) * np.linalg.norm(X) + + +def test_gaussian_equi_3(): + """test function of gaussian""" + seed = 42 + n = 100 + p = 50 + rho = 0.0 + X, _, _, _ = multivariate_simulation(n, p, rho=rho, seed=seed) + generator = GaussianKnockoffs( + cov_estimator=LedoitWolf(), + random_state=seed * 2, + ) + generator.fit(X=X) + X_tilde = generator.sample() + assert X_tilde.shape == (n, p) + assert np.linalg.norm(X - X_tilde) < 5 / 3 * np.linalg.norm(X) def test_gaussian_error(): From 22bdd4bca3cc92118d47933abbcd172f65289536 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 10:19:38 +0200 Subject: [PATCH 20/44] remove not necessary test --- .../test_gaussian_knockoffs.py | 34 ------------------- 1 file changed, 34 deletions(-) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index 93b217958..bd2a8f637 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -9,23 +9,6 @@ def test_gaussian_equi(): - """test function of gaussian""" - seed = 42 - n = 100 - p = 50 - rho = 1.0 - X, _, _, _ = multivariate_simulation(n, p, rho=rho, seed=seed) - generator = GaussianKnockoffs( - cov_estimator=LedoitWolf(), - random_state=seed * 2, - ) - generator.fit(X=X) - X_tilde = generator.sample() - assert X_tilde.shape == (n, p) - assert np.linalg.norm(X - X_tilde) < np.linalg.norm(X) - - -def test_gaussian_equi_2(): """test function of gaussian""" seed = 42 n = 100 @@ -42,23 +25,6 @@ def test_gaussian_equi_2(): assert np.linalg.norm(X - X_tilde) < np.sqrt(2) * np.linalg.norm(X) -def test_gaussian_equi_3(): - """test function of gaussian""" - seed = 42 - n = 100 - p = 50 - rho = 0.0 - X, _, _, _ = multivariate_simulation(n, p, rho=rho, seed=seed) - generator = GaussianKnockoffs( - cov_estimator=LedoitWolf(), - random_state=seed * 2, - ) - generator.fit(X=X) - X_tilde = generator.sample() - assert X_tilde.shape == (n, p) - assert np.linalg.norm(X - X_tilde) < 5 / 3 * np.linalg.norm(X) - - def test_gaussian_error(): """test function error""" seed = 42 From 9429c2acede08b7dc49f603b6a013bf1e4a86e61 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 24 Sep 2025 15:43:18 +0200 Subject: [PATCH 21/44] Update the conditioal sampler for new fit signature --- src/hidimstat/knockoffs.py | 8 +--- .../statistical_tools/gaussian_knockoffs.py | 44 ++++++++++++++----- .../test_gaussian_knockoffs.py | 35 ++++++--------- 3 files changed, 49 insertions(+), 38 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 86ae4d133..bbc8b4ea9 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -189,13 +189,9 @@ def model_x_knockoff( X = StandardScaler().fit_transform(X) # Create knockoff variables - conditionnal_sampler = GaussianKnockoffs( - cov_estimator, - random_state=rng, - tol=tol_gauss, - ) + conditionnal_sampler = GaussianKnockoffs(cov_estimator, tol=tol_gauss) conditionnal_sampler.fit(X) - X_tildes = [conditionnal_sampler.sample() for i in range(n_bootstraps)] + X_tildes = conditionnal_sampler.sample(n_samples=n_bootstraps, random_state=rng) results = parallel( delayed(memory.cache(_stat_coefficient_diff))( diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index 74b5ffa61..9b6b66fdc 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -15,8 +15,6 @@ class GaussianKnockoffs: cov_estimator : object Estimator for computing the covariance matrix. Must implement fit and have a covariance_ attribute after fitting. - random_state : int or None, default=None - Random seed for generating synthetic data. tol : float, default=1e-14 Tolerance for numerical stability in matrix computations. @@ -32,10 +30,9 @@ class GaussianKnockoffs: .. footbibliography:: """ - def __init__(self, cov_estimator, random_state=None, tol=1e-14): + def __init__(self, cov_estimator, tol=1e-14): self.cov_estimator = cov_estimator self.tol = tol - self.random_state = random_state def fit(self, X): """ @@ -106,27 +103,52 @@ def _check_fit(self): ): raise ValueError("The GaussianGenerator requires to be fit before sampling") - def sample(self): + def sample( + self, + X: np.ndarray = None, + y: np.ndarray = None, + n_samples: int = 1, + random_state=None, + ): """ Generate synthetic variables. This function generates synthetic variables that preserve the covariance structure of the original data while ensuring conditional independence. + Parameters + ---------- + X : ndarray, default = None + The complementary of the considered set of variables, $X^{-j}$. + y : ndarray, default = None + The group of variables to sample, $X^j$. + n_samples : int, default=1 + The number of samples to draw. + random_state : int, default=None + The random state to use for sampling. + Returns ------- X_tilde : 2D ndarray (n_samples, n_features) The synthetic variables. """ + if X is not None: + warnings.warn("this new X won't be used for sample the distribution") + if y is not None: + warnings.warn("this new y won't be used for sample the distribution") self._check_fit() - rng = check_random_state(self.random_state) + rng = check_random_state(random_state) n_samples, n_features = self.mu_tilde_.shape - # create a uniform noise for all the data - u_tilde = rng.randn(n_samples, n_features) + X_tildes = [] + for i in range(n_samples): + # create a uniform noise for all the data + u_tilde = rng.randn(n_samples, n_features) - # Equation 1.4 in barber2015controlling - X_tilde = self.mu_tilde_ + np.dot(u_tilde, self.sigma_tilde_decompose_) - return X_tilde + # Equation 1.4 in barber2015controlling + X_tildes.append( + self.mu_tilde_ + np.dot(u_tilde, self.sigma_tilde_decompose_) + ) + return X_tildes def _s_equi(sigma, tol=1e-14): diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index bd2a8f637..721054e6b 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -15,12 +15,9 @@ def test_gaussian_equi(): p = 50 rho = 0.5 X, _, _, _ = multivariate_simulation(n, p, rho=rho, seed=seed) - generator = GaussianKnockoffs( - cov_estimator=LedoitWolf(), - random_state=seed * 2, - ) + generator = GaussianKnockoffs(cov_estimator=LedoitWolf()) generator.fit(X=X) - X_tilde = generator.sample() + X_tilde = generator.sample(random_state=seed * 2)[0] assert X_tilde.shape == (n, p) assert np.linalg.norm(X - X_tilde) < np.sqrt(2) * np.linalg.norm(X) @@ -31,14 +28,11 @@ def test_gaussian_error(): n = 100 p = 50 X, y, beta, noise = multivariate_simulation(n, p, seed=seed) - generator = GaussianKnockoffs( - cov_estimator=LedoitWolf(), - random_state=seed * 2, - ) + generator = GaussianKnockoffs(cov_estimator=LedoitWolf()) with pytest.raises( ValueError, match="The GaussianGenerator requires to be fit before sampling" ): - generator.sample() + generator.sample(random_state=seed * 2) def test_s_equi_not_define_positive(): @@ -73,30 +67,29 @@ def test_s_equi_not_define_positive(): def test_reproducibility_sample(): """Test the repeatability of the samples""" X, _, _, _ = multivariate_simulation(100, 10, seed=0) - gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf(), random_state=0) + gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf()) gaussian_sampler.fit(X=X) - X_tilde_1 = gaussian_sampler.sample() - X_tilde_2 = gaussian_sampler.sample() + X_tilde_1 = gaussian_sampler.sample(random_state=0) + X_tilde_2 = gaussian_sampler.sample(random_state=0) assert np.array_equal(X_tilde_1, X_tilde_2) def test_randomness_sample_no_seed(): """Test the non repeatability of the samples when no seed""" X, _, _, _ = multivariate_simulation(100, 10, seed=0) - gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf(), random_state=None) + gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf()) gaussian_sampler.fit(X=X) - X_tilde_1 = gaussian_sampler.sample() - X_tilde_2 = gaussian_sampler.sample() + X_tilde_1 = gaussian_sampler.sample(random_state=None) + X_tilde_2 = gaussian_sampler.sample(random_state=None) assert not np.array_equal(X_tilde_1, X_tilde_2) def test_randomness_sample_rgn(): """Test the non repeatability of the samples when the usage of random generator""" X, _, _, _ = multivariate_simulation(100, 10, seed=0) - gaussian_sampler_rng = GaussianKnockoffs( - cov_estimator=LedoitWolf(), random_state=np.random.RandomState(0) - ) + rng = np.random.RandomState(0) + gaussian_sampler_rng = GaussianKnockoffs(cov_estimator=LedoitWolf()) gaussian_sampler_rng.fit(X=X) - X_tilde_1 = gaussian_sampler_rng.sample() - X_tilde_2 = gaussian_sampler_rng.sample() + X_tilde_1 = gaussian_sampler_rng.sample(random_state=rng) + X_tilde_2 = gaussian_sampler_rng.sample(random_state=rng) assert not np.array_equal(X_tilde_1, X_tilde_2) From 4dfd67753fd7e031f65b2ac15d025ae06023602b Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 15:27:46 +0200 Subject: [PATCH 22/44] fix test --- src/hidimstat/statistical_tools/gaussian_knockoffs.py | 2 +- test/statistical_tools/test_gaussian_knockoffs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index d664f6f63..c5f0f98cc 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -144,7 +144,7 @@ def sample( X_tildes = [] for i in range(n_samples): # create a uniform noise for all the data - u_tilde = rng.randn(n_samples, n_features) + u_tilde = rng.standard_normal([n_samples, n_features]) # Equation 1.4 in barber2015controlling X_tildes.append( diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index 721054e6b..12dfe883b 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -87,7 +87,7 @@ def test_randomness_sample_no_seed(): def test_randomness_sample_rgn(): """Test the non repeatability of the samples when the usage of random generator""" X, _, _, _ = multivariate_simulation(100, 10, seed=0) - rng = np.random.RandomState(0) + rng = np.random.default_rng(0) gaussian_sampler_rng = GaussianKnockoffs(cov_estimator=LedoitWolf()) gaussian_sampler_rng.fit(X=X) X_tilde_1 = gaussian_sampler_rng.sample(random_state=rng) From c5e1e76fae853e1782b66b5915256f102afaa93e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 15:29:59 +0200 Subject: [PATCH 23/44] fix isort --- test/statistical_tools/test_gaussian_knockoffs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index 12dfe883b..58e4cb571 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -1,11 +1,12 @@ import numpy as np import pytest +from sklearn.covariance import LedoitWolf + +from hidimstat._utils.scenario import multivariate_simulation from hidimstat.statistical_tools.gaussian_knockoffs import ( - _s_equi, GaussianKnockoffs, + _s_equi, ) -from hidimstat._utils.scenario import multivariate_simulation -from sklearn.covariance import LedoitWolf def test_gaussian_equi(): From 45bb053d5c8d806fd1dcda3c373a677ccb7cc4e2 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 15:35:37 +0200 Subject: [PATCH 24/44] fix examples --- examples/plot_pitfalls_permutation_importance.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/plot_pitfalls_permutation_importance.py b/examples/plot_pitfalls_permutation_importance.py index 7534865f8..a4e0ed310 100644 --- a/examples/plot_pitfalls_permutation_importance.py +++ b/examples/plot_pitfalls_permutation_importance.py @@ -14,9 +14,6 @@ import numpy as np -from hidimstat import CFI, PFI -from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler - rng = np.random.default_rng(0) # %% @@ -270,7 +267,7 @@ from matplotlib.lines import Line2D -from hidimstat.conditional_sampling import ConditionalSampler +from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler X_train, X_test = train_test_split( X, From 7e73c2aed8beddedbf9dd50dfc4d5ef53129e8ed Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 15:45:15 +0200 Subject: [PATCH 25/44] fix documentation --- docs/src/api.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/api.rst b/docs/src/api.rst index f11165742..f17b4ff76 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -52,7 +52,8 @@ Samplers :toctree: ./generated/api/class/ :template: class.rst - conditional_sampling.ConditionalSampler + statistical_tools.conditional_sampling.ConditionalSampler + statistical_tools.gaussian_knockoffs.GaussianKnockoffs Helper Functions ================ From 893670384407b85aae60c2844dd393b966ed0fd8 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 15:55:54 +0200 Subject: [PATCH 26/44] fix documentation --- src/hidimstat/statistical_tools/gaussian_knockoffs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index c5f0f98cc..53228fa83 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -16,7 +16,7 @@ class GaussianKnockoffs: ---------- cov_estimator : object Estimator for computing the covariance matrix. Must implement fit and - have a covariance_ attribute after fitting. + have a `covariance_` attribute after fitting. tol : float, default=1e-14 Tolerance for numerical stability in matrix computations. From 03f914ccdb3e380b682b0237768f9eb118772b93 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 16:48:51 +0200 Subject: [PATCH 27/44] update the user guide. --- docs/src/helper_methods.rst | 16 ++++++++++++++++ docs/src/user_guide.rst | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 docs/src/helper_methods.rst diff --git a/docs/src/helper_methods.rst b/docs/src/helper_methods.rst new file mode 100644 index 000000000..05e01d8fc --- /dev/null +++ b/docs/src/helper_methods.rst @@ -0,0 +1,16 @@ +.. _helper_methods: + + +============== +Helper Methods +============== + +Statistical Tools +----------------- + +The statistical tool box contains the statistical methods and functions required +by the variable importance methods but is not specific to one method or more general. +Actually, there are the functions of aggregation pvalues, conditional samplers and +the management of multiple testing functions. + +TODO: need to be completed. diff --git a/docs/src/user_guide.rst b/docs/src/user_guide.rst index 48bbe5e66..43cc76267 100644 --- a/docs/src/user_guide.rst +++ b/docs/src/user_guide.rst @@ -21,4 +21,4 @@ Table of contents visualization.rst grouping.rst high_dimension.rst - \ No newline at end of file + helper_methods.rst \ No newline at end of file From a18fa8fd9fa06e6cf68069504068d0e0c873427e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 17:01:21 +0200 Subject: [PATCH 28/44] try to fix the end of file --- docs/src/user_guide.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/user_guide.rst b/docs/src/user_guide.rst index 43cc76267..e490bf5b6 100644 --- a/docs/src/user_guide.rst +++ b/docs/src/user_guide.rst @@ -21,4 +21,4 @@ Table of contents visualization.rst grouping.rst high_dimension.rst - helper_methods.rst \ No newline at end of file + helper_methods.rst From 8ef5c063e7906d71f6dee5a9071dc889ad2920f2 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Mon, 13 Oct 2025 10:13:02 +0200 Subject: [PATCH 29/44] Apply suggestion from @bthirion Co-authored-by: bthirion --- test/statistical_tools/test_gaussian_knockoffs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index 58e4cb571..769a36ce5 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -36,7 +36,7 @@ def test_gaussian_error(): generator.sample(random_state=seed * 2) -def test_s_equi_not_define_positive(): +def test_s_equi_not_definite_positive(): """test the warning and error of s_equi function""" n = 10 tol = 1e-7 From 720cf4418642d62259421ea4f206439af3c7acdc Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 11:36:42 +0200 Subject: [PATCH 30/44] update makefile --- docs/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 711611b0d..cf76bb912 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,13 +2,14 @@ # # You can set these variables from the command line. -SPHINXOPTS = -j auto -W --keep-going -n +N_JOB = auto +SPHINXOPTS = -j ${N_JOB} -W --keep-going -n # -j/--jobs: N processes in parallel (auto=number of CPUs) # -W/--fail-on-warning: Turn warnings into errors. # --keep-going: Runs sphinx-build to completion and exits # with exit status 1 if errors are encountered. # -n/--nitpicky: This generates warnings for all missing references. -SPHINXBUILD = sphinx-build +SPHINXBUILD = "LANG=C python -X faulthandler -m sphinx" PAPER = BUILDDIR = ./_build From 410cefcfbb1a282d9e7c7a692e5b1dce623617ee Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 11:37:41 +0200 Subject: [PATCH 31/44] Add comment [skip tests] --- docs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Makefile b/docs/Makefile index cf76bb912..f07ec09a1 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -9,7 +9,7 @@ SPHINXOPTS = -j ${N_JOB} -W --keep-going -n # --keep-going: Runs sphinx-build to completion and exits # with exit status 1 if errors are encountered. # -n/--nitpicky: This generates warnings for all missing references. -SPHINXBUILD = "LANG=C python -X faulthandler -m sphinx" +SPHINXBUILD = "LANG=C python -X faulthandler -m sphinx" # https://github.com/sphinx-doc/sphinx/issues/11449 PAPER = BUILDDIR = ./_build From 8b9619439d668b555a5592ed63f28825cae4e9f7 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 11:50:46 +0200 Subject: [PATCH 32/44] update command --- tools/documentation/circleci/build_doc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/documentation/circleci/build_doc.sh b/tools/documentation/circleci/build_doc.sh index a7e99abf0..5dd7c2041 100755 --- a/tools/documentation/circleci/build_doc.sh +++ b/tools/documentation/circleci/build_doc.sh @@ -114,7 +114,7 @@ else fi # The pipefail is requested to propagate exit code -set -o pipefail && cd docs && make $make_args 2>&1 | tee ~/output_sphinx.txt +set -o pipefail && cd docs && make N_JOB=1 $make_args 2>&1 | tee ~/output_sphinx.txt cd - set +o pipefail From dde7b47b578644384a9b74bfd0666adf349dc86d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 11:51:20 +0200 Subject: [PATCH 33/44] [skip tests] From 611d0ecc42290cf275b9fbe5b9636c4b8e99a634 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 11:55:08 +0200 Subject: [PATCH 34/44] fix makefile --- docs/Makefile | 9 +++++---- tools/documentation/circleci/build_doc.sh | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index f07ec09a1..75d54320f 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -9,7 +9,8 @@ SPHINXOPTS = -j ${N_JOB} -W --keep-going -n # --keep-going: Runs sphinx-build to completion and exits # with exit status 1 if errors are encountered. # -n/--nitpicky: This generates warnings for all missing references. -SPHINXBUILD = "LANG=C python -X faulthandler -m sphinx" # https://github.com/sphinx-doc/sphinx/issues/11449 +SPHINXBUILD = LANG=C python -X faulthandler -m sphinx +# https://github.com/sphinx-doc/sphinx/issues/11449 PAPER = BUILDDIR = ./_build @@ -25,9 +26,9 @@ ifneq ($(EXAMPLES_PATTERN),) endif # User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif +# ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +# $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +# endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 diff --git a/tools/documentation/circleci/build_doc.sh b/tools/documentation/circleci/build_doc.sh index 5dd7c2041..a7e99abf0 100755 --- a/tools/documentation/circleci/build_doc.sh +++ b/tools/documentation/circleci/build_doc.sh @@ -114,7 +114,7 @@ else fi # The pipefail is requested to propagate exit code -set -o pipefail && cd docs && make N_JOB=1 $make_args 2>&1 | tee ~/output_sphinx.txt +set -o pipefail && cd docs && make $make_args 2>&1 | tee ~/output_sphinx.txt cd - set +o pipefail From c92911f8e41b39700908271c703dca044d501968 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 11:57:03 +0200 Subject: [PATCH 35/44] [skip tests] From 46b835862df81a36ae45dfdfbafa8a7fc3558bf1 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 12:08:55 +0200 Subject: [PATCH 36/44] change signature of sample --- src/hidimstat/knockoffs.py | 2 +- .../statistical_tools/gaussian_knockoffs.py | 20 +++++-------------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index be70a0fba..fd2d967ff 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -191,7 +191,7 @@ def model_x_knockoff( conditionnal_sampler = GaussianKnockoffs(cov_estimator, tol=tol_gauss) conditionnal_sampler.fit(X) X_tildes = conditionnal_sampler.sample( - n_samples=n_bootstraps, random_state=random_state + n_repeats=n_bootstraps, random_state=random_state ) results = parallel( diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index 53228fa83..59d0fe6ee 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -107,9 +107,7 @@ def _check_fit(self): def sample( self, - X: np.ndarray = None, - y: np.ndarray = None, - n_samples: int = 1, + n_repeats: int = 1, random_state=None, ): """ @@ -119,11 +117,7 @@ def sample( Parameters ---------- - X : ndarray, default = None - The complementary of the considered set of variables, $X^{-j}$. - y : ndarray, default = None - The group of variables to sample, $X^j$. - n_samples : int, default=1 + n_repeats : int, default=1 The number of samples to draw. random_state : int or None, default=None The random state to use for sampling. @@ -133,18 +127,14 @@ def sample( X_tilde : 2D ndarray (n_samples, n_features) The synthetic variables. """ - if X is not None: - warnings.warn("this new X won't be used for sample the distribution") - if y is not None: - warnings.warn("this new y won't be used for sample the distribution") self._check_fit() rng = check_random_state(random_state) - n_samples, n_features = self.mu_tilde_.shape + n_repeats, n_features = self.mu_tilde_.shape X_tildes = [] - for i in range(n_samples): + for i in range(n_repeats): # create a uniform noise for all the data - u_tilde = rng.standard_normal([n_samples, n_features]) + u_tilde = rng.standard_normal([n_repeats, n_features]) # Equation 1.4 in barber2015controlling X_tildes.append( From bd7ace1c78fdce9fd81b7fccd007d089a0cc96ac Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 13 Oct 2025 13:57:35 +0200 Subject: [PATCH 37/44] remove fix --- docs/Makefile | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 75d54320f..711611b0d 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,15 +2,13 @@ # # You can set these variables from the command line. -N_JOB = auto -SPHINXOPTS = -j ${N_JOB} -W --keep-going -n +SPHINXOPTS = -j auto -W --keep-going -n # -j/--jobs: N processes in parallel (auto=number of CPUs) # -W/--fail-on-warning: Turn warnings into errors. # --keep-going: Runs sphinx-build to completion and exits # with exit status 1 if errors are encountered. # -n/--nitpicky: This generates warnings for all missing references. -SPHINXBUILD = LANG=C python -X faulthandler -m sphinx -# https://github.com/sphinx-doc/sphinx/issues/11449 +SPHINXBUILD = sphinx-build PAPER = BUILDDIR = ./_build @@ -26,9 +24,9 @@ ifneq ($(EXAMPLES_PATTERN),) endif # User-friendly check for sphinx-build -# ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -# $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -# endif +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 From 5cd94f38ebb56f53ba0da7ed72a13372498cf6b0 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 14 Oct 2025 12:04:15 +0200 Subject: [PATCH 38/44] remove documentation --- docs/src/helper_methods.rst | 16 ---------------- docs/src/user_guide.rst | 1 - 2 files changed, 17 deletions(-) delete mode 100644 docs/src/helper_methods.rst diff --git a/docs/src/helper_methods.rst b/docs/src/helper_methods.rst deleted file mode 100644 index 05e01d8fc..000000000 --- a/docs/src/helper_methods.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. _helper_methods: - - -============== -Helper Methods -============== - -Statistical Tools ------------------ - -The statistical tool box contains the statistical methods and functions required -by the variable importance methods but is not specific to one method or more general. -Actually, there are the functions of aggregation pvalues, conditional samplers and -the management of multiple testing functions. - -TODO: need to be completed. diff --git a/docs/src/user_guide.rst b/docs/src/user_guide.rst index e490bf5b6..ef084b7c1 100644 --- a/docs/src/user_guide.rst +++ b/docs/src/user_guide.rst @@ -21,4 +21,3 @@ Table of contents visualization.rst grouping.rst high_dimension.rst - helper_methods.rst From e3450ec439d39b499ac9e91db635428b3e540c42 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 15 Oct 2025 11:07:05 +0200 Subject: [PATCH 39/44] Update src/hidimstat/statistical_tools/gaussian_knockoffs.py Co-authored-by: Joseph Paillard --- src/hidimstat/statistical_tools/gaussian_knockoffs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index 59d0fe6ee..03ef19fed 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -124,7 +124,7 @@ def sample( Returns ------- - X_tilde : 2D ndarray (n_samples, n_features) + X_tilde : 3D ndarray (n_repeats, n_samples, n_features) The synthetic variables. """ self._check_fit() From b5847a190003c79fd0ddec8ad10439875d3db9e6 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 15 Oct 2025 11:07:16 +0200 Subject: [PATCH 40/44] Update src/hidimstat/statistical_tools/gaussian_knockoffs.py Co-authored-by: Joseph Paillard --- src/hidimstat/statistical_tools/gaussian_knockoffs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index 03ef19fed..b6633dcb2 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -140,7 +140,7 @@ def sample( X_tildes.append( self.mu_tilde_ + np.dot(u_tilde, self.sigma_tilde_decompose_) ) - return X_tildes + return np.stack(X_tildes) def _s_equi(sigma, tol=1e-14): From 9f1ca5a946a6d697a67ef2084070c009d86a1b23 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 15 Oct 2025 11:07:29 +0200 Subject: [PATCH 41/44] Update src/hidimstat/statistical_tools/gaussian_knockoffs.py Co-authored-by: Joseph Paillard --- src/hidimstat/statistical_tools/gaussian_knockoffs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index b6633dcb2..f1b846c20 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -18,7 +18,8 @@ class GaussianKnockoffs: Estimator for computing the covariance matrix. Must implement fit and have a `covariance_` attribute after fitting. tol : float, default=1e-14 - Tolerance for numerical stability in matrix computations. + Tolerance threshold. While the smallest eigenvalue of :math:`2\Sigma - diag(S)` + is smaller than this threshold, S is incrementally increased. Attributes ---------- From 285690c45e31b37bdb833fad036c2d2129601976 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 15 Oct 2025 11:07:38 +0200 Subject: [PATCH 42/44] Update src/hidimstat/statistical_tools/gaussian_knockoffs.py Co-authored-by: Joseph Paillard --- src/hidimstat/statistical_tools/gaussian_knockoffs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index f1b846c20..b38bddc8f 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -119,7 +119,7 @@ def sample( Parameters ---------- n_repeats : int, default=1 - The number of samples to draw. + The number of sets of Gaussian knockoff variables random_state : int or None, default=None The random state to use for sampling. From 2f8b0ba1f656b6f1e3cef8f09fee5cc55eccb247 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Oct 2025 11:12:34 +0200 Subject: [PATCH 43/44] format file --- src/hidimstat/statistical_tools/gaussian_knockoffs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/statistical_tools/gaussian_knockoffs.py b/src/hidimstat/statistical_tools/gaussian_knockoffs.py index b38bddc8f..7f9c85ea9 100644 --- a/src/hidimstat/statistical_tools/gaussian_knockoffs.py +++ b/src/hidimstat/statistical_tools/gaussian_knockoffs.py @@ -18,8 +18,8 @@ class GaussianKnockoffs: Estimator for computing the covariance matrix. Must implement fit and have a `covariance_` attribute after fitting. tol : float, default=1e-14 - Tolerance threshold. While the smallest eigenvalue of :math:`2\Sigma - diag(S)` - is smaller than this threshold, S is incrementally increased. + Tolerance threshold. While the smallest eigenvalue of :math:`2\Sigma - diag(S)` + is smaller than this threshold, S is incrementally increased. Attributes ---------- From 43be1835bbbbc57b51a7e53ade2558a2c485f146 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 15 Oct 2025 11:16:42 +0200 Subject: [PATCH 44/44] add test with repeat --- .../test_gaussian_knockoffs.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/test/statistical_tools/test_gaussian_knockoffs.py b/test/statistical_tools/test_gaussian_knockoffs.py index 769a36ce5..db07ff4e6 100644 --- a/test/statistical_tools/test_gaussian_knockoffs.py +++ b/test/statistical_tools/test_gaussian_knockoffs.py @@ -75,6 +75,16 @@ def test_reproducibility_sample(): assert np.array_equal(X_tilde_1, X_tilde_2) +def test_reproducibility_sample_repeat(): + """Test the repeatability of the samples""" + X, _, _, _ = multivariate_simulation(100, 10, seed=0) + gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf()) + gaussian_sampler.fit(X=X) + X_tilde_1 = gaussian_sampler.sample(n_repeats=3, random_state=0) + X_tilde_2 = gaussian_sampler.sample(n_repeats=3, random_state=0) + assert np.array_equal(X_tilde_1, X_tilde_2) + + def test_randomness_sample_no_seed(): """Test the non repeatability of the samples when no seed""" X, _, _, _ = multivariate_simulation(100, 10, seed=0) @@ -85,6 +95,16 @@ def test_randomness_sample_no_seed(): assert not np.array_equal(X_tilde_1, X_tilde_2) +def test_randomness_sample_no_seed_repeat(): + """Test the non repeatability of the samples when no seed""" + X, _, _, _ = multivariate_simulation(100, 10, seed=0) + gaussian_sampler = GaussianKnockoffs(cov_estimator=LedoitWolf()) + gaussian_sampler.fit(X=X) + X_tilde_1 = gaussian_sampler.sample(n_repeats=3, random_state=None) + X_tilde_2 = gaussian_sampler.sample(n_repeats=3, random_state=None) + assert not np.array_equal(X_tilde_1, X_tilde_2) + + def test_randomness_sample_rgn(): """Test the non repeatability of the samples when the usage of random generator""" X, _, _, _ = multivariate_simulation(100, 10, seed=0) @@ -94,3 +114,14 @@ def test_randomness_sample_rgn(): X_tilde_1 = gaussian_sampler_rng.sample(random_state=rng) X_tilde_2 = gaussian_sampler_rng.sample(random_state=rng) assert not np.array_equal(X_tilde_1, X_tilde_2) + + +def test_randomness_sample_rgn_repeat(): + """Test the non repeatability of the samples when the usage of random generator""" + X, _, _, _ = multivariate_simulation(100, 10, seed=0) + rng = np.random.default_rng(0) + gaussian_sampler_rng = GaussianKnockoffs(cov_estimator=LedoitWolf()) + gaussian_sampler_rng.fit(X=X) + X_tilde_1 = gaussian_sampler_rng.sample(n_repeats=3, random_state=rng) + X_tilde_2 = gaussian_sampler_rng.sample(n_repeats=3, random_state=rng) + assert not np.array_equal(X_tilde_1, X_tilde_2)