Skip to content

Commit 190300d

Browse files
committed
gaussian distribution reformat
1 parent 3ecce71 commit 190300d

File tree

3 files changed

+279
-191
lines changed

3 files changed

+279
-191
lines changed

src/hidimstat/gaussian_knockoff.py

Lines changed: 0 additions & 191 deletions
This file was deleted.
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
import warnings
2+
3+
import numpy as np
4+
from sklearn.preprocessing import StandardScaler
5+
from sklearn.utils import check_random_state
6+
7+
8+
class GaussianDistribution:
9+
"""
10+
Generator for second-order Gaussian variables using the equi-correlated method.
11+
Creates synthetic variables that preserve the covariance structure of the original
12+
variables while ensuring conditional independence between the original and synthetic data.
13+
Parameters
14+
----------
15+
cov_estimator : object
16+
Estimator for computing the covariance matrix. Must implement fit and
17+
have a covariance_ attribute after fitting.
18+
random_state : int or None, default=None
19+
Random seed for generating synthetic data.
20+
centered : bool, default=False
21+
Whether to center and scale the input data before generating synthetic variables.
22+
tol : float, default=1e-14
23+
Tolerance for numerical stability in matrix computations.
24+
Attributes
25+
----------
26+
mu_tilde_ : ndarray of shape (n_samples, n_features)
27+
Mean matrix for generating synthetic variables.
28+
sigma_tilde_decompose_ : ndarray of shape (n_features, n_features)
29+
Cholesky decomposition of the synthetic covariance matrix.
30+
References
31+
----------
32+
.. footbibliography::
33+
"""
34+
35+
def __init__(self, cov_estimator, random_state=None, centered=False, tol=1e-14):
36+
self.cov_estimator = cov_estimator
37+
self.centered = centered
38+
self.tol = tol
39+
self.rng = check_random_state(random_state)
40+
41+
def fit(self, X):
42+
"""
43+
Fit the Gaussian synthetic variable generator.
44+
This method estimates the parameters needed to generate Gaussian synthetic variables
45+
based on the input data. It follows a methodology for creating second-order
46+
synthetic variables that preserve the covariance structure.
47+
Parameters
48+
----------
49+
X : array-like of shape (n_samples, n_features)
50+
The input samples used to estimate the parameters for synthetic variable generation.
51+
The data is assumed to follow a Gaussian distribution.
52+
Returns
53+
-------
54+
self : object
55+
Returns the instance itself.
56+
Notes
57+
-----
58+
The method implements the following steps:
59+
1. Centers and scales the data if specified
60+
2. Estimates mean and covariance of input data
61+
3. Computes parameters for synthetic variable generation
62+
"""
63+
_, n_features = X.shape
64+
if self.centered:
65+
X_ = StandardScaler().fit_transform(X)
66+
else:
67+
X_ = X
68+
69+
# estimation of X distribution
70+
# original implementation:
71+
# https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/create_second_order.R
72+
mu = X_.mean(axis=0)
73+
sigma = self.cov_estimator.fit(X_).covariance_
74+
75+
diag_s = np.diag(_s_equi(sigma, tol=self.tol))
76+
77+
sigma_inv_s = np.linalg.solve(sigma, diag_s)
78+
79+
# First part on the RHS of equation 1.4 in barber2015controlling
80+
self.mu_tilde_ = X - np.dot(X - mu, sigma_inv_s)
81+
# To calculate the Cholesky decomposition later on
82+
sigma_tilde = 2 * diag_s - diag_s.dot(sigma_inv_s)
83+
# test is the matrix is positive definite
84+
while not np.all(np.linalg.eigvalsh(sigma_tilde) > self.tol):
85+
sigma_tilde += 1e-10 * np.eye(n_features)
86+
warnings.warn(
87+
"The conditional covariance matrix for knockoffs is not positive "
88+
"definite. Adding minor positive value to the matrix.",
89+
UserWarning,
90+
)
91+
92+
self.sigma_tilde_decompose_ = np.linalg.cholesky(sigma_tilde)
93+
94+
return self
95+
96+
def _check_fit(self):
97+
"""
98+
Check if the model has been fit before performing analysis.
99+
Raises
100+
------
101+
ValueError
102+
If any of the required attributes are missing, indicating the model
103+
hasn't been fit before generating synthetic variables.
104+
"""
105+
if not hasattr(self, "mu_tilde_") or not hasattr(
106+
self, "sigma_tilde_decompose_"
107+
):
108+
raise ValueError("The GaussianGenerator requires to be fit before simulate")
109+
110+
def sample(self):
111+
"""
112+
Generate synthetic variables.
113+
This function generates synthetic variables that preserve the covariance structure
114+
of the original data while ensuring conditional independence.
115+
Returns
116+
-------
117+
X_tilde : 2D ndarray (n_samples, n_features)
118+
The synthetic variables.
119+
"""
120+
self._check_fit()
121+
n_samples, n_features = self.mu_tilde_.shape
122+
123+
# create a uniform noise for all the data
124+
u_tilde = self.rng.randn(n_samples, n_features)
125+
126+
# Equation 1.4 in barber2015controlling
127+
X_tilde = self.mu_tilde_ + np.dot(u_tilde, self.sigma_tilde_decompose_)
128+
return X_tilde
129+
130+
131+
def _s_equi(sigma, tol=1e-14):
132+
"""
133+
Estimate the diagonal matrix of correlation between real
134+
and knockoff variables using the equi-correlated equation.
135+
This function estimates the diagonal matrix of correlation
136+
between real and knockoff variables using the equi-correlated
137+
equation described in :footcite:t:`barber2015controlling` and
138+
:footcite:t:`candes2018panning`. It takes as input the empirical
139+
covariance matrix sigma and a tolerance value tol,
140+
and returns a vector of diagonal values of the estimated
141+
matrix diag{s}.
142+
Parameters
143+
----------
144+
sigma : 2D ndarray (n_features, n_features)
145+
The empirical covariance matrix calculated from
146+
the original design matrix.
147+
tol : float, optional
148+
A tolerance value used for numerical stability in the calculation
149+
of the eigenvalues of the correlation matrix.
150+
Returns
151+
-------
152+
1D ndarray (n_features, )
153+
A vector of diagonal values of the estimated matrix diag{s}.
154+
Raises
155+
------
156+
Exception
157+
If the covariance matrix is not positive-definite.
158+
"""
159+
n_features = sigma.shape[0]
160+
161+
# Convert covariance matrix to correlation matrix
162+
# as example see cov2corr from statsmodels
163+
features_std = np.sqrt(np.diag(sigma))
164+
scale = np.outer(features_std, features_std)
165+
corr_matrix = sigma / scale
166+
167+
eig_value = np.linalg.eigvalsh(corr_matrix)
168+
lambda_min = np.min(eig_value[0])
169+
# check if the matrix is positive-defined
170+
if lambda_min <= 0:
171+
raise Exception("The covariance matrix is not positive-definite.")
172+
173+
s = np.ones(n_features) * min(2 * lambda_min, 1)
174+
175+
psd = np.all(np.linalg.eigvalsh(2 * corr_matrix - np.diag(s)) > tol)
176+
s_eps = 0
177+
while not psd:
178+
if s_eps == 0:
179+
s_eps = np.finfo(type(s[0])).eps # avoid zeros
180+
else:
181+
s_eps *= 10
182+
# if all eigval > 0 then the matrix is positive define
183+
psd = np.all(
184+
np.linalg.eigvalsh(2 * corr_matrix - np.diag(s * (1 - s_eps))) > tol
185+
)
186+
warnings.warn(
187+
"The equi-correlated matrix for knockoffs is not positive "
188+
f"definite. Reduce the value of distance by {s_eps}.",
189+
UserWarning,
190+
)
191+
192+
s = s * (1 - s_eps)
193+
194+
return s * np.diag(sigma)

0 commit comments

Comments
 (0)