Skip to content

Commit 3e2fbe0

Browse files
committed
formart utils
1 parent 6217732 commit 3e2fbe0

File tree

1 file changed

+174
-0
lines changed

1 file changed

+174
-0
lines changed

hidimstat/utils.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
import numpy as np
2+
3+
4+
def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False):
5+
"""
6+
This function implements the quantile aggregation method for p-values.
7+
"""
8+
# if pvalues are one-dimensional, do nothing
9+
if pvals.shape[0] == 1:
10+
return pvals[0]
11+
if adaptive:
12+
return _adaptive_quantile_aggregation(pvals, gamma_min)
13+
else:
14+
return _fixed_quantile_aggregation(pvals, gamma)
15+
16+
17+
def fdr_threshold(pvals, fdr=0.1, method="bhq", reshaping_function=None):
18+
if method == "bhq":
19+
return _bhq_threshold(pvals, fdr=fdr)
20+
elif method == "bhy":
21+
return _bhy_threshold(pvals, fdr=fdr, reshaping_function=reshaping_function)
22+
elif method == "ebh":
23+
return _ebh_threshold(pvals, fdr=fdr)
24+
else:
25+
raise ValueError("{} is not support FDR control method".format(method))
26+
27+
28+
def cal_fdp_power(selected, non_zero_index, r_index=False):
29+
"""Calculate power and False Discovery Proportion
30+
31+
Parameters
32+
----------
33+
selected: list index (in R format) of selected non-null variables
34+
non_zero_index: true index of non-null variables
35+
r_index : True if the index is taken from rpy2 inference
36+
37+
Returns
38+
-------
39+
fdp: False Discoveries Proportion
40+
power: percentage of correctly selected variables over total number of
41+
non-null variables
42+
43+
"""
44+
# selected is the index list in R and will be different from index of
45+
# python by 1 unit
46+
47+
if selected.size == 0:
48+
return 0.0, 0.0
49+
50+
n_positives = len(non_zero_index)
51+
52+
if r_index:
53+
selected = selected - 1
54+
55+
true_positive = np.intersect1d(selected, non_zero_index)
56+
false_positive = np.setdiff1d(selected, true_positive)
57+
58+
fdp = len(false_positive) / max(1, len(selected))
59+
power = min(len(true_positive), n_positives) / n_positives
60+
61+
return fdp, power
62+
63+
64+
def _bhq_threshold(pvals, fdr=0.1):
65+
"""Standard Benjamini-Hochberg for controlling False discovery rate"""
66+
n_features = len(pvals)
67+
pvals_sorted = np.sort(pvals)
68+
selected_index = 2 * n_features
69+
for i in range(n_features - 1, -1, -1):
70+
if pvals_sorted[i] <= fdr * (i + 1) / n_features:
71+
selected_index = i
72+
break
73+
if selected_index <= n_features:
74+
return pvals_sorted[selected_index]
75+
else:
76+
return -1.0
77+
78+
79+
def _ebh_threshold(evals, fdr=0.1):
80+
"""e-BH procedure for FDR control (see Wang and Ramdas 2021)"""
81+
n_features = len(evals)
82+
evals_sorted = -np.sort(-evals) # sort in descending order
83+
selected_index = 2 * n_features
84+
for i in range(n_features - 1, -1, -1):
85+
if evals_sorted[i] >= n_features / (fdr * (i + 1)):
86+
selected_index = i
87+
break
88+
if selected_index <= n_features:
89+
return evals_sorted[selected_index]
90+
else:
91+
return np.infty
92+
93+
94+
def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1):
95+
"""Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input
96+
shape function. Reference: Ramdas et al (2017)
97+
"""
98+
n_features = len(pvals)
99+
pvals_sorted = np.sort(pvals)
100+
selected_index = 2 * n_features
101+
# Default value for reshaping function -- defined in
102+
# Benjamini & Yekutieli (2001)
103+
if reshaping_function is None:
104+
temp = np.arange(n_features)
105+
sum_inverse = np.sum(1 / (temp + 1))
106+
return _bhq_threshold(pvals, fdr / sum_inverse)
107+
else:
108+
for i in range(n_features - 1, -1, -1):
109+
if pvals_sorted[i] <= fdr * reshaping_function(i + 1) / n_features:
110+
selected_index = i
111+
break
112+
if selected_index <= n_features:
113+
return pvals_sorted[selected_index]
114+
else:
115+
return -1.0
116+
117+
118+
def _fixed_quantile_aggregation(pvals, gamma=0.5):
119+
"""Quantile aggregation function based on Meinshausen et al (2008)
120+
121+
Parameters
122+
----------
123+
pvals : 2D ndarray (n_sampling_with_repetition, n_test)
124+
p-value (adjusted)
125+
126+
gamma : float
127+
Percentile value used for aggregation.
128+
129+
Returns
130+
-------
131+
1D ndarray (n_tests, )
132+
Vector of aggregated p-values
133+
"""
134+
converted_score = (1 / gamma) * (np.percentile(pvals, q=100 * gamma, axis=0))
135+
136+
return np.minimum(1, converted_score)
137+
138+
139+
def _adaptive_quantile_aggregation(pvals, gamma_min=0.05):
140+
"""adaptive version of the quantile aggregation method, Meinshausen et al.
141+
(2008)"""
142+
gammas = np.arange(gamma_min, 1.05, 0.05)
143+
list_Q = np.array([_fixed_quantile_aggregation(pvals, gamma) for gamma in gammas])
144+
145+
return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0))
146+
147+
148+
def _lambda_max(X, y, use_noise_estimate=True):
149+
"""Calculation of lambda_max, the smallest value of regularization parameter in
150+
lasso program for non-zero coefficient
151+
"""
152+
n_samples, _ = X.shape
153+
154+
if not use_noise_estimate:
155+
return np.max(np.dot(X.T, y)) / n_samples
156+
157+
norm_y = np.linalg.norm(y, ord=2)
158+
sigma_0 = (norm_y / np.sqrt(n_samples)) * 1e-3
159+
sig_star = max(sigma_0, norm_y / np.sqrt(n_samples))
160+
161+
return np.max(np.abs(np.dot(X.T, y)) / (n_samples * sig_star))
162+
163+
164+
def _check_vim_predict_method(method):
165+
"""Check if the method is a valid method for variable importance measure
166+
prediction"""
167+
if method in ["predict", "predict_proba", "decision_function", "transform"]:
168+
return method
169+
else:
170+
raise ValueError(
171+
"The method {} is not a valid method for variable importance measure prediction".format(
172+
method
173+
)
174+
)

0 commit comments

Comments
 (0)