Commit 9c2d77c

committed
add method for selection based on FDR
1 parent 3ecce71 commit 9c2d77c

File tree

1 file changed: +189 -9 lines changed

src/hidimstat/base_variable_importance.py

Lines changed: 189 additions & 9 deletions
@@ -3,6 +3,9 @@
 from sklearn.base import BaseEstimator
 import numpy as np

+from hidimstat.statistical_tools.multiple_testing import fdr_threshold
+from hidimstat.statistical_tools.aggregation import quantile_aggregation
+

 class BaseVariableImportance(BaseEstimator):
     """
@@ -34,15 +37,22 @@ def __init__(self):
         self.importances_ = None
         self.pvalues_ = None
         self.selections_ = None
+        self.test_scores_ = None
+
+    def _check_importance(self):
+        """
+        Checks if the importance scores and p-values have been computed.
+        """
+        if self.importances_ is None:
+            raise ValueError(
+                "The importances need to be called before calling this method"
+            )

     def selection(
         self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
     ):
         """
         Selects features based on variable importance.
-        In case several arguments are different from None,
-        the returned selection is the conjunction of all of them.
-
         Parameters
         ----------
         k_best : int, optional, default=None
@@ -53,7 +63,6 @@ def selection(
             Selects features with importance scores above the specified threshold.
         threshold_pvalue : float, optional, default=None
             Selects features with p-values below the specified threshold.
-
         Returns
         -------
         selection : array-like of shape (n_features,)
@@ -123,11 +132,182 @@ def selection(

         return self.selections_

-    def _check_importance(self):
+    def selection_fdr(
+        self,
+        fdr,
+        fdr_control="bhq",
+        evalues=False,
+        reshaping_function=None,
+        adaptive_aggregation=False,
+        gamma=0.5,
+    ):
         """
-        Checks if the importance scores and p-values have been computed.
+        Performs feature selection based on False Discovery Rate (FDR) control.
+
+        This method selects features by controlling the FDR using either p-values or e-values
+        derived from test scores. It supports different FDR control methods and optional
+        adaptive aggregation of the statistical values.
+
+        Parameters
+        ----------
+        fdr : float
+            The target false discovery rate level (between 0 and 1).
+        fdr_control : str, default="bhq"
+            The FDR control method to use. Options are:
+            - "bhq": Benjamini-Hochberg procedure
+            - "bhy": Benjamini-Hochberg-Yekutieli procedure
+            - "ebh": e-BH procedure (only for e-values)
+        evalues : bool, default=False
+            If True, uses e-values for selection. If False, uses p-values.
+        reshaping_function : callable, default=None
+            Reshaping function for the BHY method; by default the sum of reciprocals is used.
+        adaptive_aggregation : bool, default=False
+            If True, uses adaptive weights for p-value aggregation.
+            Only applicable when evalues=False.
+        gamma : float, default=0.5
+            The gamma parameter for quantile aggregation of p-values.
+            Only used when evalues=False.
+
+        Returns
+        -------
+        numpy.ndarray
+            Boolean array indicating selected features (True for selected, False otherwise).
+
+        Raises
+        ------
+        AssertionError
+            If test_scores_ is None or if an incompatible combination of parameters is provided.
         """
-        if self.importances_ is None:
-            raise ValueError(
-                "The importances need to be called before calling this method"
+        self._check_importance()
+        assert (
+            self.test_scores_ is not None
+        ), "this method doesn't support selection based on FDR"
+
+        if not evalues:
+            assert fdr_control != "ebh", "for p-values, the FDR control can't be 'ebh'"
+            pvalues = np.array(
+                [
+                    _empirical_pval(self.test_scores_[i])
+                    for i in range(len(self.test_scores_))
+                ]
             )
+            aggregated_pval = quantile_aggregation(
+                pvalues, gamma=gamma, adaptive=adaptive_aggregation
+            )
+            threshold = fdr_threshold(
+                aggregated_pval,
+                fdr=fdr,
+                method=fdr_control,
+                reshaping_function=reshaping_function,
+            )
+            selected = aggregated_pval <= threshold
+        else:
+            assert fdr_control == "ebh", "for e-values, the FDR control needs to be 'ebh'"
+            ko_threshold = []
+            for test_score in self.test_scores_:
+                ko_threshold.append(_estimated_threshold(test_score, fdr=fdr))
+            evalues = np.array(
+                [
+                    _empirical_eval(self.test_scores_[i], ko_threshold[i])
+                    for i in range(len(self.test_scores_))
+                ]
+            )
+            aggregated_eval = np.mean(evalues, axis=0)
+            threshold = fdr_threshold(
+                aggregated_eval,
+                fdr=fdr,
+                method=fdr_control,
+                reshaping_function=reshaping_function,
+            )
+            selected = aggregated_eval >= threshold
+        return selected
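
For context, a minimal usage sketch of the new method (illustrative only, not part of this commit): it assumes the class can be imported from hidimstat.base_variable_importance (inferred from the file path above), that test_scores_ holds one row of test statistics per knockoff draw, and it fakes the fitted state by assigning the attributes directly on a bare instance.

import numpy as np

# Assumed import path, inferred from src/hidimstat/base_variable_importance.py
from hidimstat.base_variable_importance import BaseVariableImportance

rng = np.random.default_rng(0)

# Fake knockoff-style test statistics: 5 draws x 20 features,
# where the first 3 features get clearly positive scores.
test_scores = rng.normal(size=(5, 20))
test_scores[:, :3] += 4.0

vi = BaseVariableImportance()
vi.importances_ = test_scores.mean(axis=0)  # placeholder so _check_importance() passes
vi.test_scores_ = test_scores

# p-value route: aggregate empirical p-values, then Benjamini-Hochberg
selected_bhq = vi.selection_fdr(fdr=0.1, fdr_control="bhq")

# e-value route: requires the e-BH procedure
selected_ebh = vi.selection_fdr(fdr=0.1, fdr_control="ebh", evalues=True)

print(selected_bhq.sum(), selected_ebh.sum())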
+
+
+def _estimated_threshold(test_score, fdr=0.1):
+    """
+    Calculate the threshold based on the procedure stated in the knockoff article.
+    Original code:
+    https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R
+    Parameters
+    ----------
+    test_score : 1D ndarray, shape (n_features, )
+        Vector of test statistics.
+    fdr : float
+        Desired controlled FDR (false discovery rate) level.
+    Returns
+    -------
+    threshold : float or np.inf
+        Threshold level.
+    """
+    offset = 1  # An offset of 1 corresponds to the knockoff+ procedure.
+
+    threshold_mesh = np.sort(np.abs(test_score[test_score != 0]))
+    threshold_mesh = np.concatenate(
+        [[0], threshold_mesh, [np.inf]]
+    )  # if there is no solution, the threshold is inf
+    # find the smallest value of t that achieves the target FDR
+    # Equation 1.8 of barber2015controlling and 3.10 in Candès 2018
+    threshold = 0.0
+    for threshold in threshold_mesh:
+        false_pos = np.sum(test_score <= -threshold)
+        selected = np.sum(test_score >= threshold)
+        if (offset + false_pos) / np.maximum(selected, 1) <= fdr:
+            break
+    return threshold
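
To make the threshold rule concrete, here is a standalone sketch of the same loop on a toy statistic vector (NumPy only; the numbers and the loose fdr target are made up so the toy data yields a finite threshold).

import numpy as np

test_score = np.array([5.0, 4.0, 3.0, 0.5, -0.4, 0.2, -0.1, 0.0])
fdr, offset = 0.5, 1  # offset=1 corresponds to knockoff+

threshold = np.inf  # returned when no t satisfies the FDR bound
for t in np.sort(np.abs(test_score[test_score != 0])):
    false_pos = np.sum(test_score <= -t)  # estimated false positives at level t
    selected = np.sum(test_score >= t)    # number of selected features at level t
    if (offset + false_pos) / max(selected, 1) <= fdr:
        threshold = t
        break

print(threshold)  # 0.2 for this toy vector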
+
+
+def _empirical_pval(test_score):
+    """
+    Compute the empirical p-values from the test based on knockoff+.
+    Parameters
+    ----------
+    test_score : 1D ndarray, shape (n_features, )
+        Vector of test statistics.
+    Returns
+    -------
+    pvals : 1D ndarray, shape (n_features, )
+        Vector of empirical p-values.
+    """
+    pvals = []
+    n_features = test_score.size
+
+    offset = 1  # An offset of 1 corresponds to the knockoff+ procedure.
+
+    test_score_inv = -test_score
+    for i in range(n_features):
+        if test_score[i] <= 0:
+            pvals.append(1)
+        else:
+            pvals.append(
+                (offset + np.sum(test_score_inv >= test_score[i])) / n_features
+            )
+
+    return np.array(pvals)
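
As a quick illustration, the same p-value computation written inline on a toy vector (it mirrors _empirical_pval above rather than importing the private helper).

import numpy as np

test_score = np.array([3.0, 1.0, -2.0, 0.5])
n_features = test_score.size
offset = 1  # knockoff+

# p-value of a positive statistic w is (1 + #{j : -W_j >= w}) / n_features;
# non-positive statistics get a p-value of 1
pvals = np.array([
    1.0 if w <= 0 else (offset + np.sum(-test_score >= w)) / n_features
    for w in test_score
])
print(pvals)  # 0.25, 0.5, 1.0, 0.5 for this toy vector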
+
+
+def _empirical_eval(test_score, ko_threshold):
+    """
+    Compute the empirical e-values from the test based on knockoff.
+    Parameters
+    ----------
+    test_score : 1D ndarray, shape (n_features, )
+        Vector of test statistics.
+    ko_threshold : float
+        Threshold level.
+    Returns
+    -------
+    evals : 1D ndarray, shape (n_features, )
+        Vector of empirical e-values.
+    """
+    evals = []
+    n_features = test_score.size
+
+    offset = 1  # An offset of 1 corresponds to the knockoff+ procedure.
+
+    for i in range(n_features):
+        if test_score[i] < ko_threshold:
+            evals.append(0)
+        else:
+            evals.append(n_features / (offset + np.sum(test_score <= -ko_threshold)))
+
+    return np.array(evals)
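
An analogous inline sketch for the e-values (again mirroring the helper on toy data; the threshold 0.2 is the one found in the threshold sketch above).

import numpy as np

test_score = np.array([5.0, 4.0, 3.0, 0.5, -0.4, 0.2, -0.1, 0.0])
ko_threshold = 0.2
n_features = test_score.size
offset = 1  # knockoff+

# e-value is 0 below the threshold and n / (1 + #{j : W_j <= -threshold}) otherwise
evals = np.where(
    test_score >= ko_threshold,
    n_features / (offset + np.sum(test_score <= -ko_threshold)),
    0.0,
)
print(evals)  # 4.0 for the features at or above the threshold, 0.0 otherwise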
