from sklearn.base import BaseEstimator
import numpy as np

+from hidimstat.statistical_tools.multiple_testing import fdr_threshold
+from hidimstat.statistical_tools.aggregation import quantile_aggregation
+

class BaseVariableImportance(BaseEstimator):
    """
@@ -34,15 +37,22 @@ def __init__(self):
        self.importances_ = None
        self.pvalues_ = None
        self.selections_ = None
+        self.test_scores_ = None
+
+    def _check_importance(self):
+        """
+        Checks if the importance scores and p-values have been computed.
+        """
+        if self.importances_ is None:
+            raise ValueError(
+                "The importances need to be called before calling this method"
+            )

    def selection(
        self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
    ):
        """
        Selects features based on variable importance.
-        In case several arguments are different from None,
-        the returned selection is the conjunction of all of them.
-
        Parameters
        ----------
        k_best : int, optional, default=None
@@ -53,7 +63,6 @@ def selection(
            Selects features with importance scores above the specified threshold.
        threshold_pvalue : float, optional, default=None
            Selects features with p-values below the specified threshold.
-
        Returns
        -------
        selection : array-like of shape (n_features,)
@@ -123,11 +132,182 @@ def selection(

        return self.selections_

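A usage sketch, not from the patch: it assumes a concrete subclass has populated `importances_` after fitting, and sets the attribute by hand purely for illustration (the import path is also assumed).

    import numpy as np
    from hidimstat import BaseVariableImportance  # import path assumed for illustration

    vi = BaseVariableImportance()
    vi.importances_ = np.array([0.10, 0.80, 0.05, 0.30])  # toy scores set by hand
    print(vi.selection(k_best=2))        # keep the two highest-scoring features
    print(vi.selection(threshold=0.2))   # keep features with importance above 0.2
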
-    def _check_importance(self):
+    def selection_fdr(
+        self,
+        fdr,
+        fdr_control="bhq",
+        evalues=False,
+        reshaping_function=None,
+        adaptive_aggregation=False,
+        gamma=0.5,
+    ):
        """
-        Checks if the importance scores and p-values have been computed.
+        Performs feature selection based on False Discovery Rate (FDR) control.
+
+        This method selects features by controlling the FDR using either p-values or e-values
+        derived from test scores. It supports different FDR control methods and optional
+        adaptive aggregation of the statistical values.
+
+        Parameters
+        ----------
+        fdr : float
+            The target false discovery rate level (between 0 and 1).
+        fdr_control : str, default="bhq"
+            The FDR control method to use. Options are:
+            - "bhq": Benjamini-Hochberg procedure
+            - "bhy": Benjamini-Hochberg-Yekutieli procedure
+            - "ebh": e-BH procedure (only for e-values)
+        evalues : bool, default=False
+            If True, uses e-values for selection. If False, uses p-values.
+        reshaping_function : callable, default=None
+            Reshaping function for the BHY method; by default the sum of reciprocals is used.
+        adaptive_aggregation : bool, default=False
+            If True, uses adaptive weights for p-value aggregation.
+            Only applicable when evalues=False.
+        gamma : float, default=0.5
+            The gamma parameter for quantile aggregation of p-values.
+            Only used when evalues=False.
+
+        Returns
+        -------
+        numpy.ndarray
+            Boolean array indicating selected features (True for selected, False for not selected).
+
+        Raises
+        ------
+        AssertionError
+            If test_scores_ is None or if incompatible combinations of parameters are provided.
        """
-        if self.importances_ is None:
-            raise ValueError(
-                "The importances need to be called before calling this method"
+        self._check_importance()
+        assert (
+            self.test_scores_ is not None
+        ), "this method doesn't support selection based on FDR"
+
+        if not evalues:
+            assert fdr_control != "ebh", "for p-values, the FDR control can't be 'ebh'"
+            pvalues = np.array(
+                [
+                    _empirical_pval(self.test_scores_[i])
+                    for i in range(len(self.test_scores_))
+                ]
            )
+            aggregated_pval = quantile_aggregation(
+                pvalues, gamma=gamma, adaptive=adaptive_aggregation
+            )
+            threshold = fdr_threshold(
+                aggregated_pval,
+                fdr=fdr,
+                method=fdr_control,
+                reshaping_function=reshaping_function,
+            )
+            selected = aggregated_pval <= threshold
+        else:
+            assert fdr_control == "ebh", "for e-values, the FDR control needs to be 'ebh'"
+            ko_threshold = []
+            for test_score in self.test_scores_:
+                ko_threshold.append(_estimated_threshold(test_score, fdr=fdr))
+            evalues = np.array(
+                [
+                    _empirical_eval(self.test_scores_[i], ko_threshold[i])
+                    for i in range(len(self.test_scores_))
+                ]
+            )
+            aggregated_eval = np.mean(evalues, axis=0)
+            threshold = fdr_threshold(
+                aggregated_eval,
+                fdr=fdr,
+                method=fdr_control,
+                reshaping_function=reshaping_function,
+            )
+            selected = aggregated_eval >= threshold
+        return selected
+
+
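A minimal sketch of the new method in use, not from the patch: it assumes a subclass has filled `test_scores_` with one vector of knockoff-style statistics per draw (simulated at random here) and `importances_` with any non-None value so that `_check_importance` passes; the import path is assumed.

    import numpy as np
    from hidimstat import BaseVariableImportance  # import path assumed for illustration

    rng = np.random.default_rng(0)
    vi = BaseVariableImportance()
    vi.importances_ = np.zeros(20)               # placeholder, only checked against None
    vi.test_scores_ = rng.normal(size=(5, 20))   # 5 draws of statistics for 20 features
    selected = vi.selection_fdr(fdr=0.1)                                     # aggregated p-values + BH
    selected_e = vi.selection_fdr(fdr=0.1, evalues=True, fdr_control="ebh")  # aggregated e-values + e-BH
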
+def _estimated_threshold(test_score, fdr=0.1):
+    """
+    Calculate the threshold based on the procedure stated in the knockoff article.
+    Original code:
+    https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R
+
+    Parameters
+    ----------
+    test_score : 1D ndarray, shape (n_features, )
+        Vector of test statistics.
+    fdr : float
+        Desired controlled FDR (false discovery rate) level.
+
+    Returns
+    -------
+    threshold : float or np.inf
+        Threshold level.
+    """
+    offset = 1  # An offset of 1 corresponds to the knockoff+ procedure.
+
+    threshold_mesh = np.sort(np.abs(test_score[test_score != 0]))
+    threshold_mesh = np.concatenate(
+        [[0], threshold_mesh, [np.inf]]
+    )  # if there is no solution, the threshold is inf
+    # find the right value of t for getting a good fdr
+    # Equation 1.8 of barber2015controlling and 3.10 in Candès 2018
+    threshold = 0.0
+    for threshold in threshold_mesh:
+        false_pos = np.sum(test_score <= -threshold)
+        selected = np.sum(test_score >= threshold)
+        if (offset + false_pos) / np.maximum(selected, 1) <= fdr:
+            break
+    return threshold
+
+
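For reference, the loop above searches for the knockoff+ threshold cited in the comment (Eq. 1.8 of Barber and Candès 2015, Eq. 3.10 of Candès et al. 2018): the smallest candidate t such that (1 + #{j : W_j <= -t}) / max(1, #{j : W_j >= t}) <= fdr, where W are the test statistics. A toy check with hand-picked numbers, assuming numpy (as np) and the helper above are in scope:

    w = np.array([5.0, 4.0, 3.0, 2.0, -1.0, -0.5])
    # t = 1: (1 + 1) / 4 = 0.50 > 0.3
    # t = 2: (1 + 0) / 4 = 0.25 <= 0.3  -> stop here
    print(_estimated_threshold(w, fdr=0.3))  # 2.0
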
+def _empirical_pval(test_score):
+    """
+    Compute the empirical p-values from the test based on knockoff+.
+
+    Parameters
+    ----------
+    test_score : 1D ndarray, shape (n_features, )
+        Vector of test statistics.
+
+    Returns
+    -------
+    pvals : 1D ndarray, shape (n_features, )
+        Vector of empirical p-values.
+    """
+    pvals = []
+    n_features = test_score.size
+
+    offset = 1  # An offset of 1 corresponds to the knockoff+ procedure.
+
+    test_score_inv = -test_score
+    for i in range(n_features):
+        if test_score[i] <= 0:
+            pvals.append(1)
+        else:
+            pvals.append(
+                (offset + np.sum(test_score_inv >= test_score[i])) / n_features
+            )
+
+    return np.array(pvals)
+
+
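In formula form, the empirical p-value computed above is p_j = (1 + #{k : W_k <= -W_j}) / p for W_j > 0 and p_j = 1 otherwise, with p the number of features. A toy check with hand-picked numbers, assuming numpy (as np) and the helper above are in scope:

    w = np.array([3.0, -2.0, 1.0, -4.0])
    # W = 3: (1 + 1) / 4 = 0.50 ; W = 1: (1 + 2) / 4 = 0.75 ; W <= 0: p-value 1
    print(_empirical_pval(w))  # -> 0.5, 1.0, 0.75, 1.0
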
+def _empirical_eval(test_score, ko_threshold):
+    """
+    Compute the empirical e-values from the test based on knockoff.
+
+    Parameters
+    ----------
+    test_score : 1D ndarray, shape (n_features, )
+        Vector of test statistics.
+    ko_threshold : float
+        Threshold level.
+
+    Returns
+    -------
+    evals : 1D ndarray, shape (n_features, )
+        Vector of empirical e-values.
+    """
+    evals = []
+    n_features = test_score.size
+
+    offset = 1  # An offset of 1 corresponds to the knockoff+ procedure.
+
+    for i in range(n_features):
+        if test_score[i] < ko_threshold:
+            evals.append(0)
+        else:
+            evals.append(n_features / (offset + np.sum(test_score <= -ko_threshold)))
+
+    return np.array(evals)
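In formula form, with tau the threshold returned by _estimated_threshold, the e-value computed above is e_j = p * 1{W_j >= tau} / (1 + #{k : W_k <= -tau}); averaging these across draws and applying the e-BH step is what selection_fdr does in its evalues=True branch (this appears to follow the e-value construction used for derandomizing knockoffs). A toy check reusing the numbers from the threshold sketch, under the same in-scope assumptions:

    w = np.array([5.0, 4.0, 3.0, 2.0, -1.0, -0.5])
    tau = _estimated_threshold(w, fdr=0.3)   # 2.0, as in the previous sketch
    print(_empirical_eval(w, tau))           # -> 6, 6, 6, 6, 0, 0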