Skip to content

Commit 894ff9d

Browse files
lionelkuschbthirionjpaillard
authored
Add selection with fdr and associate test (#361)
* add method for selection base on FDR * fix default of the qunatile aggragation * fix selection * update docstring * fix docstring * Add test for 1 test_score * change the usage of test fdr without aggregation * remove a print in test * Update selection * remove function for knockoff * update selection_fdr * fix selection * improve selection * fix some part of the selection * fix test * try to fix test * fix seed in generation of data * fix docstring * Fix attribute in base_variable_importance * change name * fix docstrign * fix linter * Mixin for selectionfdr * fix tests * fix format * put back the selection_fdr in base class * fix error of docstring * Apply suggestion from @bthirion Co-authored-by: bthirion <[email protected]> * Apply suggestion from @bthirion Co-authored-by: bthirion <[email protected]> * Apply suggestion from @bthirion Co-authored-by: bthirion <[email protected]> * Apply suggestion from @bthirion Co-authored-by: bthirion <[email protected]> * Apply suggestion from @bthirion Co-authored-by: bthirion <[email protected]> * chaneg name of fixture * remove all from k_best * rename the variable * chnage borm for percentil * fix tests * improve selection method * update test and the changement of signature * improve coverage * change defautl value * Update src/hidimstat/base_variable_importance.py Co-authored-by: bthirion <[email protected]> * Update src/hidimstat/base_variable_importance.py Co-authored-by: bthirion <[email protected]> * Update src/hidimstat/base_variable_importance.py Co-authored-by: bthirion <[email protected]> * Update src/hidimstat/base_variable_importance.py Co-authored-by: Joseph Paillard <[email protected]> * update following the comments * fix bug * selection one criteria * fix tests * fix format * fix k_lowest * remove randomization in tests * move all the tests for base importance in one file * fix seed --------- Co-authored-by: bthirion <[email protected]> Co-authored-by: Joseph Paillard <[email protected]>
1 parent 7d642a4 commit 894ff9d

8 files changed

+664
-181
lines changed

src/hidimstat/base_variable_importance.py

Lines changed: 241 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,99 @@
66
from sklearn.base import BaseEstimator
77

88
from hidimstat._utils.exception import InternalError
9+
from hidimstat.statistical_tools.multiple_testing import fdr_threshold
10+
11+
12+
def _selection_generic(
13+
values,
14+
k_best=None,
15+
k_lowest=None,
16+
percentile=None,
17+
threshold_max=None,
18+
threshold_min=None,
19+
):
20+
"""
21+
Helper function for selecting features based on multiple criteria.
22+
23+
Parameters
24+
----------
25+
values : array-like of shape (n_features,)
26+
Values to use for feature selection (e.g., importance scores or p-values)
27+
k_best : int, default=None
28+
Selects the top k features based on values.
29+
k_lowest : int, default=None
30+
Selects the lowest k features based on values.
31+
percentile : float, default=None
32+
Selects features based on a specified percentile of values.
33+
threshold_max : float, default=None
34+
Selects features with values below the specified maximum threshold.
35+
threshold_min : float, default=None
36+
Selects features with values above the specified minimum threshold.
37+
38+
Returns
39+
-------
40+
selection : array-like of shape (n_features,)
41+
Boolean array indicating the selected features.
42+
"""
43+
n_criteria = np.sum(
44+
[
45+
criteria is not None
46+
for criteria in [k_best, k_lowest, percentile, threshold_max, threshold_min]
47+
]
48+
)
49+
assert n_criteria <= 1, "Only support selection based on one criteria."
50+
if k_best is not None:
51+
assert k_best >= 1, "k_best needs to be positive or None"
52+
if k_best > values.shape[0]:
53+
warnings.warn(
54+
f"k={k_best} is greater than n_features={values.shape[0]}. "
55+
"All the features will be returned."
56+
)
57+
mask_k_best = np.zeros_like(values, dtype=bool)
58+
59+
# based on SelectKBest in Scikit-Learn
60+
# Request a stable sort. Mergesort takes more memory (~40MB per
61+
# megafeature on x86-64).
62+
mask_k_best[np.argsort(values, kind="mergesort")[-k_best:]] = 1
63+
return mask_k_best
64+
elif k_lowest is not None:
65+
assert k_lowest >= 1, "k_lowest needs to be positive or None"
66+
if k_lowest > values.shape[0]:
67+
warnings.warn(
68+
f"k={k_lowest} is greater than n_features={values.shape[0]}. "
69+
"All the features will be returned."
70+
)
71+
mask_k_lowest = np.zeros_like(values, dtype=bool)
72+
73+
# based on SelectKBest in Scikit-Learn
74+
# Request a stable sort. Mergesort takes more memory (~40MB per
75+
# megafeature on x86-64).
76+
mask_k_lowest[np.argsort(values, kind="mergesort")[:k_lowest]] = 1
77+
return mask_k_lowest
78+
elif percentile is not None:
79+
assert (
80+
0 < percentile < 100
81+
), "percentile must be between 0 and 100 (exclusive). Got {}.".format(
82+
percentile
83+
)
84+
# based on SelectPercentile in Scikit-Learn
85+
threshold_percentile = np.percentile(values, 100 - percentile)
86+
mask_percentile = values > threshold_percentile
87+
ties = np.where(values == threshold_percentile)[0]
88+
if len(ties):
89+
max_feats = int(len(values) * percentile / 100)
90+
kept_ties = ties[: max_feats - mask_percentile.sum()]
91+
mask_percentile[kept_ties] = True
92+
return mask_percentile
93+
elif threshold_max is not None:
94+
mask_threshold_max = values < threshold_max
95+
return mask_threshold_max
96+
elif threshold_min is not None:
97+
mask_threshold_min = values > threshold_min
98+
return mask_threshold_min
99+
else:
100+
no_mask = np.ones_like(values, dtype=bool)
101+
return no_mask
9102

10103

11104
class BaseVariableImportance(BaseEstimator):
@@ -21,8 +114,6 @@ class BaseVariableImportance(BaseEstimator):
21114
The computed importance scores for each feature.
22115
pvalues_ : array-like of shape (n_features,), default=None
23116
The computed p-values for each feature.
24-
selections_ : array-like of shape (n_features,), default=None
25-
Binary mask indicating selected features.
26117
27118
Methods
28119
-------
@@ -37,104 +128,178 @@ def __init__(self):
37128
super().__init__()
38129
self.importances_ = None
39130
self.pvalues_ = None
40-
self.selections_ = None
41131

42-
def selection(
43-
self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
132+
def _check_importance(self):
133+
"""
134+
Checks if the importance scores have been computed.
135+
"""
136+
if self.importances_ is None:
137+
raise ValueError(
138+
"The importances need to be called before calling this method"
139+
)
140+
141+
def importance_selection(
142+
self, k_best=None, percentile=None, threshold_max=None, threshold_min=None
44143
):
45144
"""
46145
Selects features based on variable importance.
47-
In case several arguments are different from None,
48-
the returned selection is the conjunction of all of them.
49146
50147
Parameters
51148
----------
52-
k_best : int, optional, default=None
149+
k_best : int, default=None
53150
Selects the top k features based on importance scores.
54-
percentile : float, optional, default=None
151+
percentile : float, default=None
55152
Selects features based on a specified percentile of importance scores.
56-
threshold : float, optional, default=None
57-
Selects features with importance scores above the specified threshold.
58-
threshold_pvalue : float, optional, default=None
59-
Selects features with p-values below the specified threshold.
153+
threshold_max : float, default=None
154+
Selects features with importance scores below the specified maximum threshold.
155+
threshold_min : float, default=None
156+
Selects features with importance scores above the specified minimum threshold.
60157
61158
Returns
62159
-------
63160
selection : array-like of shape (n_features,)
64161
Binary array indicating the selected features.
65162
"""
66163
self._check_importance()
67-
if k_best is not None:
68-
if not isinstance(k_best, str) and k_best > self.importances_.shape[1]:
69-
warnings.warn(
70-
f"k={k_best} is greater than n_features={self.importances_.shape[1]}. "
71-
"All the features will be returned."
72-
)
73-
assert k_best > 0, "k_best needs to be positive and not null"
74-
if percentile is not None:
75-
assert (
76-
0 < percentile and percentile < 100
77-
), "percentile needs to be between 0 and 100"
78-
if threshold_pvalue is not None:
79-
assert (
80-
0 < threshold_pvalue and threshold_pvalue < 1
81-
), "threshold_pvalue needs to be between 0 and 1"
82-
83-
# base on SelectKBest of Scikit-Learn
84-
if k_best == "all":
85-
mask_k_best = np.ones(self.importances_.shape, dtype=bool)
86-
elif k_best == 0:
87-
mask_k_best = np.zeros(self.importances_.shape, dtype=bool)
88-
elif k_best is not None:
89-
mask_k_best = np.zeros(self.importances_.shape, dtype=bool)
90-
91-
# Request a stable sort. Mergesort takes more memory (~40MB per
92-
# megafeature on x86-64).
93-
mask_k_best[np.argsort(self.importances_, kind="mergesort")[-k_best:]] = 1
94-
else:
95-
mask_k_best = np.ones(self.importances_.shape, dtype=bool)
96-
97-
# base on SelectPercentile of Scikit-Learn
98-
if percentile == 100:
99-
mask_percentile = np.ones(len(self.importances_), dtype=bool)
100-
elif percentile == 0:
101-
mask_percentile = np.zeros(len(self.importances_), dtype=bool)
102-
elif percentile is not None:
103-
threshold = np.percentile(self.importances_, 100 - percentile)
104-
mask_percentile = self.importances_ > threshold
105-
ties = np.where(self.importances_ == threshold)[0]
106-
if len(ties):
107-
max_feats = int(len(self.importances_) * percentile / 100)
108-
kept_ties = ties[: max_feats - mask_percentile.sum()]
109-
mask_percentile[kept_ties] = True
110-
else:
111-
mask_percentile = np.ones(self.importances_.shape, dtype=bool)
164+
return _selection_generic(
165+
self.importances_,
166+
k_best=k_best,
167+
percentile=percentile,
168+
threshold_max=threshold_max,
169+
threshold_min=threshold_min,
170+
)
112171

113-
if threshold is not None:
114-
mask_threshold = self.importances_ < threshold
115-
else:
116-
mask_threshold = np.ones(self.importances_.shape, dtype=bool)
172+
def pvalue_selection(
173+
self,
174+
k_lowest=None,
175+
percentile=None,
176+
threshold_max=0.05,
177+
threshold_min=None,
178+
alternative_hypothesis=False,
179+
):
180+
"""
181+
Selects features based on p-values.
117182
118-
# base on SelectFpr of Scikit-Learn
119-
if threshold_pvalue is not None:
120-
mask_threshold_pvalue = self.pvalues_ < threshold_pvalue
121-
else:
122-
mask_threshold_pvalue = np.ones(self.importances_.shape, dtype=bool)
183+
Parameters
184+
----------
185+
k_lowest : int, default=None
186+
Selects the k features with lowest p-values.
187+
percentile : float, default=None
188+
Selects features based on a specified percentile of p-values.
189+
threshold_max : float, default=0.05
190+
Selects features with p-values below the specified maximum threshold (0 to 1).
191+
threshold_min : float, default=None
192+
Selects features with p-values above the specified minimum threshold (0 to 1).
193+
alternative_hypothesis : bool, default=False
194+
If True, selects based on 1-pvalues instead of p-values.
123195
124-
self.selections_ = (
125-
mask_k_best & mask_percentile & mask_threshold & mask_threshold_pvalue
196+
Returns
197+
-------
198+
selection : array-like of shape (n_features,)
199+
Binary array indicating the selected features (True for selected).
200+
"""
201+
self._check_importance()
202+
assert (
203+
self.pvalues_ is not None
204+
), "The selection on p-value can't be done because the current method does not compute p-values."
205+
if threshold_min is not None:
206+
assert (
207+
0 < threshold_min and threshold_min < 1
208+
), "threshold_min needs to be between 0 and 1"
209+
if threshold_max is not None:
210+
assert (
211+
0 < threshold_max and threshold_max < 1
212+
), "threshold_max needs to be between 0 and 1"
213+
assert alternative_hypothesis is None or isinstance(
214+
alternative_hypothesis, bool
215+
), "alternative_hypothesis can have only three values: True, False and None."
216+
return _selection_generic(
217+
self.pvalues_ if not alternative_hypothesis else 1 - self.pvalues_,
218+
k_lowest=k_lowest,
219+
percentile=percentile,
220+
threshold_max=threshold_max,
221+
threshold_min=threshold_min,
126222
)
127223

128-
return self.selections_
129-
130-
def _check_importance(self):
224+
def fdr_selection(
225+
self,
226+
fdr,
227+
fdr_control="bhq",
228+
reshaping_function=None,
229+
alternative_hypothesis=False,
230+
):
131231
"""
132-
Checks if the importance scores have been computed.
232+
Performs feature selection based on False Discovery Rate (FDR) control.
233+
234+
Parameters
235+
----------
236+
fdr : float
237+
The target false discovery rate level (between 0 and 1)
238+
fdr_control: {'bhq', 'bhy'}, default='bhq'
239+
The FDR control method to use:
240+
- 'bhq': Benjamini-Hochberg procedure
241+
- 'bhy': Benjamini-Hochberg-Yekutieli procedure
242+
reshaping_function: callable or None, default=None
243+
Optional reshaping function for FDR control methods.
244+
If None, defaults to sum of reciprocals for 'bhy'.
245+
alternative_hypothesis: bool or None, default=False
246+
If False, selects features with small p-values.
247+
If True, selects features with large p-values (close to 1).
248+
If None, selects features that have either small or large p-values.
249+
250+
Returns
251+
-------
252+
selected : ndarray of bool
253+
Boolean mask of selected features.
254+
True indicates selected features, False indicates non-selected features.
255+
256+
Raises
257+
------
258+
ValueError
259+
If `importances_` haven't been computed yet
260+
AssertionError
261+
If `pvalues_` are missing or fdr_control is invalid
133262
"""
134-
if self.importances_ is None:
135-
raise ValueError(
136-
"The importances need to be called before calling this method"
263+
self._check_importance()
264+
assert 0 < fdr and fdr < 1, "FDR needs to be between 0 and 1 excluded"
265+
assert (
266+
self.pvalues_ is not None
267+
), "FDR-based selection requires p-values to be computed first. The current method does not support p-values."
268+
assert (
269+
fdr_control == "bhq" or fdr_control == "bhy"
270+
), "only 'bhq' and 'bhy' are supported"
271+
assert alternative_hypothesis is None or isinstance(
272+
alternative_hypothesis, bool
273+
), "alternative_hypothesis can have only three values: True, False and None."
274+
275+
# selection on pvalue
276+
if alternative_hypothesis is None or not alternative_hypothesis:
277+
threshold_pvalues = fdr_threshold(
278+
self.pvalues_,
279+
fdr=fdr,
280+
method=fdr_control,
281+
reshaping_function=reshaping_function,
137282
)
283+
selected_pvalues = self.pvalues_ <= threshold_pvalues
284+
else:
285+
selected_pvalues = np.zeros_like(self.pvalues_, dtype=bool)
286+
287+
# selection on 1-pvalue
288+
if alternative_hypothesis is None or alternative_hypothesis:
289+
threshold_one_minus_pvalues = fdr_threshold(
290+
1 - self.pvalues_,
291+
fdr=fdr,
292+
method=fdr_control,
293+
reshaping_function=reshaping_function,
294+
)
295+
selected_one_minus_pvalues = (
296+
1 - self.pvalues_
297+
) <= threshold_one_minus_pvalues
298+
else:
299+
selected_one_minus_pvalues = np.zeros_like(self.pvalues_, dtype=bool)
300+
301+
selected = selected_pvalues | selected_one_minus_pvalues
302+
return selected
138303

139304
def plot_importance(
140305
self,

0 commit comments

Comments
 (0)