from sklearn.base import BaseEstimator

from hidimstat._utils.exception import InternalError
+ from hidimstat.statistical_tools.multiple_testing import fdr_threshold
+
+
+ def _selection_generic(
+     values,
+     k_best=None,
+     k_lowest=None,
+     percentile=None,
+     threshold_max=None,
+     threshold_min=None,
+ ):
+     """
+     Helper function for selecting features based on one of several criteria.
+
+     Parameters
+     ----------
+     values : array-like of shape (n_features,)
+         Values to use for feature selection (e.g., importance scores or p-values).
+     k_best : int, default=None
+         Selects the top k features based on values.
+     k_lowest : int, default=None
+         Selects the k features with the lowest values.
+     percentile : float, default=None
+         Selects features based on a specified percentile of values.
+     threshold_max : float, default=None
+         Selects features with values below the specified maximum threshold.
+     threshold_min : float, default=None
+         Selects features with values above the specified minimum threshold.
+
+     Returns
+     -------
+     selection : array-like of shape (n_features,)
+         Boolean array indicating the selected features.
+     """
+     n_criteria = np.sum(
+         [
+             criteria is not None
+             for criteria in [k_best, k_lowest, percentile, threshold_max, threshold_min]
+         ]
+     )
+     assert n_criteria <= 1, "Only selection based on one criterion is supported."
+     if k_best is not None:
+         assert k_best >= 1, "k_best needs to be positive or None"
+         if k_best > values.shape[0]:
+             warnings.warn(
+                 f"k={k_best} is greater than n_features={values.shape[0]}. "
+                 "All the features will be returned."
+             )
+         mask_k_best = np.zeros_like(values, dtype=bool)
+
+         # based on SelectKBest in Scikit-Learn
+         # Request a stable sort. Mergesort takes more memory (~40MB per
+         # megafeature on x86-64).
+         mask_k_best[np.argsort(values, kind="mergesort")[-k_best:]] = 1
+         return mask_k_best
+     elif k_lowest is not None:
+         assert k_lowest >= 1, "k_lowest needs to be positive or None"
+         if k_lowest > values.shape[0]:
+             warnings.warn(
+                 f"k={k_lowest} is greater than n_features={values.shape[0]}. "
+                 "All the features will be returned."
+             )
+         mask_k_lowest = np.zeros_like(values, dtype=bool)
+
+         # based on SelectKBest in Scikit-Learn
+         # Request a stable sort. Mergesort takes more memory (~40MB per
+         # megafeature on x86-64).
+         mask_k_lowest[np.argsort(values, kind="mergesort")[:k_lowest]] = 1
+         return mask_k_lowest
+     elif percentile is not None:
+         assert (
+             0 < percentile < 100
+         ), "percentile must be between 0 and 100 (exclusive). Got {}.".format(
+             percentile
+         )
+         # based on SelectPercentile in Scikit-Learn
+         threshold_percentile = np.percentile(values, 100 - percentile)
+         mask_percentile = values > threshold_percentile
+         ties = np.where(values == threshold_percentile)[0]
+         if len(ties):
+             max_feats = int(len(values) * percentile / 100)
+             kept_ties = ties[: max_feats - mask_percentile.sum()]
+             mask_percentile[kept_ties] = True
+         return mask_percentile
+     elif threshold_max is not None:
+         mask_threshold_max = values < threshold_max
+         return mask_threshold_max
+     elif threshold_min is not None:
+         mask_threshold_min = values > threshold_min
+         return mask_threshold_min
+     else:
+         no_mask = np.ones_like(values, dtype=bool)
+         return no_mask
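For intuition, a minimal usage sketch of this helper, assuming it is called from within this module (no import shown); the `scores` array below is made up for illustration, and only one criterion may be passed per call:

import numpy as np

scores = np.array([0.1, 0.8, 0.3, 0.05])                 # made-up values, illustration only
mask_top2 = _selection_generic(scores, k_best=2)         # mask of the two largest values
mask_above = _selection_generic(scores, threshold_min=0.2)  # mask where scores > 0.2
# passing two criteria at once (e.g. k_best and percentile) trips the assertion above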


class BaseVariableImportance(BaseEstimator):
@@ -21,8 +114,6 @@ class BaseVariableImportance(BaseEstimator):
        The computed importance scores for each feature.
    pvalues_ : array-like of shape (n_features,), default=None
        The computed p-values for each feature.
-     selections_ : array-like of shape (n_features,), default=None
-         Binary mask indicating selected features.

    Methods
    -------
@@ -37,104 +128,178 @@ def __init__(self):
        super().__init__()
        self.importances_ = None
        self.pvalues_ = None
-         self.selections_ = None

-     def selection(
-         self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
+     def _check_importance(self):
+         """
+         Checks if the importance scores have been computed.
+         """
+         if self.importances_ is None:
+             raise ValueError(
+                 "The importances need to be computed before calling this method"
+             )
+
+     def importance_selection(
+         self, k_best=None, percentile=None, threshold_max=None, threshold_min=None
    ):
        """
        Selects features based on variable importance.
-         In case several arguments are different from None,
-         the returned selection is the conjunction of all of them.

        Parameters
        ----------
-         k_best : int, optional, default=None
+         k_best : int, default=None
            Selects the top k features based on importance scores.
-         percentile : float, optional, default=None
+         percentile : float, default=None
            Selects features based on a specified percentile of importance scores.
-         threshold : float, optional, default=None
-             Selects features with importance scores above the specified threshold.
-         threshold_pvalue : float, optional, default=None
-             Selects features with p-values below the specified threshold.
+         threshold_max : float, default=None
+             Selects features with importance scores below the specified maximum threshold.
+         threshold_min : float, default=None
+             Selects features with importance scores above the specified minimum threshold.

        Returns
        -------
        selection : array-like of shape (n_features,)
            Binary array indicating the selected features.
        """
        self._check_importance()
-         if k_best is not None:
-             if not isinstance(k_best, str) and k_best > self.importances_.shape[1]:
-                 warnings.warn(
-                     f"k={k_best} is greater than n_features={self.importances_.shape[1]}. "
-                     "All the features will be returned."
-                 )
-             assert k_best > 0, "k_best needs to be positive and not null"
-         if percentile is not None:
-             assert (
-                 0 < percentile and percentile < 100
-             ), "percentile needs to be between 0 and 100"
-         if threshold_pvalue is not None:
-             assert (
-                 0 < threshold_pvalue and threshold_pvalue < 1
-             ), "threshold_pvalue needs to be between 0 and 1"
-
-         # base on SelectKBest of Scikit-Learn
-         if k_best == "all":
-             mask_k_best = np.ones(self.importances_.shape, dtype=bool)
-         elif k_best == 0:
-             mask_k_best = np.zeros(self.importances_.shape, dtype=bool)
-         elif k_best is not None:
-             mask_k_best = np.zeros(self.importances_.shape, dtype=bool)
-
-             # Request a stable sort. Mergesort takes more memory (~40MB per
-             # megafeature on x86-64).
-             mask_k_best[np.argsort(self.importances_, kind="mergesort")[-k_best:]] = 1
-         else:
-             mask_k_best = np.ones(self.importances_.shape, dtype=bool)
-
-         # base on SelectPercentile of Scikit-Learn
-         if percentile == 100:
-             mask_percentile = np.ones(len(self.importances_), dtype=bool)
-         elif percentile == 0:
-             mask_percentile = np.zeros(len(self.importances_), dtype=bool)
-         elif percentile is not None:
-             threshold = np.percentile(self.importances_, 100 - percentile)
-             mask_percentile = self.importances_ > threshold
-             ties = np.where(self.importances_ == threshold)[0]
-             if len(ties):
-                 max_feats = int(len(self.importances_) * percentile / 100)
-                 kept_ties = ties[: max_feats - mask_percentile.sum()]
-                 mask_percentile[kept_ties] = True
-         else:
-             mask_percentile = np.ones(self.importances_.shape, dtype=bool)
+         return _selection_generic(
+             self.importances_,
+             k_best=k_best,
+             percentile=percentile,
+             threshold_max=threshold_max,
+             threshold_min=threshold_min,
+         )
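A hypothetical usage sketch of `importance_selection`; in practice `importances_` is populated by a concrete subclass's fitting routine, so setting it by hand below is purely illustrative:

import numpy as np

vi = BaseVariableImportance()
vi.importances_ = np.array([0.02, 0.40, 0.10, 0.75])   # made-up scores, normally set by fit
top2 = vi.importance_selection(k_best=2)                # mask of the two highest scores
strong = vi.importance_selection(threshold_min=0.3)     # mask where importance > 0.3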

-         if threshold is not None:
-             mask_threshold = self.importances_ < threshold
-         else:
-             mask_threshold = np.ones(self.importances_.shape, dtype=bool)
+     def pvalue_selection(
+         self,
+         k_lowest=None,
+         percentile=None,
+         threshold_max=0.05,
+         threshold_min=None,
+         alternative_hypothesis=False,
+     ):
+         """
+         Selects features based on p-values.

-         # base on SelectFpr of Scikit-Learn
-         if threshold_pvalue is not None:
-             mask_threshold_pvalue = self.pvalues_ < threshold_pvalue
-         else:
-             mask_threshold_pvalue = np.ones(self.importances_.shape, dtype=bool)
+         Parameters
+         ----------
+         k_lowest : int, default=None
+             Selects the k features with the lowest p-values.
+         percentile : float, default=None
+             Selects features based on a specified percentile of p-values.
+         threshold_max : float, default=0.05
+             Selects features with p-values below the specified maximum threshold (0 to 1).
+         threshold_min : float, default=None
+             Selects features with p-values above the specified minimum threshold (0 to 1).
+         alternative_hypothesis : bool, default=False
+             If True, selection is performed on 1 - p-values instead of p-values.

-         self.selections_ = (
-             mask_k_best & mask_percentile & mask_threshold & mask_threshold_pvalue
+         Returns
+         -------
+         selection : array-like of shape (n_features,)
+             Binary array indicating the selected features (True for selected).
+         """
+         self._check_importance()
+         assert (
+             self.pvalues_ is not None
+         ), "The selection on p-value can't be done because the current method does not compute p-values."
+         if threshold_min is not None:
+             assert (
+                 0 < threshold_min and threshold_min < 1
+             ), "threshold_min needs to be between 0 and 1"
+         if threshold_max is not None:
+             assert (
+                 0 < threshold_max and threshold_max < 1
+             ), "threshold_max needs to be between 0 and 1"
+         assert alternative_hypothesis is None or isinstance(
+             alternative_hypothesis, bool
+         ), "alternative_hypothesis can have only three values: True, False and None."
+         return _selection_generic(
+             self.pvalues_ if not alternative_hypothesis else 1 - self.pvalues_,
+             k_lowest=k_lowest,
+             percentile=percentile,
+             threshold_max=threshold_max,
+             threshold_min=threshold_min,
        )
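Likewise, a minimal sketch of `pvalue_selection`, with hand-set attributes standing in for what a fitted method would provide; only one selection criterion can be active per call, so `threshold_max` is disabled explicitly when `k_lowest` is used:

import numpy as np

vi = BaseVariableImportance()
vi.importances_ = np.array([3.1, 0.2, 2.5])         # made-up, only checked for presence here
vi.pvalues_ = np.array([0.001, 0.40, 0.03])         # made-up p-values
sig = vi.pvalue_selection()                         # default threshold_max=0.05, i.e. p < 0.05
low2 = vi.pvalue_selection(k_lowest=2, threshold_max=None)  # two smallest p-values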

-         return self.selections_
-
-     def _check_importance(self):
+     def fdr_selection(
+         self,
+         fdr,
+         fdr_control="bhq",
+         reshaping_function=None,
+         alternative_hypothesis=False,
+     ):
        """
-         Checks if the importance scores have been computed.
+         Performs feature selection based on False Discovery Rate (FDR) control.
+
+         Parameters
+         ----------
+         fdr : float
+             The target false discovery rate level (between 0 and 1).
+         fdr_control : {'bhq', 'bhy'}, default='bhq'
+             The FDR control method to use:
+             - 'bhq': Benjamini-Hochberg procedure
+             - 'bhy': Benjamini-Hochberg-Yekutieli procedure
+         reshaping_function : callable or None, default=None
+             Optional reshaping function for FDR control methods.
+             If None, defaults to the sum of reciprocals for 'bhy'.
+         alternative_hypothesis : bool or None, default=False
+             If False, selects features with small p-values.
+             If True, selects features with large p-values (close to 1).
+             If None, selects features that have either small or large p-values.
+
+         Returns
+         -------
+         selected : ndarray of bool
+             Boolean mask of selected features.
+             True indicates selected features, False indicates non-selected features.
+
+         Raises
+         ------
+         ValueError
+             If `importances_` has not been computed yet.
+         AssertionError
+             If `pvalues_` is missing or `fdr_control` is invalid.
        """
-         if self.importances_ is None:
-             raise ValueError(
-                 "The importances need to be called before calling this method"
+         self._check_importance()
+         assert 0 < fdr and fdr < 1, "fdr needs to be between 0 and 1 (exclusive)"
+         assert (
+             self.pvalues_ is not None
+         ), "FDR-based selection requires p-values to be computed first. The current method does not support p-values."
+         assert (
+             fdr_control == "bhq" or fdr_control == "bhy"
+         ), "only 'bhq' and 'bhy' are supported"
+         assert alternative_hypothesis is None or isinstance(
+             alternative_hypothesis, bool
+         ), "alternative_hypothesis can have only three values: True, False and None."
+
+         # selection on p-values
+         if alternative_hypothesis is None or not alternative_hypothesis:
+             threshold_pvalues = fdr_threshold(
+                 self.pvalues_,
+                 fdr=fdr,
+                 method=fdr_control,
+                 reshaping_function=reshaping_function,
            )
+             selected_pvalues = self.pvalues_ <= threshold_pvalues
+         else:
+             selected_pvalues = np.zeros_like(self.pvalues_, dtype=bool)
+
+         # selection on 1 - p-values
+         if alternative_hypothesis is None or alternative_hypothesis:
+             threshold_one_minus_pvalues = fdr_threshold(
+                 1 - self.pvalues_,
+                 fdr=fdr,
+                 method=fdr_control,
+                 reshaping_function=reshaping_function,
+             )
+             selected_one_minus_pvalues = (
+                 1 - self.pvalues_
+             ) <= threshold_one_minus_pvalues
+         else:
+             selected_one_minus_pvalues = np.zeros_like(self.pvalues_, dtype=bool)
+
+         selected = selected_pvalues | selected_one_minus_pvalues
+         return selected
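A sketch of how `fdr_selection` might be exercised, again with an invented `pvalues_` array standing in for the output of an actual method; which features end up selected depends on the threshold returned by `fdr_threshold`:

import numpy as np

vi = BaseVariableImportance()
vi.importances_ = np.random.randn(6)                    # placeholder, only checked for presence
vi.pvalues_ = np.array([0.001, 0.002, 0.8, 0.5, 0.01, 0.6])  # made-up p-values
mask_low = vi.fdr_selection(fdr=0.1)                    # BH control on small p-values
mask_both = vi.fdr_selection(fdr=0.1, alternative_hypothesis=None)  # small or large p-values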

    def plot_importance(
        self,