Merged
Changes from all commits
63 commits
9c2d77c
add method for selection based on FDR
lionelkusch Aug 28, 2025
5314c37
fix default of the quantile aggregation
lionelkusch Aug 28, 2025
be837e0
fix selection
lionelkusch Aug 28, 2025
1a42592
update docstring
lionelkusch Aug 28, 2025
5854f2e
fix docstring
lionelkusch Aug 28, 2025
3c08f75
Add test for 1 test_score
lionelkusch Aug 29, 2025
7f3a117
change the usage of test fdr without aggregation
lionelkusch Sep 1, 2025
21250b4
remove a print in test
lionelkusch Sep 1, 2025
17d9d95
Update selection
lionelkusch Sep 2, 2025
e8134d8
remove function for knockoff
lionelkusch Sep 9, 2025
51685e8
update selection_fdr
lionelkusch Sep 9, 2025
39ec78f
fix selection
lionelkusch Sep 9, 2025
f3ff485
improve selection
lionelkusch Sep 10, 2025
817af11
fix some part of the selection
lionelkusch Sep 10, 2025
846296a
Merge branch 'main' into PR_selection
lionelkusch Sep 10, 2025
7e256c2
fix test
lionelkusch Sep 10, 2025
5cc731c
try to fix test
lionelkusch Sep 10, 2025
90e1425
fix seed in generation of data
lionelkusch Sep 10, 2025
21d0614
fix docstring
lionelkusch Sep 10, 2025
5e19e1b
Fix attribute in base_variable_importance
lionelkusch Sep 11, 2025
c0af81a
change name
lionelkusch Sep 11, 2025
08ddbaa
fix docstring
lionelkusch Sep 12, 2025
c22da7a
Merge branch 'main' into PR_selection
lionelkusch Sep 12, 2025
1329fa6
Merge branch 'main' into PR_selection
lionelkusch Sep 17, 2025
7d58380
fix linter
lionelkusch Sep 17, 2025
bc6d5c5
Mixin for selection_fdr
lionelkusch Sep 17, 2025
155c47a
fix tests
lionelkusch Sep 17, 2025
f90e6bc
fix format
lionelkusch Sep 17, 2025
d343b68
Merge branch 'main' into PR_selection
lionelkusch Sep 23, 2025
710bec4
put back the selection_fdr in base class
lionelkusch Sep 23, 2025
e66af14
fix error of docstring
lionelkusch Sep 23, 2025
f680738
Apply suggestion from @bthirion
lionelkusch Sep 24, 2025
f079d24
Apply suggestion from @bthirion
lionelkusch Sep 24, 2025
beed44f
Apply suggestion from @bthirion
lionelkusch Sep 24, 2025
ab262ad
Apply suggestion from @bthirion
lionelkusch Sep 24, 2025
ba43d4a
Apply suggestion from @bthirion
lionelkusch Sep 24, 2025
e9a4432
change name of fixture
lionelkusch Sep 24, 2025
7ab0af3
Merge branch 'main' into PR_selection
lionelkusch Sep 26, 2025
53ed888
remove all from k_best
lionelkusch Sep 29, 2025
dddbb4a
rename the variable
lionelkusch Sep 29, 2025
c06e1b9
change norm for percentile
lionelkusch Sep 29, 2025
66ec73e
fix tests
lionelkusch Sep 29, 2025
1fee076
Merge branch 'main' into PR_selection
lionelkusch Sep 29, 2025
65f6fd0
improve selection method
lionelkusch Oct 2, 2025
20a9b0e
update test for the change of signature
lionelkusch Oct 2, 2025
6982b55
improve coverage
lionelkusch Oct 2, 2025
9b7f7fe
Merge branch 'main' into PR_selection
lionelkusch Oct 2, 2025
3b89e1e
change default value
lionelkusch Oct 2, 2025
79a58b6
Update src/hidimstat/base_variable_importance.py
lionelkusch Oct 3, 2025
7e5442b
Update src/hidimstat/base_variable_importance.py
lionelkusch Oct 3, 2025
9da3607
Update src/hidimstat/base_variable_importance.py
lionelkusch Oct 3, 2025
ed39b3d
Update src/hidimstat/base_variable_importance.py
lionelkusch Oct 3, 2025
d86644d
update following the comments
lionelkusch Oct 3, 2025
9812660
fix bug
lionelkusch Oct 3, 2025
626e47a
Merge branch 'main' into PR_selection
lionelkusch Oct 9, 2025
b28965c
selection on one criterion
lionelkusch Oct 9, 2025
c7e8d69
fix tests
lionelkusch Oct 9, 2025
529d28a
fix format
lionelkusch Oct 9, 2025
b633e15
fix k_lowest
lionelkusch Oct 9, 2025
b02a2e9
Merge branch 'main' into PR_selection
lionelkusch Oct 9, 2025
246bfb6
remove randomization in tests
lionelkusch Oct 10, 2025
62f71a4
move all the tests for base importance into one file
lionelkusch Oct 10, 2025
f10bf06
fix seed
lionelkusch Oct 10, 2025
317 changes: 241 additions & 76 deletions src/hidimstat/base_variable_importance.py
@@ -6,6 +6,99 @@
from sklearn.base import BaseEstimator

from hidimstat._utils.exception import InternalError
from hidimstat.statistical_tools.multiple_testing import fdr_threshold


def _selection_generic(
values,
k_best=None,
k_lowest=None,
percentile=None,
threshold_max=None,
threshold_min=None,
):
"""
Helper function for selecting features based on multiple criteria.

Parameters
----------
values : array-like of shape (n_features,)
Values to use for feature selection (e.g., importance scores or p-values)
k_best : int, default=None
Selects the top k features based on values.
k_lowest : int, default=None
Selects the lowest k features based on values.
percentile : float, default=None
Selects features based on a specified percentile of values.
threshold_max : float, default=None
Selects features with values below the specified maximum threshold.
threshold_min : float, default=None
Selects features with values above the specified minimum threshold.

Returns
-------
selection : array-like of shape (n_features,)
Boolean array indicating the selected features.
"""
n_criteria = np.sum(
[
criteria is not None
for criteria in [k_best, k_lowest, percentile, threshold_max, threshold_min]
]
)
    assert n_criteria <= 1, "Only selection based on one criterion is supported."
if k_best is not None:
assert k_best >= 1, "k_best needs to be positive or None"
if k_best > values.shape[0]:
warnings.warn(
f"k={k_best} is greater than n_features={values.shape[0]}. "
"All the features will be returned."
)
mask_k_best = np.zeros_like(values, dtype=bool)

# based on SelectKBest in Scikit-Learn
# Request a stable sort. Mergesort takes more memory (~40MB per
# megafeature on x86-64).
mask_k_best[np.argsort(values, kind="mergesort")[-k_best:]] = 1
return mask_k_best
elif k_lowest is not None:
assert k_lowest >= 1, "k_lowest needs to be positive or None"
if k_lowest > values.shape[0]:
warnings.warn(
f"k={k_lowest} is greater than n_features={values.shape[0]}. "
"All the features will be returned."
)
mask_k_lowest = np.zeros_like(values, dtype=bool)

# based on SelectKBest in Scikit-Learn
# Request a stable sort. Mergesort takes more memory (~40MB per
# megafeature on x86-64).
mask_k_lowest[np.argsort(values, kind="mergesort")[:k_lowest]] = 1
return mask_k_lowest
elif percentile is not None:
assert (
0 < percentile < 100
), "percentile must be between 0 and 100 (exclusive). Got {}.".format(
percentile
)
# based on SelectPercentile in Scikit-Learn
threshold_percentile = np.percentile(values, 100 - percentile)
mask_percentile = values > threshold_percentile
ties = np.where(values == threshold_percentile)[0]
if len(ties):
max_feats = int(len(values) * percentile / 100)
kept_ties = ties[: max_feats - mask_percentile.sum()]
mask_percentile[kept_ties] = True
return mask_percentile
elif threshold_max is not None:
mask_threshold_max = values < threshold_max
return mask_threshold_max
elif threshold_min is not None:
mask_threshold_min = values > threshold_min
return mask_threshold_min
else:
no_mask = np.ones_like(values, dtype=bool)
return no_mask

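A minimal usage sketch of the `_selection_generic` helper above. The import path is an assumption based on the file changed in this diff, and the leading underscore marks the helper as private, so this is for illustration only:

```python
import numpy as np

# Assumed import path, matching src/hidimstat/base_variable_importance.py
# as changed in this PR; _selection_generic is a private helper.
from hidimstat.base_variable_importance import _selection_generic

scores = np.array([0.1, 0.5, 0.3, 0.9, 0.2])

# Exactly one criterion may be passed at a time (the n_criteria <= 1 assert).
print(_selection_generic(scores, k_best=2))
# -> [False  True False  True False]  (the two largest scores)

print(_selection_generic(scores, threshold_max=0.3))
# -> [ True False False False  True]  (scores strictly below 0.3)
```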

class BaseVariableImportance(BaseEstimator):
@@ -21,8 +114,6 @@ class BaseVariableImportance(BaseEstimator):
The computed importance scores for each feature.
pvalues_ : array-like of shape (n_features,), default=None
The computed p-values for each feature.
selections_ : array-like of shape (n_features,), default=None
Binary mask indicating selected features.

Methods
-------
@@ -37,104 +128,178 @@ def __init__(self):
super().__init__()
self.importances_ = None
self.pvalues_ = None
self.selections_ = None

def selection(
self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
def _check_importance(self):
"""
Checks if the importance scores have been computed.
"""
if self.importances_ is None:
raise ValueError(
"The importances need to be called before calling this method"
)

def importance_selection(
self, k_best=None, percentile=None, threshold_max=None, threshold_min=None
):
"""
Selects features based on variable importance.
Only one of the selection criteria below may be provided at a time.

Parameters
----------
k_best : int, optional, default=None
k_best : int, default=None
Selects the top k features based on importance scores.
percentile : float, optional, default=None
percentile : float, default=None
Selects features based on a specified percentile of importance scores.
threshold : float, optional, default=None
Selects features with importance scores above the specified threshold.
threshold_pvalue : float, optional, default=None
Selects features with p-values below the specified threshold.
threshold_max : float, default=None
Selects features with importance scores below the specified maximum threshold.
threshold_min : float, default=None
Selects features with importance scores above the specified minimum threshold.

Review thread:
Collaborator: I'm not sure whether this argument really makes sense? I think I would have a unique threshold argument for this function.
Author: It's because sometimes we want to have the maximum or the minimum. See issue #481.

Returns
-------
selection : array-like of shape (n_features,)
Binary array indicating the selected features.
"""
self._check_importance()
if k_best is not None:
if not isinstance(k_best, str) and k_best > self.importances_.shape[1]:
warnings.warn(
f"k={k_best} is greater than n_features={self.importances_.shape[1]}. "
"All the features will be returned."
)
assert k_best > 0, "k_best needs to be positive and not null"
if percentile is not None:
assert (
0 < percentile and percentile < 100
), "percentile needs to be between 0 and 100"
if threshold_pvalue is not None:
assert (
0 < threshold_pvalue and threshold_pvalue < 1
), "threshold_pvalue needs to be between 0 and 1"

# base on SelectKBest of Scikit-Learn
if k_best == "all":
mask_k_best = np.ones(self.importances_.shape, dtype=bool)
elif k_best == 0:
mask_k_best = np.zeros(self.importances_.shape, dtype=bool)
elif k_best is not None:
mask_k_best = np.zeros(self.importances_.shape, dtype=bool)

# Request a stable sort. Mergesort takes more memory (~40MB per
# megafeature on x86-64).
mask_k_best[np.argsort(self.importances_, kind="mergesort")[-k_best:]] = 1
else:
mask_k_best = np.ones(self.importances_.shape, dtype=bool)

# base on SelectPercentile of Scikit-Learn
if percentile == 100:
mask_percentile = np.ones(len(self.importances_), dtype=bool)
elif percentile == 0:
mask_percentile = np.zeros(len(self.importances_), dtype=bool)
elif percentile is not None:
threshold = np.percentile(self.importances_, 100 - percentile)
mask_percentile = self.importances_ > threshold
ties = np.where(self.importances_ == threshold)[0]
if len(ties):
max_feats = int(len(self.importances_) * percentile / 100)
kept_ties = ties[: max_feats - mask_percentile.sum()]
mask_percentile[kept_ties] = True
else:
mask_percentile = np.ones(self.importances_.shape, dtype=bool)
return _selection_generic(
self.importances_,
k_best=k_best,
percentile=percentile,
threshold_max=threshold_max,
threshold_min=threshold_min,
)

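A hedged usage sketch of the reworked `importance_selection`. The tiny subclass below is hypothetical, written only to make the call runnable; real hidimstat estimators set `importances_` in their own `fit` methods:

```python
import numpy as np
from hidimstat.base_variable_importance import BaseVariableImportance

# Hypothetical stub for illustration: it only stores precomputed scores.
class DummyImportance(BaseVariableImportance):
    def fit(self, importances):
        self.importances_ = np.asarray(importances)
        return self

vi = DummyImportance().fit([0.1, 0.5, 0.3, 0.9, 0.2])
print(vi.importance_selection(k_best=2))            # two largest scores
print(vi.importance_selection(threshold_min=0.25))  # scores strictly above 0.25
# Passing two criteria at once (e.g. k_best and percentile) raises an
# AssertionError, since selection now supports a single criterion.
```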
if threshold is not None:
mask_threshold = self.importances_ < threshold
else:
mask_threshold = np.ones(self.importances_.shape, dtype=bool)
def pvalue_selection(
self,
k_lowest=None,
percentile=None,
threshold_max=0.05,
threshold_min=None,
alternative_hypothesis=False,
):
"""
Selects features based on p-values.

# base on SelectFpr of Scikit-Learn
if threshold_pvalue is not None:
mask_threshold_pvalue = self.pvalues_ < threshold_pvalue
else:
mask_threshold_pvalue = np.ones(self.importances_.shape, dtype=bool)
Parameters
----------
k_lowest : int, default=None
Selects the k features with lowest p-values.
percentile : float, default=None
Selects features based on a specified percentile of p-values.
threshold_max : float, default=0.05
Selects features with p-values below the specified maximum threshold (0 to 1).
threshold_min : float, default=None
Selects features with p-values above the specified minimum threshold (0 to 1).

Review thread:
Collaborator: Similarly, I don't see any use case for threshold_min here.
Author: The first idea is to propose a generic way of selection; my first use case is selecting the features to discard. See issue #481.
alternative_hypothesis : bool, default=False
If True, selects based on 1 - pvalues instead of p-values.

Review thread:
Collaborator: I don't see the use case for alternative hypothesis.
Author: This was present in EnCluDL; I added the option to keep the same possibilities. See issue #481.

self.selections_ = (
mask_k_best & mask_percentile & mask_threshold & mask_threshold_pvalue
Returns
-------
selection : array-like of shape (n_features,)
Binary array indicating the selected features (True for selected).
"""
self._check_importance()
assert (
self.pvalues_ is not None
), "The selection on p-value can't be done because the current method does not compute p-values."
if threshold_min is not None:
assert (
0 < threshold_min and threshold_min < 1
), "threshold_min needs to be between 0 and 1"
if threshold_max is not None:
assert (
0 < threshold_max and threshold_max < 1
), "threshold_max needs to be between 0 and 1"
assert alternative_hypothesis is None or isinstance(
alternative_hypothesis, bool
), "alternative_hippothesis can have only three values: True, False and None."
return _selection_generic(
self.pvalues_ if not alternative_hypothesis else 1 - self.pvalues_,
k_lowest=k_lowest,
percentile=percentile,
threshold_max=threshold_max,
threshold_min=threshold_min,
)

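A similar hedged sketch for `pvalue_selection`. Note the API gotcha visible in the signature above: `threshold_max` defaults to 0.05, so it must be cleared explicitly before using another criterion. The attributes are set by hand purely for illustration:

```python
import numpy as np
from hidimstat.base_variable_importance import BaseVariableImportance

vi = BaseVariableImportance()
vi.importances_ = np.array([0.9, 0.2, 0.7, 0.1])   # set by hand; a real
vi.pvalues_ = np.array([0.001, 0.40, 0.03, 0.80])  # estimator computes these

print(vi.pvalue_selection())  # default: p-values below threshold_max=0.05
# -> [ True False  True False]

# threshold_max defaults to 0.05, so clear it when using another criterion;
# otherwise two criteria are passed and the helper raises an AssertionError.
print(vi.pvalue_selection(k_lowest=2, threshold_max=None))
# -> [ True False  True False]  (the two smallest p-values)
```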
return self.selections_

def _check_importance(self):
def fdr_selection(
self,
fdr,
fdr_control="bhq",
reshaping_function=None,
alternative_hypothesis=False,
):
"""
Checks if the importance scores have been computed.
Performs feature selection based on False Discovery Rate (FDR) control.

Parameters
----------
fdr : float
The target false discovery rate level (between 0 and 1)
fdr_control: {'bhq', 'bhy'}, default='bhq'
The FDR control method to use:
- 'bhq': Benjamini-Hochberg procedure
- 'bhy': Benjamini-Hochberg-Yekutieli procedure
reshaping_function: callable or None, default=None
Optional reshaping function for FDR control methods.
If None, defaults to sum of reciprocals for 'bhy'.
alternative_hypothesis : bool or None, default=False
If False, selects features with small p-values.
If True, selects features with large p-values (close to 1).
If None, selects features that have either small or large p-values.

Review thread:
Collaborator: Same thing here, I don't see any reason to consider an alternative hypothesis, since importance tests are all one-sided tests of whether importance is greater than 0 (i.e., significantly different from 0).
Author: This was present in EnCluDL; I added the option to keep the same possibilities.
Collaborator: Yes, but there are good reasons for that: EnCluDL yields a signed statistic, not dCRT.
Author: See issue #481.

Returns
-------
selected : ndarray of bool
Boolean mask of selected features.
True indicates selected features, False indicates non-selected features.

Raises
------
ValueError
If `importances_` has not been computed yet
AssertionError
If `pvalues_` are missing or fdr_control is invalid
"""
if self.importances_ is None:
raise ValueError(
"The importances need to be called before calling this method"
self._check_importance()
assert 0 < fdr < 1, "FDR needs to be strictly between 0 and 1"
assert (
self.pvalues_ is not None
), "FDR-based selection requires p-values to be computed first. The current method does not support p-values."
assert (
fdr_control == "bhq" or fdr_control == "bhy"
), "only 'bhq' and 'bhy' are supported"
assert alternative_hypothesis is None or isinstance(
alternative_hypothesis, bool
), "alternative_hippothesis can have only three values: True, False and None."

# selection on pvalue
if alternative_hypothesis is None or not alternative_hypothesis:
threshold_pvalues = fdr_threshold(
self.pvalues_,
fdr=fdr,
method=fdr_control,
reshaping_function=reshaping_function,
)
selected_pvalues = self.pvalues_ <= threshold_pvalues
else:
selected_pvalues = np.zeros_like(self.pvalues_, dtype=bool)

# selection on 1-pvalue
if alternative_hypothesis is None or alternative_hypothesis:
threshold_one_minus_pvalues = fdr_threshold(
1 - self.pvalues_,
fdr=fdr,
method=fdr_control,
reshaping_function=reshaping_function,
)
selected_one_minus_pvalues = (
1 - self.pvalues_
) <= threshold_one_minus_pvalues
else:
selected_one_minus_pvalues = np.zeros_like(self.pvalues_, dtype=bool)

selected = selected_pvalues | selected_one_minus_pvalues
return selected
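And a hedged sketch for `fdr_selection`, with the textbook Benjamini-Hochberg threshold recomputed by hand for intuition. The by-hand part sketches the standard procedure, not hidimstat's `fdr_threshold` itself, and the attributes are again set manually for illustration:

```python
import numpy as np
from hidimstat.base_variable_importance import BaseVariableImportance

vi = BaseVariableImportance()
vi.importances_ = np.ones(6)  # set by hand, as in the sketches above
vi.pvalues_ = np.array([0.001, 0.002, 0.01, 0.20, 0.60, 0.90])

selected = vi.fdr_selection(fdr=0.05, fdr_control="bhq")
print(selected)  # with standard BH at 5%, the three smallest p-values pass

# Textbook BH threshold for comparison: the largest p(k) with
# p(k) <= k/m * fdr, where p(1) <= ... <= p(m) are the sorted p-values.
p_sorted = np.sort(vi.pvalues_)
m = len(p_sorted)
below = p_sorted <= np.arange(1, m + 1) / m * 0.05
bh_threshold = p_sorted[np.nonzero(below)[0].max()] if below.any() else 0.0
print(bh_threshold)  # 0.01 here, so pvalues_ <= 0.01 are selected
```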

def plot_importance(
self,