Skip to content

Commit 2233583

Browse files
committed
add files
1 parent d6f223e commit 2233583

File tree

3 files changed

+586
-0
lines changed

3 files changed

+586
-0
lines changed

src/hidimstat/marginal/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .leave_one_covariate_in import LeaveOneCovariateIn
2+
3+
# Public names exported by this package (used by `from ... import *`).
__all__ = ["LeaveOneCovariateIn"]
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
import numpy as np
2+
from joblib import Parallel, delayed
3+
from sklearn.base import check_is_fitted, clone
4+
from sklearn.metrics import root_mean_squared_error
5+
from typing import override
6+
7+
from hidimstat._utils.utils import _check_vim_predict_method
8+
from hidimstat.base_variable_importance import (
9+
BaseVariableImportance,
10+
VariableImportanceFeatureGroup,
11+
)
12+
13+
14+
class LeaveOneCovariateIn(BaseVariableImportance, VariableImportanceFeatureGroup):
    """
    Leave One Covariate In (LOCI) variable importance.

    One clone of ``estimator`` is fitted on each group of features taken
    alone. The importance of a group is the loss of a constant reference
    prediction minus the loss of the model fitted on that group only.
    """

    def __init__(
        self,
        estimator,
        loss: callable = root_mean_squared_error,
        method: str = "predict",
        n_jobs: int = 1,
    ):
        """
        Leave One Covariate In.
        For more details, see the section 7.2 of :footcite:t:`ewald2024guide`.

        Parameters
        ----------
        estimator : sklearn compatible estimator
            The estimator to use for the prediction. It is passed to
            ``check_is_fitted`` at construction time, so it must already be
            fitted. Clones of it are refitted on each group of features in
            :meth:`fit`.
        loss : callable, default=root_mean_squared_error
            The function to compute the loss. It is called as
            ``loss(y, y_pred)``, both for the reference prediction and for the
            prediction of each per-group model.
        method : str, default="predict"
            The method used for making predictions. This determines the predictions
            passed to the loss function. Supported methods are "predict",
            "predict_proba", "decision_function", "transform".
        n_jobs : int, default=1
            The number of parallel jobs to run. Parallelization is done over the
            features or groups of features.
        """
        super().__init__()
        check_is_fitted(estimator)
        self.estimator = estimator
        self.loss = loss
        _check_vim_predict_method(method)
        self.method = method
        self.n_jobs = n_jobs
        # generated attributes
        self._list_univariate_model = []
        self.loss_reference_ = None

    @override
    def fit(self, X, y, features_groups=None):
        """
        Fit one clone of the estimator per group of features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            The target values.
        features_groups : dict, optional
            A dictionary where the keys are group identifiers and the values are lists
            of feature indices or names for each group. If None, each feature is
            treated as its own group.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        # The parent ``fit`` is expected to populate ``self._features_groups_ids``
        # (it is not assigned anywhere in this class).
        super().fit(X, y, features_groups)
        X_ = np.asarray(X)
        y_ = np.asarray(y)

        # Fit the per-group models in parallel; results keep the order of
        # ``self._features_groups_ids``.
        self._list_univariate_model = Parallel(n_jobs=self.n_jobs)(
            delayed(self._joblib_fit_one_features_group)(X_, y_, features_groups_ids)
            for features_groups_ids in self._features_groups_ids
        )
        return self

    def predict(self, X):
        """
        Compute the predictions of each per-group model on its own group of
        features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        out : ndarray
            The predictions of every group stacked with ``np.array``; the first
            axis indexes the groups of features and the remaining axes are
            those returned by ``method`` (``(n_samples,)`` for 1-D outputs).
        """
        self._check_fit(X)
        X_ = np.asarray(X)

        # Run the per-group predictions in parallel; the enumerate index
        # selects the matching fitted model.
        out_list = Parallel(n_jobs=self.n_jobs)(
            delayed(self._joblib_predict_one_features_group)(
                X_, features_group_id, features_groups_ids
            )
            for features_group_id, features_groups_ids in enumerate(
                self._features_groups_ids
            )
        )
        return np.array(out_list)

    def importance(self, X, y):
        """
        Compute the marginal importance scores for each group of features.

        The score of a group is ``loss_reference_ - loss(y, y_pred_group)``,
        where ``loss_reference_`` is the loss of a constant prediction.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            The target values.

        Returns
        -------
        importances : list of length n_features_groups
            Marginal importance score of each group of features. The same list
            is stored in ``self.importances_``; ``self.loss_reference_`` holds
            the loss of the reference prediction and ``self.pvalues_`` is set
            to None.
        """
        self._check_fit(X)

        y_pred = self.predict(X)
        first_pred = y_pred[0]

        # Reference: a dummy model that ignores the features.
        if first_pred.ndim == 1 or first_pred.shape[1] == 1:
            # Single output column: predict the average target everywhere.
            y_ref = np.mean(y) * np.ones_like(first_pred)
        else:
            # Several output columns: put all the mass on the most frequent
            # value of y.
            # NOTE(review): assumes the columns follow the sorted order of the
            # labels returned by np.unique -- confirm against the estimator.
            _, counts = np.unique(y, return_counts=True)
            y_ref = np.zeros_like(first_pred)
            y_ref[:, np.argmax(counts)] = 1.0
        self.loss_reference_ = self.loss(y, y_ref)

        self.importances_ = [
            self.loss_reference_ - self.loss(y, y_pred_j) for y_pred_j in y_pred
        ]
        self.pvalues_ = None  # no p-value is estimated by this method
        return self.importances_

    def fit_importance(self, X, y, cv, features_groups=None):
        """
        Fit the model and compute the importance on each fold of ``cv``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data. It is indexed with the index arrays produced by
            ``cv`` (``X[train_index]``).
        y : array-like of shape (n_samples,)
            The target values, indexed the same way as ``X``.
        cv : cross-validation splitter
            Object whose ``split(X, y)`` yields ``(train_index, test_index)``
            pairs, e.g. a scikit-learn splitter.
        features_groups : dict, optional
            A dictionary where the keys are group identifiers and the values are lists
            of feature indices or names for each group. If None, each feature is
            treated as its own group.

        Returns
        -------
        importance : ndarray
            The importance scores averaged over the folds.

        Notes
        -----
        After this call ``importances_``, ``pvalues_`` and
        ``_list_univariate_model`` each hold one entry per fold. Since
        ``_list_univariate_model`` then contains lists of models,
        :meth:`predict` needs a new call to :meth:`fit` before being used again.
        """
        saved_attributes = ("importances_", "pvalues_", "_list_univariate_model")
        per_fold_values = []
        # y is forwarded to the splitter so that splitters which need the
        # targets (e.g. stratified ones) work as well.
        for train_index, test_index in cv.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            self.fit(X_train, y_train, features_groups=features_groups)
            self.importance(X_test, y_test)
            per_fold_values.append(
                [getattr(self, name) for name in saved_attributes]
            )

        # Replace each attribute by the list of its per-fold values.
        for position, name in enumerate(saved_attributes):
            setattr(self, name, [fold[position] for fold in per_fold_values])

        return np.mean(self.importances_, axis=0)

    def _joblib_fit_one_features_group(self, X, y, features_group_ids):
        """
        Helper function to fit a univariate model for a single group.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,)
            The target values.
        features_group_ids : array-like
            The indices of features belonging to this group.

        Returns
        -------
        object
            The fitted univariate model for this group (the value returned by
            the clone's ``fit``).
        """
        univariate_model = clone(self.estimator)
        # Keep only the columns of the group, as a 2-D array.
        X_group = X[:, features_group_ids].reshape(-1, len(features_group_ids))
        return univariate_model.fit(X_group, y)

    def _joblib_predict_one_features_group(
        self, X, index_features_group, features_group_ids
    ):
        """
        Helper function to predict for a single group.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        index_features_group : int
            The index of the group in _list_univariate_model.
        features_group_ids : array-like
            The indices of features belonging to this group.

        Returns
        -------
        array-like
            The output of ``self.method`` called on the model of this group,
            using only the columns of the group.
        """
        group_model = self._list_univariate_model[index_features_group]
        X_group = X[:, features_group_ids].reshape(-1, len(features_group_ids))
        y_pred_loci = getattr(group_model, self.method)(X_group)
        return y_pred_loci

0 commit comments

Comments
 (0)