Skip to content

Commit c7b6430

Browse files
authored
Merge pull request #48 from simai-ml/add-sample_weight
Add sample weight
2 parents 43966bf + 36c8c7d commit c7b6430

File tree

3 files changed

+131
-7
lines changed

3 files changed

+131
-7
lines changed

HISTORY.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
History
33
=======
44

5+
0.2.1 (2021-XX-XX)
6+
------------------
7+
8+
* Add sample_weight argument in fit method
9+
510
0.2.0 (2021-05-21)
611
------------------
712

mapie/estimators.py

Lines changed: 100 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
from __future__ import annotations
22
from typing import Optional, Union, Iterable, Tuple, List
3+
from inspect import signature
34

45
import numpy as np
56
from joblib import Parallel, delayed
67
from sklearn.utils import check_X_y, check_array
7-
from sklearn.utils.validation import check_is_fitted
8+
from sklearn.utils.validation import check_is_fitted, _check_sample_weight
89
from sklearn.base import clone
910
from sklearn.base import BaseEstimator, RegressorMixin
1011
from sklearn.linear_model import LinearRegression
@@ -276,14 +277,91 @@ def _check_alpha(self, alpha: Union[float, Iterable[float]]) -> np.ndarray:
276277
raise ValueError("Invalid alpha. Allowed values are between 0 and 1.")
277278
return alpha_np
278279

280+
def _check_null_weight(
281+
self,
282+
sample_weight: ArrayLike,
283+
X: ArrayLike,
284+
y: ArrayLike
285+
) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
286+
"""
287+
Check sample weights and remove samples with null sample weights.
288+
289+
Parameters
290+
----------
291+
sample_weight : ArrayLike
292+
Sample weights.
293+
X : ArrayLike
294+
Training samples.
295+
y : ArrayLike
296+
Training labels.
297+
298+
Returns
299+
-------
300+
sample_weight : ArrayLike
301+
Non-null sample weights.
302+
X : ArrayLike
303+
Training samples with non-null weights.
304+
y : ArrayLike
305+
Training labels with non-null weights.
306+
"""
307+
if sample_weight is not None:
308+
sample_weight = _check_sample_weight(sample_weight, X)
309+
non_null_weight = sample_weight != 0
310+
X, y = X[non_null_weight, :], y[non_null_weight]
311+
sample_weight = sample_weight[non_null_weight]
312+
return sample_weight, X, y
313+
314+
def _fit_estimator(
315+
self,
316+
estimator: RegressorMixin,
317+
X: ArrayLike,
318+
y: ArrayLike,
319+
supports_sw: bool,
320+
sample_weight: ArrayLike
321+
) -> RegressorMixin:
322+
"""
323+
Fit an estimator on training data by distinguishing two cases:
324+
- the estimator supports sample weights and sample weights are provided.
325+
- the estimator does not support samples weights or samples weights are not provided
326+
327+
Parameters
328+
----------
329+
estimator : RegressorMixin
330+
Estimator to train.
331+
332+
X : ArrayLike of shape (n_samples, n_features)
333+
Input data.
334+
335+
y : ArrayLike of shape (n_samples,)
336+
Input labels.
337+
338+
supports_sw : bool
339+
Whether or not estimator supports sample weights.
340+
341+
sample_weight : ArrayLike of shape (n_samples,)
342+
Sample weights. If None, then samples are equally weighted. By default None.
343+
344+
Returns
345+
-------
346+
RegressorMixin
347+
Fitted estimator.
348+
"""
349+
if sample_weight is not None and supports_sw:
350+
estimator.fit(X, y, sample_weight=sample_weight)
351+
else:
352+
estimator.fit(X, y)
353+
return estimator
354+
279355
def _fit_and_predict_oof_model(
280356
self,
281357
estimator: RegressorMixin,
282358
X: ArrayLike,
283359
y: ArrayLike,
284360
train_index: ArrayLike,
285361
val_index: ArrayLike,
286-
k: int
362+
k: int,
363+
supports_sw: bool,
364+
sample_weight: Optional[ArrayLike] = None
287365
) -> Tuple[RegressorMixin, ArrayLike, ArrayLike, ArrayLike]:
288366
"""
289367
Fit a single out-of-fold model on a given training set and
@@ -309,6 +387,12 @@ def _fit_and_predict_oof_model(
309387
k : int
310388
Split identification number.
311389
390+
supports_sw : bool
391+
Whether or not estimator supports sample weights.
392+
393+
sample_weight : ArrayLike of shape (n_samples,)
394+
Sample weights. If None, then samples are equally weighted. By default None.
395+
312396
Returns
313397
-------
314398
Tuple[RegressorMixin, ArrayLike, ArrayLike, ArrayLike]
@@ -319,12 +403,13 @@ def _fit_and_predict_oof_model(
319403
- [3]: Validation data indices, of shapes (n_samples_val,)
320404
"""
321405
X_train, y_train, X_val = X[train_index], y[train_index], X[val_index]
322-
estimator.fit(X_train, y_train)
406+
sample_weight_train = sample_weight[train_index] if sample_weight is not None else None
407+
estimator = self._fit_estimator(estimator, X_train, y_train, supports_sw, sample_weight_train)
323408
y_pred = estimator.predict(X_val)
324409
val_id = np.full_like(y_pred, k)
325410
return estimator, y_pred, val_id, val_index
326411

327-
def fit(self, X: ArrayLike, y: ArrayLike) -> MapieRegressor:
412+
def fit(self, X: ArrayLike, y: ArrayLike, sample_weight: Optional[ArrayLike] = None) -> MapieRegressor:
328413
"""
329414
Fit estimator and compute residuals used for prediction intervals.
330415
Fit the base estimator under the ``single_estimator_`` attribute.
@@ -339,6 +424,12 @@ def fit(self, X: ArrayLike, y: ArrayLike) -> MapieRegressor:
339424
y : ArrayLike of shape (n_samples,)
340425
Training labels.
341426
427+
sample_weight : ArrayLike of shape (n_samples,), default=None
428+
Sample weights for fitting the out-of-fold models. If None, then samples are equally weighted.
429+
If some weights are null, their corresponding observations are removed before the fitting process and
430+
hence have no residuals.
431+
If weights are non-uniform, residuals are still uniformly weighted.
432+
342433
Returns
343434
-------
344435
MapieRegressor
@@ -348,17 +439,20 @@ def fit(self, X: ArrayLike, y: ArrayLike) -> MapieRegressor:
348439
cv = self._check_cv(self.cv)
349440
estimator = self._check_estimator(self.estimator)
350441
X, y = check_X_y(X, y, force_all_finite=False, dtype=["float64", "object"])
442+
fit_parameters = signature(estimator.fit).parameters
443+
supports_sw = "sample_weight" in fit_parameters
444+
sample_weight, X, y = self._check_null_weight(sample_weight, X, y)
351445
y_pred = np.empty_like(y, dtype=float)
352446
self.estimators_: List[RegressorMixin] = []
353447
self.n_features_in_ = X.shape[1]
354448
self.k_ = np.empty_like(y, dtype=int)
355-
self.single_estimator_ = clone(estimator).fit(X, y)
449+
self.single_estimator_ = self._fit_estimator(clone(estimator), X, y, supports_sw, sample_weight)
356450
if self.method == "naive":
357451
y_pred = self.single_estimator_.predict(X)
358452
else:
359453
cv_outputs = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
360454
delayed(self._fit_and_predict_oof_model)(
361-
clone(estimator), X, y, train_index, val_index, k
455+
clone(estimator), X, y, train_index, val_index, k, supports_sw, sample_weight
362456
) for k, (train_index, val_index) in enumerate(cv.split(X))
363457
)
364458
self.estimators_, predictions, val_ids, val_indices = map(list, zip(*cv_outputs))

mapie/tests/test_estimators.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import Any, Union, Optional
22
from typing_extensions import TypedDict
3+
from inspect import signature
34

45
import pytest
56
import numpy as np
@@ -54,7 +55,7 @@
5455
SKLEARN_EXCLUDED_CHECKS = {
5556
"check_regressors_train",
5657
"check_pipeline_consistency",
57-
"check_fit_score_takes_y"
58+
"check_fit_score_takes_y",
5859
}
5960

6061

@@ -75,6 +76,13 @@ def test_default_parameters() -> None:
7576
assert mapie.n_jobs is None
7677

7778

79+
def test_default_sample_weight() -> None:
    """Check that ``fit`` runs without weights and that ``sample_weight`` defaults to None."""
    regressor = MapieRegressor()
    regressor.fit(X_toy, y_toy)
    fit_params = signature(regressor.fit).parameters
    assert fit_params["sample_weight"].default is None
84+
85+
7886
def test_fit() -> None:
7987
"""Test that fit raises no errors."""
8088
mapie = MapieRegressor()
@@ -354,3 +362,20 @@ def test_results_single_and_multi_jobs(strategy: str) -> None:
354362
y_preds_single = mapie_single.predict(X_toy)
355363
y_preds_multi = mapie_multi.predict(X_toy)
356364
np.testing.assert_almost_equal(y_preds_single, y_preds_multi)
365+
366+
367+
@pytest.mark.parametrize("strategy", [*STRATEGIES])
def test_results_with_constant_sample_weights(strategy: str) -> None:
    """
    Check that predictions agree whether sample weights are None,
    all ones, or all equal to another constant.
    """
    n_samples = len(X_reg)
    # One weighting per fitted model: none, unit weights, constant weights of 5.
    candidate_weights = [
        None,
        np.ones(shape=n_samples),
        np.ones(shape=n_samples)*5,
    ]
    models = []
    for weights in candidate_weights:
        model = MapieRegressor(alpha=0.05, **STRATEGIES[strategy])
        model.fit(X_reg, y_reg, sample_weight=weights)
        models.append(model)
    preds_none, preds_ones, preds_fives = [
        model.predict(X_reg) for model in models
    ]
    np.testing.assert_almost_equal(preds_none, preds_ones)
    np.testing.assert_almost_equal(preds_ones, preds_fives)

0 commit comments

Comments
 (0)