Skip to content

Commit f7e477d

Browse files
authored
ENH add validate parameter to FunctionSampler (#637)
* add whats new * add whats new * fix * always import pandas * test documentation only with all dependencies installed * ENH add validate parameter to FunctionSampler * DOC add whats new and parameter in user guide * create X y for regression
1 parent 0493258 commit f7e477d

File tree

4 files changed

+98
-8
lines changed

4 files changed

+98
-8
lines changed

doc/miscellaneous.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,23 @@ to retain the 10 first elements of the array ``X`` and ``y``::
3434
>>> np.all(y_res == y[:10])
3535
True
3636

37+
In addition, the parameter ``validate`` controls input checking. For instance,
38+
setting ``validate=False`` allows passing any type of target ``y`` and doing some
39+
sampling for regression targets.
40+
41+
>>> from sklearn.datasets import make_regression
42+
>>> X_reg, y_reg = make_regression(n_samples=100, random_state=42)
43+
>>> rng = np.random.RandomState(42)
44+
>>> def dummy_sampler(X, y):
45+
... indices = rng.choice(np.arange(X.shape[0]), size=10)
46+
... return X[indices], y[indices]
47+
>>> sampler = FunctionSampler(func=dummy_sampler, validate=False)
48+
>>> X_res, y_res = sampler.fit_resample(X_reg, y_reg)
49+
>>> y_res
50+
array([ 41.49112498, -142.78526195, 85.55095317, 141.43321419,
51+
75.46571114, -67.49177372, 159.72700509, -169.80498923,
52+
211.95889757, 211.95889757])
53+
3754
We illustrate the use of such sampler to implement an outlier rejection
3855
estimator which can be easily used within a
3956
:class:`imblearn.pipeline.Pipeline`:

doc/whats_new/v0.6.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,15 @@ Enhancement
4242
...........
4343

4444
- :class:`imblearn.under_sampling.RandomUnderSampler`,
45-
:class:`imblearn.over_sampling.RandomOverSampling`,,
45+
:class:`imblearn.over_sampling.RandomOverSampler`,
4646
:class:`imblearn.datasets.make_imbalance` accepts Pandas DataFrame in and
4747
will output Pandas DataFrame.
4848
:pr:`636` by :user:`Guillaume Lemaitre <glemaitre>`.
4949

50+
- :class:`imblearn.FunctionSampler` accepts a parameter ``validate`` controlling
51+
whether or not the input ``X`` and ``y`` are checked.
52+
:pr:`637` by :user:`Guillaume Lemaitre <glemaitre>`.
53+
5054
Deprecation
5155
...........
5256

imblearn/base.py

Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,11 @@ def __init__(self, sampling_strategy="auto"):
127127
self.sampling_strategy = sampling_strategy
128128

129129
@staticmethod
130-
def _check_X_y(X, y):
130+
def _check_X_y(X, y, accept_sparse=None):
131+
if accept_sparse is None:
132+
accept_sparse = ["csr", "csc"]
131133
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
132-
X, y = check_X_y(X, y, accept_sparse=["csr", "csc"])
134+
X, y = check_X_y(X, y, accept_sparse=accept_sparse)
133135
return X, y, binarize_y
134136

135137

@@ -156,6 +158,11 @@ class FunctionSampler(BaseSampler):
156158
kw_args : dict, optional (default=None)
157159
The keyword argument expected by ``func``.
158160
161+
validate : bool, default=True
162+
Whether or not to validate ``X`` and ``y``. Turning off
163+
validation allows using the ``FunctionSampler`` with any type of
164+
data.
165+
159166
Notes
160167
-----
161168
@@ -202,16 +209,55 @@ class FunctionSampler(BaseSampler):
202209

203210
_sampling_type = "bypass"
204211

205-
def __init__(self, func=None, accept_sparse=True, kw_args=None):
212+
def __init__(self, func=None, accept_sparse=True, kw_args=None,
213+
validate=True):
206214
super().__init__()
207215
self.func = func
208216
self.accept_sparse = accept_sparse
209217
self.kw_args = kw_args
218+
self.validate = validate
210219

211-
def _fit_resample(self, X, y):
212-
X, y = check_X_y(
213-
X, y, accept_sparse=["csr", "csc"] if self.accept_sparse else False
220+
def fit_resample(self, X, y):
221+
"""Resample the dataset.
222+
223+
Parameters
224+
----------
225+
X : {array-like, sparse matrix}, shape (n_samples, n_features)
226+
Matrix containing the data which have to be sampled.
227+
228+
y : array-like, shape (n_samples,)
229+
Corresponding label for each sample in X.
230+
231+
Returns
232+
-------
233+
X_resampled : {array-like, sparse matrix}, shape \
234+
(n_samples_new, n_features)
235+
The array containing the resampled data.
236+
237+
y_resampled : array-like, shape (n_samples_new,)
238+
The corresponding label of `X_resampled`.
239+
240+
"""
241+
if self.validate:
242+
check_classification_targets(y)
243+
X, y, binarize_y = self._check_X_y(
244+
X, y, accept_sparse=self.accept_sparse
245+
)
246+
247+
self.sampling_strategy_ = check_sampling_strategy(
248+
self.sampling_strategy, y, self._sampling_type
214249
)
250+
251+
output = self._fit_resample(X, y)
252+
253+
if self.validate and binarize_y:
254+
y_sampled = label_binarize(output[1], np.unique(y))
255+
if len(output) == 2:
256+
return output[0], y_sampled
257+
return output[0], y_sampled, output[2]
258+
return output
259+
260+
def _fit_resample(self, X, y):
215261
func = _identity if self.func is None else self.func
216262
output = func(X, y, **(self.kw_args if self.kw_args else {}))
217263
return output

imblearn/tests/test_base.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,23 @@
55

66
import pytest
77

8+
import numpy as np
89
from scipy import sparse
910

1011
from sklearn.datasets import load_iris
12+
from sklearn.datasets import make_regression
13+
from sklearn.linear_model import LinearRegression
14+
from sklearn.utils import _safe_indexing
15+
from sklearn.utils.multiclass import type_of_target
1116
from sklearn.utils._testing import assert_array_equal
1217
from sklearn.utils._testing import assert_allclose_dense_sparse
1318

1419
from imblearn.datasets import make_imbalance
15-
from imblearn import FunctionSampler
20+
from imblearn.pipeline import make_pipeline
1621
from imblearn.under_sampling import RandomUnderSampler
1722

23+
from imblearn import FunctionSampler
24+
1825
iris = load_iris()
1926
X, y = make_imbalance(
2027
iris.data, iris.target, sampling_strategy={0: 10, 1: 25}, random_state=0
@@ -71,3 +78,19 @@ def func(X, y, sampling_strategy, random_state):
7178
X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y)
7279
assert_allclose_dense_sparse(X_res, X_res_2)
7380
assert_array_equal(y_res, y_res_2)
81+
82+
83+
def test_function_sampler_validate():
84+
# check that we can pass a regression target by turning off the
85+
# validation
86+
X, y = make_regression()
87+
88+
def dummy_sampler(X, y):
89+
indices = np.random.choice(np.arange(X.shape[0]), size=100)
90+
return _safe_indexing(X, indices), _safe_indexing(y, indices)
91+
92+
sampler = FunctionSampler(func=dummy_sampler, validate=False)
93+
pipeline = make_pipeline(sampler, LinearRegression())
94+
y_pred = pipeline.fit(X, y).predict(X)
95+
96+
assert type_of_target(y_pred) == 'continuous'

0 commit comments

Comments
 (0)