ENH add validate parameter to FunctionSampler (#637)

glemaitre · web-flow · commit f7e477d89a15 · 2019-11-15T07:49:42.000+01:00
* add whats new

* add whats new

* fix

* always import pandas

* test documentation only with all dependencies installed

* ENH add validate parameter to FunctionSampler

* DOC add whats new and parameter in user guide

* create X y for regression
diff --git a/doc/miscellaneous.rst b/doc/miscellaneous.rst
@@ -34,6 +34,23 @@ to retain the 10 first elements of the array ``X`` and ``y``::
   >>> np.all(y_res == y[:10])
   True
 
+In addition, the parameter ``validate`` controls input checking. For instance,
+setting ``validate=False`` allows passing any type of target ``y``, making it
+possible to resample regression targets.
+
+  >>> from sklearn.datasets import make_regression
+  >>> X_reg, y_reg = make_regression(n_samples=100, random_state=42)
+  >>> rng = np.random.RandomState(42)
+  >>> def dummy_sampler(X, y):
+  ...     indices = rng.choice(np.arange(X.shape[0]), size=10)
+  ...     return X[indices], y[indices]
+  >>> sampler = FunctionSampler(func=dummy_sampler, validate=False)
+  >>> X_res, y_res = sampler.fit_resample(X_reg, y_reg)
+  >>> y_res
+  array([  41.49112498, -142.78526195,   85.55095317,  141.43321419,
+           75.46571114,  -67.49177372,  159.72700509, -169.80498923,
+          211.95889757,  211.95889757])
+
 We illustrate the use of such sampler to implement an outlier rejection
 estimator which can be easily used within a
 :class:`imblearn.pipeline.Pipeline`:
diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst
@@ -42,11 +42,15 @@ Enhancement
 ...........
 
 - :class:`imblearn.under_sampling.RandomUnderSampling`,
-  :class:`imblearn.over_sampling.RandomOverSampling`,,
+  :class:`imblearn.over_sampling.RandomOverSampling`,
   :class:`imblearn.datasets.make_imbalance` accepts Pandas DataFrame in and
   will output Pandas DataFrame.
   :pr:`636` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- :class:`imblearn.FunctionSampler` accepts a parameter ``validate`` that
+  controls whether the input ``X`` and ``y`` are checked.
+  :pr:`637` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Deprecation
 ...........
 
diff --git a/imblearn/base.py b/imblearn/base.py
@@ -127,9 +127,11 @@ def __init__(self, sampling_strategy="auto"):
         self.sampling_strategy = sampling_strategy
 
     @staticmethod
-    def _check_X_y(X, y):
+    def _check_X_y(X, y, accept_sparse=None):
+        if accept_sparse is None:
+            accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = check_X_y(X, y, accept_sparse=["csr", "csc"])
+        X, y = check_X_y(X, y, accept_sparse=accept_sparse)
         return X, y, binarize_y
 
 
@@ -156,6 +158,11 @@ class FunctionSampler(BaseSampler):
     kw_args : dict, optional (default=None)
         The keyword argument expected by ``func``.
 
+    validate : bool, default=True
+        Whether or not to validate ``X`` and ``y``. Turning off validation
+        allows the ``FunctionSampler`` to be used with any type of
+        data.
+
     Notes
     -----
 
@@ -202,16 +209,55 @@ class FunctionSampler(BaseSampler):
 
     _sampling_type = "bypass"
 
-    def __init__(self, func=None, accept_sparse=True, kw_args=None):
+    def __init__(self, func=None, accept_sparse=True, kw_args=None,
+                 validate=True):
         super().__init__()
         self.func = func
         self.accept_sparse = accept_sparse
         self.kw_args = kw_args
+        self.validate = validate
 
-    def _fit_resample(self, X, y):
-        X, y = check_X_y(
-            X, y, accept_sparse=["csr", "csc"] if self.accept_sparse else False
+    def fit_resample(self, X, y):
+        """Resample the dataset.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : array-like, shape (n_samples,)
+            Corresponding label for each sample in X.
+
+        Returns
+        -------
+        X_resampled : {array-like, sparse matrix}, shape \
+(n_samples_new, n_features)
+            The array containing the resampled data.
+
+        y_resampled : array-like, shape (n_samples_new,)
+            The corresponding label of `X_resampled`.
+
+        """
+        if self.validate:
+            check_classification_targets(y)
+            X, y, binarize_y = self._check_X_y(
+                X, y, accept_sparse=self.accept_sparse
+            )
+
+        self.sampling_strategy_ = check_sampling_strategy(
+            self.sampling_strategy, y, self._sampling_type
         )
+
+        output = self._fit_resample(X, y)
+
+        if self.validate and binarize_y:
+            y_sampled = label_binarize(output[1], np.unique(y))
+            if len(output) == 2:
+                return output[0], y_sampled
+            return output[0], y_sampled, output[2]
+        return output
+
+    def _fit_resample(self, X, y):
         func = _identity if self.func is None else self.func
         output = func(X, y, **(self.kw_args if self.kw_args else {}))
         return output
diff --git a/imblearn/tests/test_base.py b/imblearn/tests/test_base.py
@@ -5,16 +5,23 @@
 
 import pytest
 
+import numpy as np
 from scipy import sparse
 
 from sklearn.datasets import load_iris
+from sklearn.datasets import make_regression
+from sklearn.linear_model import LinearRegression
+from sklearn.utils import _safe_indexing
+from sklearn.utils.multiclass import type_of_target
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import assert_allclose_dense_sparse
 
 from imblearn.datasets import make_imbalance
-from imblearn import FunctionSampler
+from imblearn.pipeline import make_pipeline
 from imblearn.under_sampling import RandomUnderSampler
 
+from imblearn import FunctionSampler
+
 iris = load_iris()
 X, y = make_imbalance(
     iris.data, iris.target, sampling_strategy={0: 10, 1: 25}, random_state=0
@@ -71,3 +78,19 @@ def func(X, y, sampling_strategy, random_state):
     X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y)
     assert_allclose_dense_sparse(X_res, X_res_2)
     assert_array_equal(y_res, y_res_2)
+
+
+def test_function_sampler_validate():
+    # check that we can pass a regression target by turning off the
+    # validation
+    X, y = make_regression()
+
+    def dummy_sampler(X, y):
+        indices = np.random.choice(np.arange(X.shape[0]), size=100)
+        return _safe_indexing(X, indices), _safe_indexing(y, indices)
+
+    sampler = FunctionSampler(func=dummy_sampler, validate=False)
+    pipeline = make_pipeline(sampler, LinearRegression())
+    y_pred = pipeline.fit(X, y).predict(X)
+
+    assert type_of_target(y_pred) == 'continuous'