ENH RandomUnderSampler/RandomOverSampler/make_imbalance accept pandas DataFrame in/out

glemaitre · glemaitre · commit 0493258bdb4d · 2019-11-15T00:30:07.000+01:00
diff --git a/.travis.yml b/.travis.yml
@@ -30,17 +30,17 @@ matrix:
   include:
     # This environment tests the using anaconda
     # Ubuntu 14.04 environment
-    - env: DISTRIB="ubuntu"
+    - env: DISTRIB="ubuntu" TEST_DOC="true"
     # Latest release
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
            NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master"
-           OPTIONAL_DEPS="keras"
+           OPTIONAL_DEPS="keras" TEST_DOC="true"
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
            NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master"
-           OPTIONAL_DEPS="tensorflow"
+           OPTIONAL_DEPS="tensorflow" TEST_DOC="true"
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
            NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master"
-           OPTIONAL_DEPS="false"
+           OPTIONAL_DEPS="false" TEST_DOC="false"
 
 install: source build_tools/travis/install.sh
 script: bash build_tools/travis/test_script.sh
diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh
@@ -25,7 +25,9 @@ run_tests(){
 
     # Test doc
     cd $OLDPWD
-    make test-doc
+    if [[ "$TEST_DOC" == "true" ]]; then
+        make test-doc
+    fi
 }
 
 if [[ "$SKIP_TESTS" != "true" ]]; then
diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst
@@ -131,5 +131,23 @@ Instead of a dictionary, a function can be defined and directly pass to
   >>> sorted(Counter(y_imb).items())
   [(0, 25), (1, 35), (2, 47)]
 
+It also works with pandas DataFrames::
+
+  >>> from sklearn.datasets import fetch_openml
+  >>> df, y = fetch_openml(
+  ...     'iris', version=1, return_X_y=True, as_frame=True)
+  >>> df_resampled, y_resampled = make_imbalance(
+  ...     df, y, sampling_strategy={'Iris-setosa': 10, 'Iris-versicolor': 20},
+  ...     random_state=42)
+  >>> df_resampled.head()
+          sepallength  sepalwidth  petallength  petalwidth
+    13          4.3         3.0          1.1         0.1
+    39          5.1         3.4          1.5         0.2
+    30          4.8         3.1          1.6         0.2
+    45          4.8         3.0          1.4         0.3
+    17          5.1         3.5          1.4         0.3
+  >>> Counter(y_resampled)
+  Counter({'Iris-virginica': 50, 'Iris-versicolor': 20, 'Iris-setosa': 10})
+
 See :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py` and
 :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`.
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
@@ -71,6 +71,15 @@ In addition, :class:`RandomOverSampler` allows to sample heterogeneous data
   >>> print(y_resampled)
   [0 0 1 1]
 
+It also works with pandas DataFrames::
+
+  >>> from sklearn.datasets import fetch_openml
+  >>> df_adult, y_adult = fetch_openml(
+  ...     'adult', version=2, as_frame=True, return_X_y=True)
+  >>> df_adult.head()  # doctest: +SKIP
+  >>> df_resampled, y_resampled = ros.fit_resample(df_adult, y_adult)
+  >>> df_resampled.head()  # doctest: +SKIP
+
 .. _smote_adasyn:
 
 From random over-sampling to SMOTE and ADASYN
diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
@@ -116,6 +116,15 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data
   >>> print(y_resampled)
   [0 1]
 
+It also works with pandas DataFrames::
+
+  >>> from sklearn.datasets import fetch_openml
+  >>> df_adult, y_adult = fetch_openml(
+  ...     'adult', version=2, as_frame=True, return_X_y=True)
+  >>> df_adult.head()  # doctest: +SKIP
+  >>> df_resampled, y_resampled = rus.fit_resample(df_adult, y_adult)
+  >>> df_resampled.head()  # doctest: +SKIP
+
 :class:`NearMiss` adds some heuristic rules to select samples [MZ2003]_.
 :class:`NearMiss` implements 3 different types of heuristic which can be
 selected with the parameter ``version``::
diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst
@@ -38,6 +38,15 @@ Maintenance
   parameters `max_samples` and `ccp_alpha`.
   :pr:`621` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+Enhancement
+...........
+
+- :class:`imblearn.under_sampling.RandomUnderSampler`,
+  :class:`imblearn.over_sampling.RandomOverSampler`, and
+  :func:`imblearn.datasets.make_imbalance` accept a pandas DataFrame as input
+  and return a pandas DataFrame.
+  :pr:`636` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Deprecation
 ...........
 
diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py
@@ -7,8 +7,6 @@
 
 from collections import Counter
 
-from sklearn.utils import check_X_y
-
 from ..under_sampling import RandomUnderSampler
 from ..utils import check_sampling_strategy
 
@@ -26,7 +24,7 @@ def make_imbalance(
 
     Parameters
     ----------
-    X : ndarray, shape (n_samples, n_features)
+    X : {array-like, dataframe}, shape (n_samples, n_features)
         Matrix containing the data to be imbalanced.
 
     y : ndarray, shape (n_samples, )
@@ -58,7 +56,7 @@ def make_imbalance(
 
     Returns
     -------
-    X_resampled : ndarray, shape (n_samples_new, n_features)
+    X_resampled : {ndarray, dataframe}, shape (n_samples_new, n_features)
         The array containing the imbalanced data.
 
     y_resampled : ndarray, shape (n_samples_new)
@@ -88,7 +86,6 @@ def make_imbalance(
     Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
 
     """
-    X, y = check_X_y(X, y)
     target_stats = Counter(y)
     # restrict ratio to be a dict or a callable
     if isinstance(sampling_strategy, dict) or callable(sampling_strategy):
diff --git a/imblearn/datasets/tests/test_imbalance.py b/imblearn/datasets/tests/test_imbalance.py
@@ -9,6 +9,7 @@
 import numpy as np
 
 from sklearn.datasets import load_iris
+from sklearn.datasets import fetch_openml
 
 from imblearn.datasets import make_imbalance
 
@@ -52,3 +53,22 @@ def test_make_imbalance_dict(iris, sampling_strategy, expected_counts):
     X, y = iris
     _, y_ = make_imbalance(X, y, sampling_strategy=sampling_strategy)
     assert Counter(y_) == expected_counts
+
+
+@pytest.mark.parametrize("as_frame", [True, False], ids=['dataframe', 'array'])
+@pytest.mark.parametrize(
+    "sampling_strategy, expected_counts",
+    [
+        ({'Iris-setosa': 10, 'Iris-versicolor': 20, 'Iris-virginica': 30},
+         {'Iris-setosa': 10, 'Iris-versicolor': 20, 'Iris-virginica': 30}),
+        ({'Iris-setosa': 10, 'Iris-versicolor': 20},
+         {'Iris-setosa': 10, 'Iris-versicolor': 20, 'Iris-virginica': 50}),
+    ],
+)
+def test_make_imbalanced_iris(as_frame, sampling_strategy, expected_counts):
+    pytest.importorskip("pandas")
+    X, y = fetch_openml('iris', version=1, return_X_y=True, as_frame=as_frame)
+    X_res, y_res = make_imbalance(X, y, sampling_strategy=sampling_strategy)
+    if as_frame:
+        assert hasattr(X_res, "loc")
+    assert Counter(y_res) == expected_counts
diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py
@@ -7,7 +7,7 @@
 from collections import Counter
 
 import numpy as np
-from sklearn.utils import check_X_y
+from sklearn.utils import check_array
 from sklearn.utils import check_random_state
 from sklearn.utils import _safe_indexing
 
@@ -74,7 +74,12 @@ def __init__(self, sampling_strategy="auto", random_state=None):
     @staticmethod
     def _check_X_y(X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
+        if not hasattr(X, "loc"):
+            # Do not convert dataframe
+            X = check_array(X, accept_sparse=["csr", "csc"], dtype=None)
+        y = check_array(
+            y, accept_sparse=["csr", "csc"], dtype=None, ensure_2d=False
+        )
         return X, y, binarize_y
 
     def _fit_resample(self, X, y):
diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py
@@ -6,6 +6,8 @@
 from collections import Counter
 
 import numpy as np
+import pytest
+
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_array_equal
 
@@ -37,9 +39,15 @@ def test_ros_init():
     assert ros.random_state == RND_SEED
 
 
-def test_ros_fit_resample():
+@pytest.mark.parametrize("as_frame", [True, False], ids=['dataframe', 'array'])
+def test_ros_fit_resample(as_frame):
+    if as_frame:
+        pd = pytest.importorskip("pandas")
+        X_ = pd.DataFrame(X)
+    else:
+        X_ = X
     ros = RandomOverSampler(random_state=RND_SEED)
-    X_resampled, y_resampled = ros.fit_resample(X, Y)
+    X_resampled, y_resampled = ros.fit_resample(X_, Y)
     X_gt = np.array(
         [
             [0.04352327, -0.20515826],
@@ -59,6 +67,11 @@ def test_ros_fit_resample():
         ]
     )
     y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
+
+    if as_frame:
+        assert hasattr(X_resampled, "loc")
+        X_resampled = X_resampled.to_numpy()
+
     assert_allclose(X_resampled, X_gt)
     assert_array_equal(y_resampled, y_gt)
 
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -80,7 +80,9 @@ def __init__(
     @staticmethod
     def _check_X_y(X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X = check_array(X, accept_sparse=["csr", "csc"], dtype=None)
+        if not hasattr(X, "loc"):
+            # Do not convert dataframe
+            X = check_array(X, accept_sparse=["csr", "csc"], dtype=None)
         y = check_array(
             y, accept_sparse=["csr", "csc"], dtype=None, ensure_2d=False
         )
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py
@@ -6,6 +6,8 @@
 from collections import Counter
 
 import numpy as np
+import pytest
+
 from sklearn.utils._testing import assert_array_equal
 
 from imblearn.under_sampling import RandomUnderSampler
@@ -28,9 +30,15 @@
 Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])
 
 
-def test_rus_fit_resample():
+@pytest.mark.parametrize("as_frame", [True, False], ids=['dataframe', 'array'])
+def test_rus_fit_resample(as_frame):
+    if as_frame:
+        pd = pytest.importorskip("pandas")
+        X_ = pd.DataFrame(X)
+    else:
+        X_ = X
     rus = RandomUnderSampler(random_state=RND_SEED, replacement=True)
-    X_resampled, y_resampled = rus.fit_resample(X, Y)
+    X_resampled, y_resampled = rus.fit_resample(X_, Y)
 
     X_gt = np.array(
         [
@@ -44,6 +52,10 @@ def test_rus_fit_resample():
     )
     y_gt = np.array([0, 0, 0, 1, 1, 1])
 
+    if as_frame:
+        assert hasattr(X_resampled, "loc")
+        X_resampled = X_resampled.to_numpy()
+
     assert_array_equal(X_resampled, X_gt)
     assert_array_equal(y_resampled, y_gt)
 

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,9 @@ run_tests(){`
`25`	`25`
`26`	`26`	`# Test doc`
`27`	`27`	`cd $OLDPWD`
`28`		`- make test-doc`
	`28`	`+ if [[ "$TEST_DOC" == "true" ]]; then`
	`29`	`+ make test-doc`
	`30`	`+ fi`
`29`	`31`	`}`
`30`	`32`
`31`	`33`	`if [[ "$SKIP_TESTS" != "true" ]]; then`