Skip to content

Commit 0493258

Browse files
committed
ENH RandomUnderSampler/RandomOverSampler/make_imbalance accepts pandas dataframe in/out
1 parent afbf781 commit 0493258

File tree

12 files changed

+113
-17
lines changed

12 files changed

+113
-17
lines changed

.travis.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,17 +30,17 @@ matrix:
3030
include:
3131
# This environment runs the tests using Anaconda
3232
# Ubuntu 14.04 environment
33-
- env: DISTRIB="ubuntu"
33+
- env: DISTRIB="ubuntu" TEST_DOC="true"
3434
# Latest release
3535
- env: DISTRIB="conda" PYTHON_VERSION="3.7"
3636
NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master"
37-
OPTIONAL_DEPS="keras"
37+
OPTIONAL_DEPS="keras" TEST_DOC="true"
3838
- env: DISTRIB="conda" PYTHON_VERSION="3.7"
3939
NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master"
40-
OPTIONAL_DEPS="tensorflow"
40+
OPTIONAL_DEPS="tensorflow" TEST_DOC="true"
4141
- env: DISTRIB="conda" PYTHON_VERSION="3.7"
4242
NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master"
43-
OPTIONAL_DEPS="false"
43+
OPTIONAL_DEPS="false" TEST_DOC="false"
4444

4545
install: source build_tools/travis/install.sh
4646
script: bash build_tools/travis/test_script.sh

build_tools/travis/test_script.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@ run_tests(){
2525

2626
# Test doc
2727
cd $OLDPWD
28-
make test-doc
28+
if [[ "$TEST_DOC" == "true" ]]; then
29+
make test-doc
30+
fi
2931
}
3032

3133
if [[ "$SKIP_TESTS" != "true" ]]; then

doc/datasets/index.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,5 +131,23 @@ Instead of a dictionary, a function can be defined and directly pass to
131131
>>> sorted(Counter(y_imb).items())
132132
[(0, 25), (1, 35), (2, 47)]
133133

134+
It also works with a pandas DataFrame::
135+
136+
>>> from sklearn.datasets import fetch_openml
137+
>>> df, y = fetch_openml(
138+
... 'iris', version=1, return_X_y=True, as_frame=True)
139+
>>> df_resampled, y_resampled = make_imbalance(
140+
... df, y, sampling_strategy={'Iris-setosa': 10, 'Iris-versicolor': 20},
141+
... random_state=42)
142+
>>> df_resampled.head()
143+
sepallength sepalwidth petallength petalwidth
144+
13 4.3 3.0 1.1 0.1
145+
39 5.1 3.4 1.5 0.2
146+
30 4.8 3.1 1.6 0.2
147+
45 4.8 3.0 1.4 0.3
148+
17 5.1 3.5 1.4 0.3
149+
>>> Counter(y_resampled)
150+
Counter({'Iris-virginica': 50, 'Iris-versicolor': 20, 'Iris-setosa': 10})
151+
134152
See :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py` and
135153
:ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`.

doc/over_sampling.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,15 @@ In addition, :class:`RandomOverSampler` allows to sample heterogeneous data
7171
>>> print(y_resampled)
7272
[0 0 1 1]
7373

74+
It also works with a pandas DataFrame::
75+
76+
>>> from sklearn.datasets import fetch_openml
77+
>>> df_adult, y_adult = fetch_openml(
78+
... 'adult', version=2, as_frame=True, return_X_y=True)
79+
>>> df_adult.head() # doctest: +SKIP
80+
>>> df_resampled, y_resampled = ros.fit_resample(df_adult, y_adult)
81+
>>> df_resampled.head() # doctest: +SKIP
82+
7483
.. _smote_adasyn:
7584

7685
From random over-sampling to SMOTE and ADASYN

doc/under_sampling.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,15 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data
116116
>>> print(y_resampled)
117117
[0 1]
118118

119+
It also works with a pandas DataFrame::
120+
121+
>>> from sklearn.datasets import fetch_openml
122+
>>> df_adult, y_adult = fetch_openml(
123+
... 'adult', version=2, as_frame=True, return_X_y=True)
124+
>>> df_adult.head() # doctest: +SKIP
125+
>>> df_resampled, y_resampled = rus.fit_resample(df_adult, y_adult)
126+
>>> df_resampled.head() # doctest: +SKIP
127+
119128
:class:`NearMiss` adds some heuristic rules to select samples [MZ2003]_.
120129
:class:`NearMiss` implements 3 different types of heuristic which can be
121130
selected with the parameter ``version``::

doc/whats_new/v0.6.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ Maintenance
3838
parameters `max_samples` and `ccp_alpha`.
3939
:pr:`621` by :user:`Guillaume Lemaitre <glemaitre>`.
4040

41+
Enhancement
42+
...........
43+
44+
- :class:`imblearn.under_sampling.RandomUnderSampler`,
45+
:class:`imblearn.over_sampling.RandomOverSampler`,
46+
:func:`imblearn.datasets.make_imbalance` accept Pandas DataFrame in and
47+
will output Pandas DataFrame.
48+
:pr:`636` by :user:`Guillaume Lemaitre <glemaitre>`.
49+
4150
Deprecation
4251
...........
4352

imblearn/datasets/_imbalance.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77

88
from collections import Counter
99

10-
from sklearn.utils import check_X_y
11-
1210
from ..under_sampling import RandomUnderSampler
1311
from ..utils import check_sampling_strategy
1412

@@ -26,7 +24,7 @@ def make_imbalance(
2624
2725
Parameters
2826
----------
29-
X : ndarray, shape (n_samples, n_features)
27+
X : {array-like, dataframe}, shape (n_samples, n_features)
3028
Matrix containing the data to be imbalanced.
3129
3230
y : ndarray, shape (n_samples, )
@@ -58,7 +56,7 @@ def make_imbalance(
5856
5957
Returns
6058
-------
61-
X_resampled : ndarray, shape (n_samples_new, n_features)
59+
X_resampled : {ndarray, dataframe}, shape (n_samples_new, n_features)
6260
The array containing the imbalanced data.
6361
6462
y_resampled : ndarray, shape (n_samples_new)
@@ -88,7 +86,6 @@ def make_imbalance(
8886
Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
8987
9088
"""
91-
X, y = check_X_y(X, y)
9289
target_stats = Counter(y)
9390
# restrict ratio to be a dict or a callable
9491
if isinstance(sampling_strategy, dict) or callable(sampling_strategy):

imblearn/datasets/tests/test_imbalance.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010

1111
from sklearn.datasets import load_iris
12+
from sklearn.datasets import fetch_openml
1213

1314
from imblearn.datasets import make_imbalance
1415

@@ -52,3 +53,22 @@ def test_make_imbalance_dict(iris, sampling_strategy, expected_counts):
5253
X, y = iris
5354
_, y_ = make_imbalance(X, y, sampling_strategy=sampling_strategy)
5455
assert Counter(y_) == expected_counts
56+
57+
58+
@pytest.mark.parametrize("as_frame", [True, False], ids=['dataframe', 'array'])
59+
@pytest.mark.parametrize(
60+
"sampling_strategy, expected_counts",
61+
[
62+
({'Iris-setosa': 10, 'Iris-versicolor': 20, 'Iris-virginica': 30},
63+
{'Iris-setosa': 10, 'Iris-versicolor': 20, 'Iris-virginica': 30}),
64+
({'Iris-setosa': 10, 'Iris-versicolor': 20},
65+
{'Iris-setosa': 10, 'Iris-versicolor': 20, 'Iris-virginica': 50}),
66+
],
67+
)
68+
def test_make_imbalanced_iris(as_frame, sampling_strategy, expected_counts):
69+
pytest.importorskip("pandas")
70+
X, y = fetch_openml('iris', version=1, return_X_y=True, as_frame=as_frame)
71+
X_res, y_res = make_imbalance(X, y, sampling_strategy=sampling_strategy)
72+
if as_frame:
73+
assert hasattr(X_res, "loc")
74+
assert Counter(y_res) == expected_counts

imblearn/over_sampling/_random_over_sampler.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from collections import Counter
88

99
import numpy as np
10-
from sklearn.utils import check_X_y
10+
from sklearn.utils import check_array
1111
from sklearn.utils import check_random_state
1212
from sklearn.utils import _safe_indexing
1313

@@ -74,7 +74,12 @@ def __init__(self, sampling_strategy="auto", random_state=None):
7474
@staticmethod
7575
def _check_X_y(X, y):
7676
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
77-
X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
77+
if not hasattr(X, "loc"):
78+
# Do not convert dataframe
79+
X = check_array(X, accept_sparse=["csr", "csc"], dtype=None)
80+
y = check_array(
81+
y, accept_sparse=["csr", "csc"], dtype=None, ensure_2d=False
82+
)
7883
return X, y, binarize_y
7984

8085
def _fit_resample(self, X, y):

imblearn/over_sampling/tests/test_random_over_sampler.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from collections import Counter
77

88
import numpy as np
9+
import pytest
10+
911
from sklearn.utils._testing import assert_allclose
1012
from sklearn.utils._testing import assert_array_equal
1113

@@ -37,9 +39,15 @@ def test_ros_init():
3739
assert ros.random_state == RND_SEED
3840

3941

40-
def test_ros_fit_resample():
42+
@pytest.mark.parametrize("as_frame", [True, False], ids=['dataframe', 'array'])
43+
def test_ros_fit_resample(as_frame):
44+
if as_frame:
45+
pd = pytest.importorskip("pandas")
46+
X_ = pd.DataFrame(X)
47+
else:
48+
X_ = X
4149
ros = RandomOverSampler(random_state=RND_SEED)
42-
X_resampled, y_resampled = ros.fit_resample(X, Y)
50+
X_resampled, y_resampled = ros.fit_resample(X_, Y)
4351
X_gt = np.array(
4452
[
4553
[0.04352327, -0.20515826],
@@ -59,6 +67,11 @@ def test_ros_fit_resample():
5967
]
6068
)
6169
y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
70+
71+
if as_frame:
72+
assert hasattr(X_resampled, "loc")
73+
X_resampled = X_resampled.to_numpy()
74+
6275
assert_allclose(X_resampled, X_gt)
6376
assert_array_equal(y_resampled, y_gt)
6477

0 commit comments

Comments
 (0)