Skip to content

Commit dcfa5f3

Browse files
authored
FIX handle full NaT columns properly in Random*Sampler (#1059)
1 parent 9e976a4 commit dcfa5f3

File tree

4 files changed

+69
-1
lines changed

4 files changed

+69
-1
lines changed

doc/whats_new/v0.12.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ Bug fixes
1818
the number of samples in the minority class.
1919
:pr:`1012` by :user:`Guillaume Lemaitre <glemaitre>`.
2020

21+
- Fix a bug in :class:`~imblearn.under_sampling.RandomUnderSampler` and
22+
:class:`~imblearn.over_sampling.RandomOverSampler` where a column containing only
23+
NaT was not handled correctly.
24+
:pr:`1059` by :user:`Guillaume Lemaitre <glemaitre>`.
25+
2126
Compatibility
2227
.............
2328

imblearn/over_sampling/tests/test_random_over_sampler.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,3 +287,26 @@ def test_random_over_sampling_datetime():
287287
pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
288288
pd.testing.assert_index_equal(X_res.index, y_res.index)
289289
assert_array_equal(y_res.to_numpy(), np.array([0, 0, 0, 1, 1, 1]))
290+
291+
292+
def test_random_over_sampler_full_nat():
293+
"""Check that we can return timedelta columns full of NaT.
294+
295+
Non-regression test for:
296+
https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1055
297+
"""
298+
pd = pytest.importorskip("pandas")
299+
300+
X = pd.DataFrame(
301+
{
302+
"col_str": ["abc", "def", "xyz"],
303+
"col_timedelta": pd.to_timedelta([np.nan, np.nan, np.nan]),
304+
}
305+
)
306+
y = np.array([0, 0, 1])
307+
308+
X_res, y_res = RandomOverSampler().fit_resample(X, y)
309+
assert X_res.shape == (4, 2)
310+
assert y_res.shape == (4,)
311+
312+
assert X_res["col_timedelta"].dtype == "timedelta64[ns]"

imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,26 @@ def test_random_under_sampling_datetime():
162162
pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
163163
pd.testing.assert_index_equal(X_res.index, y_res.index)
164164
assert_array_equal(y_res.to_numpy(), np.array([0, 1]))
165+
166+
167+
def test_random_under_sampler_full_nat():
168+
"""Check that we can return timedelta columns full of NaT.
169+
170+
Non-regression test for:
171+
https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1055
172+
"""
173+
pd = pytest.importorskip("pandas")
174+
175+
X = pd.DataFrame(
176+
{
177+
"col_str": ["abc", "def", "xyz"],
178+
"col_timedelta": pd.to_timedelta([np.nan, np.nan, np.nan]),
179+
}
180+
)
181+
y = np.array([0, 0, 1])
182+
183+
X_res, y_res = RandomUnderSampler().fit_resample(X, y)
184+
assert X_res.shape == (2, 2)
185+
assert y_res.shape == (2,)
186+
187+
assert X_res["col_timedelta"].dtype == "timedelta64[ns]"

imblearn/utils/_validation.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,24 @@ def _transfrom_one(self, array, props):
6666
ret = pd.DataFrame.sparse.from_spmatrix(array, columns=props["columns"])
6767
else:
6868
ret = pd.DataFrame(array, columns=props["columns"])
69-
ret = ret.astype(props["dtypes"])
69+
70+
try:
71+
ret = ret.astype(props["dtypes"])
72+
except TypeError:
73+
# We special case the following error:
74+
# https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1055
75+
# There is no easy way to have a generic workaround. Here, we detect
76+
# that we have an all-null column stored as datetime64 where timedelta64 is expected
77+
# (resulting from the np.vstack of the resampling).
78+
for col in ret.columns:
79+
if (
80+
ret[col].isnull().all()
81+
and ret[col].dtype == "datetime64[ns]"
82+
and props["dtypes"][col] == "timedelta64[ns]"
83+
):
84+
ret[col] = pd.to_timedelta(["NaT"] * len(ret[col]))
85+
# try again
86+
ret = ret.astype(props["dtypes"])
7087
elif type_ == "series":
7188
import pandas as pd
7289

0 commit comments

Comments
 (0)