Skip to content

Commit a8a8adb

Browse files
authored
FIX raise proper error message when only categorical passed to SMOTE-NC (#720)
1 parent 91b99ce commit a8a8adb

File tree

4 files changed

+31
-2
lines changed

4 files changed

+31
-2
lines changed

doc/over_sampling.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ something specific for the categorical features. In fact, the categories of a
230230
new generated sample are decided by picking the most frequent category of the
231231
nearest neighbors present during the generation.
232232

233+
.. warning::
234+
Be aware that SMOTE-NC is not designed to work with only categorical data.
235+
233236
The other SMOTE variants and ADASYN differ from each other by selecting the
234237
samples :math:`x_i` ahead of generating the new samples.
235238

doc/whats_new/v0.7.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ Bug fixes
3838
unusable.
3939
:pr:`710` by :user:`Guillaume Lemaitre <glemaitre>`.
4040

41+
- Raise a proper error message when only numerical or categorical features
42+
are given in :class:`imblearn.over_sampling.SMOTENC`.
43+
:pr:`720` by :user:`Guillaume Lemaitre <glemaitre>`.
44+
4145
Enhancements
4246
............
4347

imblearn/over_sampling/_smote.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
from sklearn.utils import check_random_state
2121
from sklearn.utils import _safe_indexing
2222
from sklearn.utils import check_array
23-
from sklearn.utils import check_X_y
2423
from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0
2524
from sklearn.utils.sparsefuncs_fast import csc_mean_variance_axis0
2625

@@ -747,6 +746,7 @@ class SMOTENC(SMOTE):
747746
"""Synthetic Minority Over-sampling Technique for Nominal and Continuous.
748747
749748
Unlike :class:`SMOTE`, SMOTE-NC is designed for datasets containing continuous and
749+
categorical features. However, it is not designed to work with only
750750
categorical features.
751751
752752
Read more in the :ref:`User Guide <smote_adasyn>`.
@@ -893,7 +893,9 @@ def _check_X_y(self, X, y):
893893
features.
894894
"""
895895
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
896-
X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
896+
X, y = self._validate_data(
897+
X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"]
898+
)
897899
return X, y, binarize_y
898900

899901
def _validate_estimator(self):
@@ -917,6 +919,12 @@ def _validate_estimator(self):
917919
np.arange(self.n_features_), self.categorical_features_
918920
)
919921

922+
if self.categorical_features_.size == self.n_features_in_:
923+
raise ValueError(
924+
"SMOTE-NC is not designed to work only with categorical "
925+
"features. It requires some numerical features."
926+
)
927+
920928
def _fit_resample(self, X, y):
921929
self.n_features_ = X.shape[1]
922930
self._validate_estimator()

imblearn/over_sampling/tests/test_smote_nc.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,17 @@ def test_smotenc_preserve_dtype():
204204
X_res, y_res = smote.fit_resample(X, y)
205205
assert X.dtype == X_res.dtype, "X dtype is not preserved"
206206
assert y.dtype == y_res.dtype, "y dtype is not preserved"
207+
208+
209+
@pytest.mark.parametrize(
210+
"categorical_features", [[True, True, True], [0, 1, 2]]
211+
)
212+
def test_smotenc_raising_error_all_categorical(categorical_features):
213+
X, y = make_classification(
214+
n_features=3, n_informative=1, n_redundant=1, n_repeated=0,
215+
n_clusters_per_class=1,
216+
)
217+
smote = SMOTENC(categorical_features=categorical_features)
218+
err_msg = "SMOTE-NC is not designed to work only with categorical features"
219+
with pytest.raises(ValueError, match=err_msg):
220+
smote.fit_resample(X, y)

0 commit comments

Comments
 (0)