FIX Prevent incorrect class category resampling in SMOTENC when median_std_ is 0 (#675)

bganglia · glemaitre · web-flow · commit 3c6d2323763d · 2020-06-09T11:34:37.000+02:00
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst
@@ -42,6 +42,10 @@ Bug fixes
   are given in :class:`imblearn.over_sampling.SMOTENC`.
   :pr:`720` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- Fix incorrect category resampling when the median of the standard deviation
+  is zero in :class:`imblearn.over_sampling.SMOTENC`.
+  :pr:`675` by :user:`bganglia <bganglia>`.
+
 Enhancements
 ............
 
diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py
@@ -54,7 +54,6 @@ def _validate_estimator(self):
         self.nn_k_ = check_neighbors_object(
             "k_neighbors", self.k_neighbors, additional_neighbor=1
         )
-        self.nn_k_.set_params(**{"n_jobs": self.n_jobs})
 
     def _make_samples(
         self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0
@@ -956,6 +955,7 @@ def _fit_resample(self, X, y):
         self.ohe_ = OneHotEncoder(
             sparse=True, handle_unknown="ignore", dtype=dtype_ohe
         )
+
         # the input of the OneHotEncoder needs to be dense
         X_ohe = self.ohe_.fit_transform(
             X_categorical.toarray()
@@ -967,6 +967,15 @@ def _fit_resample(self, X, y):
         # median of the standard deviation. It will ensure that whenever
         # distance is computed between 2 samples, the difference will be equal
         # to the median of the standard deviation as in the original paper.
+
+        # In the edge case where the median of the std is equal to 0, the
+        # entries equal to 1 will also be nullified. In this case, we store
+        # the original categorical encoding, later used to invert the OHE.
+        if math.isclose(self.median_std_, 0):
+            self._X_categorical_minority_encoded = _safe_indexing(
+                X_ohe.toarray(), np.flatnonzero(y == class_minority)
+            )
+
         X_ohe.data = (
             np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2
         )
@@ -1027,6 +1036,14 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
 
         # convert to dense array since scipy.sparse doesn't handle 3D
         nn_data = (nn_data.toarray() if sparse.issparse(nn_data) else nn_data)
+
+        # In the case where the median std is equal to zero, we have to
+        # restore the non-null entries from the stored OHE encoding.
+        if math.isclose(self.median_std_, 0):
+            nn_data[:, self.continuous_features_.size:] = (
+                self._X_categorical_minority_encoded
+            )
+
         all_neighbors = nn_data[nn_num[rows]]
 
         categories_size = [self.continuous_features_.size] + [
diff --git a/imblearn/over_sampling/tests/test_smote_nc.py b/imblearn/over_sampling/tests/test_smote_nc.py
@@ -218,3 +218,21 @@ def test_smotenc_raising_error_all_categorical(categorical_features):
     err_msg = "SMOTE-NC is not designed to work only with categorical features"
     with pytest.raises(ValueError, match=err_msg):
         smote.fit_resample(X, y)
+
+
+def test_smote_nc_with_null_median_std():
+    # Non-regression test for #662
+    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/662
+    data = np.array([[1, 2, 1, 'A'],
+                     [2, 1, 2, 'A'],
+                     [1, 2, 3, 'B'],
+                     [1, 2, 4, 'C'],
+                     [1, 2, 5, 'C']], dtype="object")
+    labels = np.array(
+        ['class_1', 'class_1', 'class_1', 'class_2', 'class_2'], dtype=object
+    )
+    smote = SMOTENC(categorical_features=[3], k_neighbors=1, random_state=0)
+    X_res, y_res = smote.fit_resample(data, labels)
+    # check that the categorical feature is not random but corresponds to
+    # the categories seen in the minority class samples
+    assert X_res[-1, -1] == "C"