FIX multiply by random number < 0.5 for BorderlineSMOTE-2 (#1027)

glemaitre · solegalli · web-flow · commit ec272598d64b · 2023-07-11T15:09:23.000+02:00
Co-authored-by: Soledad Galli <solegalli@protonmail.com>
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
@@ -68,7 +68,7 @@ def _validate_estimator(self):
         )
 
     def _make_samples(
-        self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0
+        self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None
     ):
         """A support function that returns artificial samples constructed along
         the line connecting nearest neighbours.
@@ -98,6 +98,10 @@ def _make_samples(
         step_size : float, default=1.0
             The step size to create samples.
 
+        y : ndarray of shape (n_samples_all,), default=None
+            The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
+            weight the distances in the sample generation process.
+
         Returns
         -------
         X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features)
@@ -114,11 +118,13 @@ def _make_samples(
         rows = np.floor_divide(samples_indices, nn_num.shape[1])
         cols = np.mod(samples_indices, nn_num.shape[1])
 
-        X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type)
+        X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y)
         y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
         return X_new, y_new
 
-    def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
+    def _generate_samples(
+        self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None
+    ):
         r"""Generate a synthetic sample.
 
         The rule for the generation is:
@@ -153,15 +159,26 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
         steps : ndarray of shape (n_samples,), dtype=float
             Step sizes for new samples.
 
-        y_type : None
-            Unused parameter. Only for compatibility reason with SMOTE-NC.
+        y_type : str, int or None, default=None
+            Class label of the current target classes for which we want to generate
+            samples.
+
+        y : ndarray of shape (n_samples_all,), default=None
+            The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
+            weight the distances in the sample generation process.
 
         Returns
         -------
         X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
             Synthetically generated samples.
         """
         diffs = nn_data[nn_num[rows, cols]] - X[rows]
+        if y is not None:  # only entering for BorderlineSMOTE-2
+            random_state = check_random_state(self.random_state)
+            mask_pair_samples = y[nn_num[rows, cols]] != y_type
+            diffs[mask_pair_samples] *= random_state.uniform(
+                low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1)
+            )
 
         if sparse.issparse(X):
             sparse_func = type(X).__name__
@@ -736,7 +753,7 @@ def _fit_resample(self, X, y):
 
         return X_resampled, y_resampled
 
-    def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type):
+    def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type, y=None):
         """Generate a synthetic sample with an additional steps for the
         categorical features.
 
diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py
@@ -224,8 +224,10 @@ def _fit_resample(self, X, y):
 
             if self.kind == "borderline-1":
                 X_to_sample_from = X_class  # consider the positive class only
+                y_to_check_neighbors = None
             else:  # self.kind == "borderline-2"
                 X_to_sample_from = X  # consider the whole dataset
+                y_to_check_neighbors = y
 
             self.nn_k_.fit(X_to_sample_from)
             nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:]
@@ -236,6 +238,7 @@ def _fit_resample(self, X, y):
                 X_to_sample_from,
                 nns,
                 n_samples,
+                y=y_to_check_neighbors,
             )
             if sparse.issparse(X_new):
                 X_resampled = sparse.vstack([X_resampled, X_new])