Skip to content

Commit ec27259

Browse files
glemaitre and solegalli authored
FIX multiply by random number < 0.5 for BorderlineSMOTE-2 (#1027)
Co-authored-by: Soledad Galli <[email protected]>
1 parent d597b05 commit ec27259

File tree

2 files changed: +26 −6 lines changed

imblearn/over_sampling/_smote/base.py

Lines changed: 23 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ def _validate_estimator(self):
6868
)
6969

7070
def _make_samples(
71-
self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0
71+
self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None
7272
):
7373
"""A support function that returns artificial samples constructed along
7474
the line connecting nearest neighbours.
@@ -98,6 +98,10 @@ def _make_samples(
9898
step_size : float, default=1.0
9999
The step size to create samples.
100100
101+
y : ndarray of shape (n_samples_all,), default=None
102+
The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
103+
weight the distances in the sample generation process.
104+
101105
Returns
102106
-------
103107
X_new : {ndarray, sparse matrix} of shape (n_samples_new, n_features)
@@ -114,11 +118,13 @@ def _make_samples(
114118
rows = np.floor_divide(samples_indices, nn_num.shape[1])
115119
cols = np.mod(samples_indices, nn_num.shape[1])
116120

117-
X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type)
121+
X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y)
118122
y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
119123
return X_new, y_new
120124

121-
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
125+
def _generate_samples(
126+
self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None
127+
):
122128
r"""Generate a synthetic sample.
123129
124130
The rule for the generation is:
@@ -153,15 +159,26 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
153159
steps : ndarray of shape (n_samples,), dtype=float
154160
Step sizes for new samples.
155161
156-
y_type : None
157-
Unused parameter. Only for compatibility reason with SMOTE-NC.
162+
y_type : str, int or None, default=None
163+
Class label of the current target classes for which we want to generate
164+
samples.
165+
166+
y : ndarray of shape (n_samples_all,), default=None
167+
The true target associated with `nn_data`. Used by Borderline SMOTE-2 to
168+
weight the distances in the sample generation process.
158169
159170
Returns
160171
-------
161172
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
162173
Synthetically generated samples.
163174
"""
164175
diffs = nn_data[nn_num[rows, cols]] - X[rows]
176+
if y is not None: # only entering for BorderlineSMOTE-2
177+
random_state = check_random_state(self.random_state)
178+
mask_pair_samples = y[nn_num[rows, cols]] != y_type
179+
diffs[mask_pair_samples] *= random_state.uniform(
180+
low=0.0, high=0.5, size=(mask_pair_samples.sum(), 1)
181+
)
165182

166183
if sparse.issparse(X):
167184
sparse_func = type(X).__name__
@@ -736,7 +753,7 @@ def _fit_resample(self, X, y):
736753

737754
return X_resampled, y_resampled
738755

739-
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type):
756+
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type, y=None):
740757
"""Generate a synthetic sample with an additional steps for the
741758
categorical features.
742759

imblearn/over_sampling/_smote/filter.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -224,8 +224,10 @@ def _fit_resample(self, X, y):
224224

225225
if self.kind == "borderline-1":
226226
X_to_sample_from = X_class # consider the positive class only
227+
y_to_check_neighbors = None
227228
else: # self.kind == "borderline-2"
228229
X_to_sample_from = X # consider the whole dataset
230+
y_to_check_neighbors = y
229231

230232
self.nn_k_.fit(X_to_sample_from)
231233
nns = self.nn_k_.kneighbors(X_danger, return_distance=False)[:, 1:]
@@ -236,6 +238,7 @@ def _fit_resample(self, X, y):
236238
X_to_sample_from,
237239
nns,
238240
n_samples,
241+
y=y_to_check_neighbors,
239242
)
240243
if sparse.issparse(X_new):
241244
X_resampled = sparse.vstack([X_resampled, X_new])

0 commit comments

Comments
 (0)