Skip to content

Commit a8e44ae

Browse files
authored
FIX compute the median of std dev for each class to over-sample in SMOTENC (#1015)
1 parent 2f6b1f6 commit a8e44ae

File tree

4 files changed

+87
-46
lines changed

4 files changed

+87
-46
lines changed

doc/over_sampling.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,9 @@ or relying on `dtype` inference if the columns are using the
203203
>>> print(sorted(Counter(y_resampled).items()))
204204
[(0, 30), (1, 30)]
205205
>>> print(X_resampled[-5:])
206-
[['A' 0.52... 2]
206+
[['A' 0.19... 2]
207207
['B' -0.36... 2]
208-
['B' 0.93... 2]
208+
['B' 0.87... 2]
209209
['B' 0.37... 2]
210210
['B' 0.33... 2]]
211211

doc/whats_new/v0.11.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,15 @@ Bug fixes
1414
they are plugged into an Euclidean distance computation.
1515
:pr:`1014` by :user:`Guillaume Lemaitre <glemaitre>`.
1616

17+
- Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` where the median of the standard
18+
deviation of the continuous features was only computed on the minority class. Now,
19+
we are computing this statistic for each class that is over-sampled.
20+
:pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`.
21+
22+
- Fix a bug in :class:`~imblearn.over_sampling.SMOTENC` such that the case where
23+
the median of the standard deviation of the continuous features is zero is handled
24+
in the multiclass case as well.
25+
:pr:`1015` by :user:`Guillaume Lemaitre <glemaitre>`.
1726

1827
Version 0.11.0
1928
==============

imblearn/over_sampling/_smote/base.py

Lines changed: 55 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import math
1010
import numbers
1111
import warnings
12-
from collections import Counter
1312

1413
import numpy as np
1514
from scipy import sparse
@@ -23,7 +22,6 @@
2322
check_random_state,
2423
)
2524
from sklearn.utils.sparsefuncs_fast import (
26-
csc_mean_variance_axis0,
2725
csr_mean_variance_axis0,
2826
)
2927
from sklearn.utils.validation import _num_features
@@ -116,11 +114,11 @@ def _make_samples(
116114
rows = np.floor_divide(samples_indices, nn_num.shape[1])
117115
cols = np.mod(samples_indices, nn_num.shape[1])
118116

119-
X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps)
117+
X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type)
120118
y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
121119
return X_new, y_new
122120

123-
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
121+
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None):
124122
r"""Generate a synthetic sample.
125123
126124
The rule for the generation is:
@@ -155,6 +153,9 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
155153
steps : ndarray of shape (n_samples,), dtype=float
156154
Step sizes for new samples.
157155
156+
y_type : None
157+
Unused parameter. Only for compatibility reason with SMOTE-NC.
158+
158159
Returns
159160
-------
160161
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
@@ -465,8 +466,9 @@ class SMOTENC(SMOTE):
465466
continuous_features_ : ndarray of shape (n_cont_features,), dtype=np.int64
466467
Indices of the continuous features.
467468
468-
median_std_ : float
469-
Median of the standard deviation of the continuous features.
469+
median_std_ : dict of int -> float
470+
Median of the standard deviation of the continuous features for each
471+
class to be over-sampled.
470472
471473
n_features_ : int
472474
Number of features observed at `fit`.
@@ -627,23 +629,8 @@ def _fit_resample(self, X, y):
627629
self._validate_column_types(X)
628630
self._validate_estimator()
629631

630-
# compute the median of the standard deviation of the minority class
631-
target_stats = Counter(y)
632-
class_minority = min(target_stats, key=target_stats.get)
633-
634632
X_continuous = _safe_indexing(X, self.continuous_features_, axis=1)
635633
X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
636-
X_minority = _safe_indexing(X_continuous, np.flatnonzero(y == class_minority))
637-
638-
if sparse.issparse(X):
639-
if X.format == "csr":
640-
_, var = csr_mean_variance_axis0(X_minority)
641-
else:
642-
_, var = csc_mean_variance_axis0(X_minority)
643-
else:
644-
var = X_minority.var(axis=0)
645-
self.median_std_ = np.median(np.sqrt(var))
646-
647634
X_categorical = _safe_indexing(X, self.categorical_features_, axis=1)
648635
if X_continuous.dtype.name != "object":
649636
dtype_ohe = X_continuous.dtype
@@ -664,28 +651,54 @@ def _fit_resample(self, X, y):
664651
if not sparse.issparse(X_ohe):
665652
X_ohe = sparse.csr_matrix(X_ohe, dtype=dtype_ohe)
666653

667-
# we can replace the 1 entries of the categorical features with the
668-
# median of the standard deviation. It will ensure that whenever
669-
# distance is computed between 2 samples, the difference will be equal
670-
# to the median of the standard deviation as in the original paper.
671-
672-
# In the edge case where the median of the std is equal to 0, the 1s
673-
# entries will be also nullified. In this case, we store the original
674-
# categorical encoding which will be later used for inverting the OHE
675-
if math.isclose(self.median_std_, 0):
676-
self._X_categorical_minority_encoded = _safe_indexing(
677-
X_ohe.toarray(), np.flatnonzero(y == class_minority)
654+
X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr", dtype=dtype_ohe)
655+
X_resampled = [X_encoded.copy()]
656+
y_resampled = [y.copy()]
657+
658+
# SMOTE resampling starts here
659+
self.median_std_ = {}
660+
for class_sample, n_samples in self.sampling_strategy_.items():
661+
if n_samples == 0:
662+
continue
663+
target_class_indices = np.flatnonzero(y == class_sample)
664+
X_class = _safe_indexing(X_encoded, target_class_indices)
665+
666+
_, var = csr_mean_variance_axis0(
667+
X_class[:, : self.continuous_features_.size]
678668
)
669+
self.median_std_[class_sample] = np.median(np.sqrt(var))
670+
671+
# In the edge case where the median of the std is equal to 0, the 1s
672+
# entries will be also nullified. In this case, we store the original
673+
# categorical encoding which will be later used for inverting the OHE
674+
if math.isclose(self.median_std_[class_sample], 0):
675+
# This variable will be used when generating data
676+
self._X_categorical_minority_encoded = X_class[
677+
:, self.continuous_features_.size :
678+
].toarray()
679+
680+
# we can replace the 1 entries of the categorical features with the
681+
# median of the standard deviation. It will ensure that whenever
682+
# distance is computed between 2 samples, the difference will be equal
683+
# to the median of the standard deviation as in the original paper.
684+
X_class_categorical = X_class[:, self.continuous_features_.size :]
685+
# With one-hot encoding, the median will be repeated twice. We need
686+
# to divide by sqrt(2) such that we only have one median value
687+
# contributing to the Euclidean distance
688+
X_class_categorical.data[:] = self.median_std_[class_sample] / np.sqrt(2)
689+
X_class[:, self.continuous_features_.size :] = X_class_categorical
679690

680-
# With one-hot encoding, the median will be repeated twice. We need to divide
681-
# by sqrt(2) such that we only have one median value contributing to the
682-
# Euclidean distance
683-
X_ohe.data = (
684-
np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / np.sqrt(2)
685-
)
686-
X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")
691+
self.nn_k_.fit(X_class)
692+
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
693+
X_new, y_new = self._make_samples(
694+
X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0
695+
)
696+
X_resampled.append(X_new)
697+
y_resampled.append(y_new)
687698

688-
X_resampled, y_resampled = super()._fit_resample(X_encoded, y)
699+
X_resampled = sparse.vstack(X_resampled, format=X_encoded.format)
700+
y_resampled = np.hstack(y_resampled)
701+
# SMOTE resampling ends here
689702

690703
# reverse the encoding of the categorical features
691704
X_res_cat = X_resampled[:, self.continuous_features_.size :]
@@ -723,7 +736,7 @@ def _fit_resample(self, X, y):
723736

724737
return X_resampled, y_resampled
725738

726-
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
739+
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type):
727740
"""Generate a synthetic sample with an additional steps for the
728741
categorical features.
729742
@@ -741,7 +754,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
741754

742755
# In the case that the median std was equal to zeros, we have to
743756
# create non-null entry based on the encoded of OHE
744-
if math.isclose(self.median_std_, 0):
757+
if math.isclose(self.median_std_[y_type], 0):
745758
nn_data[
746759
:, self.continuous_features_.size :
747760
] = self._X_categorical_minority_encoded

imblearn/over_sampling/_smote/tests/test_smote_nc.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ def test_smotenc(data):
130130
assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx])
131131
assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype
132132

133+
assert isinstance(smote.median_std_, dict)
134+
133135

134136
# part of the common test which apply to SMOTE-NC even if it is not default
135137
# constructible
@@ -193,6 +195,7 @@ def test_smotenc_pandas():
193195
X_res, y_res = smote.fit_resample(X, y)
194196
assert_array_equal(X_res_pd.to_numpy(), X_res)
195197
assert_allclose(y_res_pd, y_res)
198+
assert set(smote.median_std_.keys()) == {0, 1}
196199

197200

198201
def test_smotenc_preserve_dtype():
@@ -234,20 +237,36 @@ def test_smote_nc_with_null_median_std():
234237
[
235238
[1, 2, 1, "A"],
236239
[2, 1, 2, "A"],
240+
[2, 1, 2, "A"],
237241
[1, 2, 3, "B"],
238242
[1, 2, 4, "C"],
239243
[1, 2, 5, "C"],
244+
[1, 2, 4, "C"],
245+
[1, 2, 4, "C"],
246+
[1, 2, 4, "C"],
240247
],
241248
dtype="object",
242249
)
243250
labels = np.array(
244-
["class_1", "class_1", "class_1", "class_2", "class_2"], dtype=object
251+
[
252+
"class_1",
253+
"class_1",
254+
"class_1",
255+
"class_1",
256+
"class_2",
257+
"class_2",
258+
"class_3",
259+
"class_3",
260+
"class_3",
261+
],
262+
dtype=object,
245263
)
246264
smote = SMOTENC(categorical_features=[3], k_neighbors=1, random_state=0)
247265
X_res, y_res = smote.fit_resample(data, labels)
248266
# check that the categorical feature is not random but correspond to the
249267
# categories seen in the minority class samples
250-
assert X_res[-1, -1] == "C"
268+
assert_array_equal(X_res[-3:, -1], np.array(["C", "C", "C"], dtype=object))
269+
assert smote.median_std_ == {"class_2": 0.0, "class_3": 0.0}
251270

252271

253272
def test_smotenc_categorical_encoder():

0 commit comments

Comments
 (0)