|
11 | 11 |
|
12 | 12 | import numpy as np
|
13 | 13 | from scipy import sparse
|
| 14 | +from scipy import stats |
14 | 15 |
|
15 | 16 | from sklearn.base import clone
|
16 | 17 | from sklearn.cluster import MiniBatchKMeans
|
17 | 18 | from sklearn.metrics import pairwise_distances
|
18 |
| -from sklearn.preprocessing import OneHotEncoder |
| 19 | +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder |
19 | 20 | from sklearn.svm import SVC
|
20 | 21 | from sklearn.utils import check_random_state
|
21 | 22 | from sklearn.utils import _safe_indexing
|
|
25 | 26 |
|
26 | 27 | from .base import BaseOverSampler
|
27 | 28 | from ..exceptions import raise_isinstance_error
|
| 29 | +from ..metrics.pairwise import ValueDifferenceMetric |
28 | 30 | from ..utils import check_neighbors_object
|
29 | 31 | from ..utils import check_target_type
|
30 | 32 | from ..utils import Substitution
|
@@ -448,6 +450,9 @@ class SVMSMOTE(BaseSMOTE):
|
448 | 450 |
|
449 | 451 | SMOTENC : Over-sample using SMOTE for continuous and categorical features.
|
450 | 452 |
|
| 453 | + SMOTEN : Over-sample using the SMOTE variant specifically for categorical |
| 454 | + features only. |
| 455 | +
|
451 | 456 | BorderlineSMOTE : Over-sample using Borderline-SMOTE.
|
452 | 457 |
|
453 | 458 | ADASYN : Over-sample using ADASYN.
|
@@ -643,6 +648,9 @@ class SMOTE(BaseSMOTE):
|
643 | 648 | --------
|
644 | 649 | SMOTENC : Over-sample using SMOTE for continuous and categorical features.
|
645 | 650 |
|
| 651 | + SMOTEN : Over-sample using the SMOTE variant specifically for categorical |
| 652 | + features only. |
| 653 | +
|
646 | 654 | BorderlineSMOTE : Over-sample using the borderline-SMOTE variant.
|
647 | 655 |
|
648 | 656 | SVMSMOTE : Over-sample using the SVM-SMOTE variant.
|
@@ -766,6 +774,9 @@ class SMOTENC(SMOTE):
|
766 | 774 | --------
|
767 | 775 | SMOTE : Over-sample using SMOTE.
|
768 | 776 |
|
| 777 | + SMOTEN : Over-sample using the SMOTE variant specifically for categorical |
| 778 | + features only. |
| 779 | +
|
769 | 780 | SVMSMOTE : Over-sample using SVM-SMOTE variant.
|
770 | 781 |
|
771 | 782 | BorderlineSMOTE : Over-sample using Borderline-SMOTE variant.
|
@@ -1055,6 +1066,11 @@ class KMeansSMOTE(BaseSMOTE):
|
1055 | 1066 | --------
|
1056 | 1067 | SMOTE : Over-sample using SMOTE.
|
1057 | 1068 |
|
| 1069 | + SMOTENC : Over-sample using SMOTE for continuous and categorical features. |
| 1070 | +
|
| 1071 | + SMOTEN : Over-sample using the SMOTE variant specifically for categorical |
| 1072 | + features only. |
| 1073 | +
|
1058 | 1074 | SVMSMOTE : Over-sample using SVM-SMOTE variant.
|
1059 | 1075 |
|
1060 | 1076 | BorderlineSMOTE : Over-sample using Borderline-SMOTE variant.
|
@@ -1248,3 +1264,145 @@ def _fit_resample(self, X, y):
|
1248 | 1264 | y_resampled = np.hstack((y_resampled, y_new))
|
1249 | 1265 |
|
1250 | 1266 | return X_resampled, y_resampled
|
| 1267 | + |
| 1268 | + |
@Substitution(
    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
    n_jobs=_n_jobs_docstring,
    random_state=_random_state_docstring,
)
class SMOTEN(SMOTE):
    """Perform SMOTE over-sampling for nominal categorical features only.

    This method is referred to as SMOTEN in [1]_.

    Read more in the :ref:`User Guide <smote_adasyn>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    k_neighbors : int or object, default=5
        If ``int``, number of nearest neighbours used to construct synthetic
        samples. If object, an estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors.

    {n_jobs}

    See Also
    --------
    SMOTE : Over-sample using SMOTE.

    SMOTENC : Over-sample using SMOTE for continuous and categorical features.

    BorderlineSMOTE : Over-sample using the borderline-SMOTE variant.

    SVMSMOTE : Over-sample using the SVM-SMOTE variant.

    ADASYN : Over-sample using ADASYN.

    KMeansSMOTE : Over-sample applying a clustering before to oversample using
        SMOTE.

    Notes
    -----
    See the original paper [1]_ for more details.

    Supports multi-class resampling. A one-vs.-rest scheme is used as
    originally proposed in [1]_.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of artificial
       intelligence research, 321-357, 2002.

    Examples
    --------
    >>> import numpy as np
    >>> X = np.array(["A"] * 10 + ["B"] * 20 + ["C"] * 30, dtype=object).reshape(-1, 1)
    >>> y = np.array([0] * 20 + [1] * 40, dtype=np.int32)
    >>> from collections import Counter
    >>> print(f"Original class counts: {{Counter(y)}}")
    Original class counts: Counter({{1: 40, 0: 20}})
    >>> from imblearn.over_sampling import SMOTEN
    >>> sampler = SMOTEN(random_state=0)
    >>> X_res, y_res = sampler.fit_resample(X, y)
    >>> print(f"Class counts after resampling {{Counter(y_res)}}")
    Class counts after resampling Counter({{0: 40, 1: 40}})
    """

    def _check_X_y(self, X, y):
        """Check should accept strings and not sparse matrices."""
        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
        # dtype=None keeps string/object categories; sparse input is refused
        # because the VDM-based pipeline works on dense categorical arrays
        X, y = self._validate_data(
            X,
            y,
            reset=True,
            dtype=None,
            accept_sparse=False,
        )
        return X, y, binarize_y

    def _validate_estimator(self):
        """Force to use precomputed distance matrix."""
        super()._validate_estimator()
        # distances are computed once via ValueDifferenceMetric.pairwise,
        # so the NN estimator must consume a precomputed matrix
        self.nn_k_.set_params(metric="precomputed")

    def _make_samples(self, X_class, klass, y_dtype, nn_indices, n_samples):
        """Generate ``n_samples`` synthetic samples for class ``klass``.

        Each synthetic sample is built feature-by-feature by taking the most
        common category within the k-neighborhood of a randomly drawn seed
        sample of the class.
        """
        random_state = check_random_state(self.random_state)
        # generate sample indices that will be used to generate new samples
        samples_indices = random_state.choice(
            np.arange(X_class.shape[0]), size=n_samples, replace=True
        )
        # for each drawn sample, select its k-neighbors and generate a sample
        # where, for each feature individually, the generated category is the
        # most common category among the neighbors
        # NOTE(review): assumes stats.mode(..., axis=1).mode has a singleton
        # middle axis (scipy keepdims-style output) — confirm against the
        # scipy version pinned by the project
        X_new = np.squeeze(
            stats.mode(X_class[nn_indices[samples_indices]], axis=1).mode, axis=1
        )
        y_new = np.full(n_samples, fill_value=klass, dtype=y_dtype)
        return X_new, y_new

    def _fit_resample(self, X, y):
        self._validate_estimator()

        # start from copies of the original data; synthetic batches are
        # appended per class and stacked at the end
        X_resampled = [X.copy()]
        y_resampled = [y.copy()]

        # ordinal encoding is required by ValueDifferenceMetric
        encoder = OrdinalEncoder(dtype=np.int32)
        X_encoded = encoder.fit_transform(X)

        vdm = ValueDifferenceMetric(
            n_categories=[len(cat) for cat in encoder.categories_]
        ).fit(X_encoded, y)

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X_encoded, target_class_indices)

            X_class_dist = vdm.pairwise(X_class)
            self.nn_k_.fit(X_class_dist)
            # the kneighbors search will include the sample itself which is
            # expected from the original algorithm
            nn_indices = self.nn_k_.kneighbors(X_class_dist, return_distance=False)
            X_new, y_new = self._make_samples(
                X_class, class_sample, y.dtype, nn_indices, n_samples
            )

            # map the ordinal codes back to the original categories
            X_new = encoder.inverse_transform(X_new)
            X_resampled.append(X_new)
            y_resampled.append(y_new)

        X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, y_resampled

    def _more_tags(self):
        return {"X_types": ["2darray", "dataframe", "string"]}
0 commit comments