9
9
import math
10
10
import numbers
11
11
import warnings
12
- from collections import Counter
13
12
14
13
import numpy as np
15
14
from scipy import sparse
23
22
check_random_state ,
24
23
)
25
24
from sklearn .utils .sparsefuncs_fast import (
26
- csc_mean_variance_axis0 ,
27
25
csr_mean_variance_axis0 ,
28
26
)
29
27
from sklearn .utils .validation import _num_features
@@ -116,11 +114,11 @@ def _make_samples(
116
114
rows = np .floor_divide (samples_indices , nn_num .shape [1 ])
117
115
cols = np .mod (samples_indices , nn_num .shape [1 ])
118
116
119
- X_new = self ._generate_samples (X , nn_data , nn_num , rows , cols , steps )
117
+ X_new = self ._generate_samples (X , nn_data , nn_num , rows , cols , steps , y_type )
120
118
y_new = np .full (n_samples , fill_value = y_type , dtype = y_dtype )
121
119
return X_new , y_new
122
120
123
- def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps ):
121
+ def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps , y_type = None ):
124
122
r"""Generate a synthetic sample.
125
123
126
124
The rule for the generation is:
@@ -155,6 +153,9 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
155
153
steps : ndarray of shape (n_samples,), dtype=float
156
154
Step sizes for new samples.
157
155
156
+ y_type : None
157
+ Unused parameter. Only present for compatibility with SMOTE-NC.
158
+
158
159
Returns
159
160
-------
160
161
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
@@ -465,8 +466,9 @@ class SMOTENC(SMOTE):
465
466
continuous_features_ : ndarray of shape (n_cont_features,), dtype=np.int64
466
467
Indices of the continuous features.
467
468
468
- median_std_ : float
469
- Median of the standard deviation of the continuous features.
469
+ median_std_ : dict of int -> float
470
+ Median of the standard deviation of the continuous features for each
471
+ class to be over-sampled.
470
472
471
473
n_features_ : int
472
474
Number of features observed at `fit`.
@@ -627,23 +629,8 @@ def _fit_resample(self, X, y):
627
629
self ._validate_column_types (X )
628
630
self ._validate_estimator ()
629
631
630
- # compute the median of the standard deviation of the minority class
631
- target_stats = Counter (y )
632
- class_minority = min (target_stats , key = target_stats .get )
633
-
634
632
X_continuous = _safe_indexing (X , self .continuous_features_ , axis = 1 )
635
633
X_continuous = check_array (X_continuous , accept_sparse = ["csr" , "csc" ])
636
- X_minority = _safe_indexing (X_continuous , np .flatnonzero (y == class_minority ))
637
-
638
- if sparse .issparse (X ):
639
- if X .format == "csr" :
640
- _ , var = csr_mean_variance_axis0 (X_minority )
641
- else :
642
- _ , var = csc_mean_variance_axis0 (X_minority )
643
- else :
644
- var = X_minority .var (axis = 0 )
645
- self .median_std_ = np .median (np .sqrt (var ))
646
-
647
634
X_categorical = _safe_indexing (X , self .categorical_features_ , axis = 1 )
648
635
if X_continuous .dtype .name != "object" :
649
636
dtype_ohe = X_continuous .dtype
@@ -664,28 +651,54 @@ def _fit_resample(self, X, y):
664
651
if not sparse .issparse (X_ohe ):
665
652
X_ohe = sparse .csr_matrix (X_ohe , dtype = dtype_ohe )
666
653
667
- # we can replace the 1 entries of the categorical features with the
668
- # median of the standard deviation. It will ensure that whenever
669
- # distance is computed between 2 samples, the difference will be equal
670
- # to the median of the standard deviation as in the original paper.
671
-
672
- # In the edge case where the median of the std is equal to 0, the 1s
673
- # entries will be also nullified. In this case, we store the original
674
- # categorical encoding which will be later used for inverting the OHE
675
- if math .isclose (self .median_std_ , 0 ):
676
- self ._X_categorical_minority_encoded = _safe_indexing (
677
- X_ohe .toarray (), np .flatnonzero (y == class_minority )
654
+ X_encoded = sparse .hstack ((X_continuous , X_ohe ), format = "csr" , dtype = dtype_ohe )
655
+ X_resampled = [X_encoded .copy ()]
656
+ y_resampled = [y .copy ()]
657
+
658
+ # SMOTE resampling starts here
659
+ self .median_std_ = {}
660
+ for class_sample , n_samples in self .sampling_strategy_ .items ():
661
+ if n_samples == 0 :
662
+ continue
663
+ target_class_indices = np .flatnonzero (y == class_sample )
664
+ X_class = _safe_indexing (X_encoded , target_class_indices )
665
+
666
+ _ , var = csr_mean_variance_axis0 (
667
+ X_class [:, : self .continuous_features_ .size ]
678
668
)
669
+ self .median_std_ [class_sample ] = np .median (np .sqrt (var ))
670
+
671
+ # In the edge case where the median of the std is equal to 0, the 1s
672
+ # entries will also be nullified. In this case, we store the original
673
+ # categorical encoding which will be later used for inverting the OHE
674
+ if math .isclose (self .median_std_ [class_sample ], 0 ):
675
+ # This variable will be used when generating data
676
+ self ._X_categorical_minority_encoded = X_class [
677
+ :, self .continuous_features_ .size :
678
+ ].toarray ()
679
+
680
+ # we can replace the 1 entries of the categorical features with the
681
+ # median of the standard deviation. It will ensure that whenever
682
+ # distance is computed between 2 samples, the difference will be equal
683
+ # to the median of the standard deviation as in the original paper.
684
+ X_class_categorical = X_class [:, self .continuous_features_ .size :]
685
+ # With one-hot encoding, the median will be repeated twice. We need
686
+ # to divide by sqrt(2) such that we only have one median value
687
+ # contributing to the Euclidean distance
688
+ X_class_categorical .data [:] = self .median_std_ [class_sample ] / np .sqrt (2 )
689
+ X_class [:, self .continuous_features_ .size :] = X_class_categorical
679
690
680
- # With one-hot encoding, the median will be repeated twice. We need to divide
681
- # by sqrt(2) such that we only have one median value contributing to the
682
- # Euclidean distance
683
- X_ohe . data = (
684
- np . ones_like ( X_ohe . data , dtype = X_ohe . dtype ) * self . median_std_ / np . sqrt ( 2 )
685
- )
686
- X_encoded = sparse . hstack (( X_continuous , X_ohe ), format = "csr" )
691
+ self . nn_k_ . fit ( X_class )
692
+ nns = self . nn_k_ . kneighbors ( X_class , return_distance = False )[:, 1 :]
693
+ X_new , y_new = self . _make_samples (
694
+ X_class , y . dtype , class_sample , X_class , nns , n_samples , 1.0
695
+ )
696
+ X_resampled . append ( X_new )
697
+ y_resampled . append ( y_new )
687
698
688
- X_resampled , y_resampled = super ()._fit_resample (X_encoded , y )
699
+ X_resampled = sparse .vstack (X_resampled , format = X_encoded .format )
700
+ y_resampled = np .hstack (y_resampled )
701
+ # SMOTE resampling ends here
689
702
690
703
# reverse the encoding of the categorical features
691
704
X_res_cat = X_resampled [:, self .continuous_features_ .size :]
@@ -723,7 +736,7 @@ def _fit_resample(self, X, y):
723
736
724
737
return X_resampled , y_resampled
725
738
726
- def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps ):
739
+ def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps , y_type ):
727
740
"""Generate a synthetic sample with additional steps for the
728
741
categorical features.
729
742
@@ -741,7 +754,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps):
741
754
742
755
# In the case that the median std was equal to zero, we have to
743
756
# create non-null entries based on the OHE encoding
744
- if math .isclose (self .median_std_ , 0 ):
757
+ if math .isclose (self .median_std_ [ y_type ] , 0 ):
745
758
nn_data [
746
759
:, self .continuous_features_ .size :
747
760
] = self ._X_categorical_minority_encoded
0 commit comments