Skip to content

Commit 70f2e75

Browse files
authored
[MRG] Apply deprecation SMOTE and ADASYN (#183)
* Apply deprecation SMOTE and ADASYN * Update doc pipeline
1 parent ad355b7 commit 70f2e75

File tree

5 files changed

+97
-29
lines changed

5 files changed

+97
-29
lines changed

doc/whats_new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Changelog
1414

1515
Bug fixes
1616
~~~~~~~~~
17+
1718
- Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the right samples during under sampling for the method 3. By `Guillaume Lemaitre`_.
1819
- Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the `random_state` generation. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
1920
- Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`, add additional stopping criterion to avoid that the minority class become a majority class or that a class disappear. By `Guillaume Lemaitre`_.
@@ -53,6 +54,9 @@ API changes summary
5354
- Two base classes :class:`BaseBinaryclassSampler` and :class:`BaseMulticlassSampler` have been created to handle the target type and raise warning in case of abnormality. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
5455
- Move `random_state` to be assigned in the :class:`SamplerMixin` initialization. By `Guillaume Lemaitre`_.
5556
- Provide estimators instead of parameters in :class:`combine.SMOTEENN` and :class:`combine.SMOTETomek`. Therefore, the list of parameters have been deprecated. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
57+
- `k` has been deprecated in :class:`over_sampling.ADASYN`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_.
58+
- `k` and `m` have been deprecated in :class:`over_sampling.SMOTE`. Use `k_neighbors` and `m_neighbors` instead. By `Guillaume Lemaitre`_.
59+
5660

5761
Documentation changes
5862
~~~~~~~~~~~~~~~~~~~~~

imblearn/base.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ def fit(self, X, y):
8383

8484
if hasattr(self, 'size_ngh'):
8585
self._validate_size_ngh_deprecation()
86+
elif hasattr(self, 'k') and not hasattr(self, 'm'):
87+
self._validate_k_deprecation()
88+
elif hasattr(self, 'k') and hasattr(self, 'm'):
89+
self._validate_k_m_deprecation()
8690

8791
self.logger.info('Compute classes statistics ...')
8892

@@ -161,6 +165,10 @@ def sample(self, X, y):
161165

162166
if hasattr(self, 'size_ngh'):
163167
self._validate_size_ngh_deprecation()
168+
elif hasattr(self, 'k') and not hasattr(self, 'm'):
169+
self._validate_k_deprecation()
170+
elif hasattr(self, 'k') and hasattr(self, 'm'):
171+
self._validate_k_m_deprecation()
164172

165173
return self._sample(X, y)
166174

@@ -212,6 +220,25 @@ def _validate_size_ngh_deprecation(self):
212220
' `n_neighbors` instead.', DeprecationWarning)
213221
self.n_neighbors = self.size_ngh
214222

223+
def _validate_k_deprecation(self):
224+
"""Private function to warn about deprecation of k in ADASYN"""
225+
if self.k is not None:
226+
warnings.warn('`k` will be replaced in version 0.4. Use'
227+
' `n_neighbors` instead.', DeprecationWarning)
228+
self.n_neighbors = self.k
229+
230+
def _validate_k_m_deprecation(self):
231+
"""Private function to warn about deprecation of k in ADASYN"""
232+
if self.k is not None:
233+
warnings.warn('`k` will be replaced in version 0.4. Use'
234+
' `k_neighbors` instead.', DeprecationWarning)
235+
self.k_neighbors = self.k
236+
237+
if self.m is not None:
238+
warnings.warn('`m` will be replaced in version 0.4. Use'
239+
' `m_neighbors` instead.', DeprecationWarning)
240+
self.m_neighbors = self.m
241+
215242
@abstractmethod
216243
def _sample(self, X, y):
217244
"""Resample the dataset.

imblearn/over_sampling/adasyn.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,13 @@ class ADASYN(BaseBinarySampler):
3131
If None, the random number generator is the RandomState instance used
3232
by np.random.
3333
34-
k : int, optional (default=5)
34+
k : int, optional (default=None)
35+
Number of nearest neighbours used to construct synthetic samples.
36+
37+
NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4
38+
Use ``n_neighbors`` instead.
39+
40+
n_neighbors : int, optional (default=5)
3541
Number of nearest neighbours used to construct synthetic samples.
3642
3743
n_jobs : int, optional (default=1)
@@ -84,12 +90,15 @@ class ADASYN(BaseBinarySampler):
8490
8591
"""
8692

87-
def __init__(self, ratio='auto', random_state=None, k=5, n_jobs=1):
93+
def __init__(self, ratio='auto', random_state=None, k=None, n_neighbors=5,
94+
n_jobs=1):
8895
super(ADASYN, self).__init__(ratio=ratio, random_state=random_state)
8996
self.k = k
97+
self.n_neighbors = n_neighbors
9098
self.n_jobs = n_jobs
91-
self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
92-
n_jobs=self.n_jobs)
99+
self.nearest_neighbour = NearestNeighbors(
100+
n_neighbors=self.n_neighbors + 1,
101+
n_jobs=self.n_jobs)
93102

94103
def _sample(self, X, y):
95104
"""Resample the dataset.
@@ -130,7 +139,8 @@ def _sample(self, X, y):
130139
X_min = X[y == self.min_c_]
131140

132141
# Print if verbose is true
133-
self.logger.debug('Finding the %s nearest neighbours ...', self.k)
142+
self.logger.debug('Finding the %s nearest neighbours ...',
143+
self.n_neighbors)
134144

135145
# Look for k-th nearest neighbours, excluding, of course, the
136146
# point itself.
@@ -140,7 +150,8 @@ def _sample(self, X, y):
140150
_, ind_nn = self.nearest_neighbour.kneighbors(X_min)
141151

142152
# Compute the ratio of majority samples next to minority samples
143-
ratio_nn = np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) / self.k
153+
ratio_nn = (np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) /
154+
self.n_neighbors)
144155
# Check that we found at least some neighbours belonging to the
145156
# majority class
146157
if not np.sum(ratio_nn):
@@ -158,7 +169,8 @@ def _sample(self, X, y):
158169
for x_i, x_i_nn, num_sample_i in zip(X_min, ind_nn, num_samples_nn):
159170

160171
# Pick-up the neighbors wanted
161-
nn_zs = random_state.randint(1, high=self.k + 1, size=num_sample_i)
172+
nn_zs = random_state.randint(1, high=self.n_neighbors + 1,
173+
size=num_sample_i)
162174

163175
# Create a new sample
164176
for nn_z in nn_zs:

imblearn/over_sampling/smote.py

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,23 @@ class SMOTE(BaseBinarySampler):
3333
If None, the random number generator is the RandomState instance used
3434
by np.random.
3535
36-
k : int, optional (default=5)
36+
k : int, optional (default=None)
3737
Number of nearest neighbours used to construct synthetic samples.
3838
39-
m : int, optional (default=10)
39+
NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4
40+
Use ``k_neighbors`` instead.
41+
42+
k_neighbors : int, optional (default=5)
43+
Number of nearest neighbours used to construct synthetic samples.
44+
45+
m : int, optional (default=None)
46+
Number of nearest neighbours to use to determine if a minority sample
47+
is in danger.
48+
49+
NOTE: `m` is deprecated from 0.2 and will be replaced in 0.4
50+
Use ``m_neighbors`` instead.
51+
52+
m_neighbors : int, optional (default=10)
4053
Number of nearest neighbours to use to determine if a minority sample
4154
is in danger.
4255
@@ -102,12 +115,15 @@ class SMOTE(BaseBinarySampler):
102115
103116
"""
104117

105-
def __init__(self, ratio='auto', random_state=None, k=5, m=10,
106-
out_step=0.5, kind='regular', n_jobs=-1, **kwargs):
118+
def __init__(self, ratio='auto', random_state=None, k=None, k_neighbors=5,
119+
m=None, m_neighbors=10, out_step=0.5, kind='regular',
120+
n_jobs=-1, **kwargs):
107121
super(SMOTE, self).__init__(ratio=ratio, random_state=random_state)
108122
self.kind = kind
109123
self.k = k
124+
self.k_neighbors = k_neighbors
110125
self.m = m
126+
self.m_neighbors = m_neighbors
111127
self.out_step = out_step
112128
self.n_jobs = n_jobs
113129
self.kwargs = kwargs
@@ -149,11 +165,11 @@ def _in_danger_noise(self, samples, y, kind='danger'):
149165

150166
if kind == 'danger':
151167
# Samples are in danger for m/2 <= m' < m
152-
return np.bitwise_and(n_maj >= float(self.m) / 2.,
153-
n_maj < self.m)
168+
return np.bitwise_and(n_maj >= float(self.m_neighbors) / 2.,
169+
n_maj < self.m_neighbors)
154170
elif kind == 'noise':
155171
# Samples are noise for m = m'
156-
return n_maj == self.m
172+
return n_maj == self.m_neighbors
157173
else:
158174
raise NotImplementedError
159175

@@ -281,7 +297,8 @@ def _sample(self, X, y):
281297
# If regular SMOTE is to be performed
282298
if self.kind == 'regular':
283299

284-
self.logger.debug('Finding the %s nearest neighbours ...', self.k)
300+
self.logger.debug('Finding the %s nearest neighbours ...',
301+
self.k_neighbors)
285302

286303
# Look for k-th nearest neighbours, excluding, of course, the
287304
# point itself.
@@ -312,7 +329,8 @@ def _sample(self, X, y):
312329

313330
if self.kind == 'borderline1' or self.kind == 'borderline2':
314331

315-
self.logger.debug('Finding the %s nearest neighbours ...', self.m)
332+
self.logger.debug('Finding the %s nearest neighbours ...',
333+
self.m_neighbors)
316334

317335
# Find the NNs for all samples in the data set.
318336
self.nearest_neighbour.fit(X)
@@ -334,7 +352,8 @@ def _sample(self, X, y):
334352
#
335353
# We start by changing the number of NNs to consider from m + 1
336354
# to k + 1
337-
self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
355+
self.nearest_neighbour.set_params(**{'n_neighbors':
356+
self.k_neighbors + 1})
338357
self.nearest_neighbour.fit(X_min)
339358

340359
# nns...#
@@ -358,7 +377,7 @@ def _sample(self, X, y):
358377

359378
# Reset the k-neighbours to m+1 neighbours
360379
self.nearest_neighbour.set_params(
361-
**{'n_neighbors': self.m + 1})
380+
**{'n_neighbors': self.m_neighbors + 1})
362381

363382
return X_resampled, y_resampled
364383

@@ -395,7 +414,7 @@ def _sample(self, X, y):
395414

396415
# Reset the k-neighbours to m+1 neighbours
397416
self.nearest_neighbour.set_params(
398-
**{'n_neighbors': self.m + 1})
417+
**{'n_neighbors': self.m_neighbors + 1})
399418

400419
return X_resampled, y_resampled
401420

@@ -416,7 +435,8 @@ def _sample(self, X, y):
416435

417436
# First, find the nn of all the samples to identify samples
418437
# in danger and noisy ones
419-
self.logger.debug('Finding the %s nearest neighbours ...', self.m)
438+
self.logger.debug('Finding the %s nearest neighbours ...',
439+
self.m_neighbors)
420440

421441
# As usual, fit a nearest neighbour model to the data
422442
self.nearest_neighbour.fit(X)
@@ -439,9 +459,11 @@ def _sample(self, X, y):
439459
safety_bool.sum().astype(int))
440460

441461
# Proceed to find support vectors NNs among the minority class
442-
self.logger.debug('Finding the %s nearest neighbours ...', self.k)
462+
self.logger.debug('Finding the %s nearest neighbours ...',
463+
self.k_neighbors)
443464

444-
self.nearest_neighbour.set_params(**{'n_neighbors': self.k + 1})
465+
self.nearest_neighbour.set_params(**{'n_neighbors':
466+
self.k_neighbors + 1})
445467
self.nearest_neighbour.fit(X_min)
446468

447469
self.logger.debug('Create synthetic samples ...')
@@ -496,7 +518,8 @@ def _sample(self, X, y):
496518
y_resampled = np.concatenate((y, y_new_1), axis=0)
497519

498520
# Reset the k-neighbours to m+1 neighbours
499-
self.nearest_neighbour.set_params(**{'n_neighbors': self.m + 1})
521+
self.nearest_neighbour.set_params(**{'n_neighbors':
522+
self.m_neighbors + 1})
500523

501524
return X_resampled, y_resampled
502525

@@ -509,17 +532,19 @@ def _get_smote_kind(self):
509532
# Regular smote does not look for samples in danger, instead it
510533
# creates synthetic samples directly from the k-th nearest
511534
# neighbours with not filtering
512-
self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
513-
n_jobs=self.n_jobs)
535+
self.nearest_neighbour = NearestNeighbors(
536+
n_neighbors=self.k_neighbors + 1,
537+
n_jobs=self.n_jobs)
514538
else:
515539
# Borderline1, 2 and SVM variations of smote must first look for
516540
# samples that could be considered noise and samples that live
517541
# near the boundary between the classes. Therefore, before
518542
# creating synthetic samples from the k-th nns, it first look
519543
# for m nearest neighbors to decide whether or not a sample is
520544
# noise or near the boundary.
521-
self.nearest_neighbour = NearestNeighbors(n_neighbors=self.m + 1,
522-
n_jobs=self.n_jobs)
545+
self.nearest_neighbour = NearestNeighbors(
546+
n_neighbors=self.m_neighbors + 1,
547+
n_jobs=self.n_jobs)
523548

524549
# --- SVM smote
525550
# Unlike the borderline variations, the SVM variation uses the support

imblearn/pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@ class Pipeline(pipeline.Pipeline):
100100
>>> pipeline = Pipeline([('smt', smt), ('pca', pca), ('knn', knn)])
101101
>>> X_train, X_test, y_train, y_test = tts(X, y, random_state=42)
102102
>>> pipeline.fit(X_train, y_train)
103-
Pipeline(steps=[('smt', SMOTE(k=5, kind='regular', m=10, n_jobs=-1, out_step=0.5, random_state=42,
104-
ratio='auto')), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
103+
Pipeline(steps=[('smt', SMOTE(k=None, k_neighbors=5, kind='regular', m=None, m_neighbors=10,
104+
n_jobs=-1, out_step=0.5, random_state=42, ratio='auto')), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
105105
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
106106
weights='uniform'))])
107107
>>> y_hat = pipeline.predict(X_test)

0 commit comments

Comments
 (0)