Skip to content

Commit 3ee7147

Browse files
chkoarglemaitre
authored andcommitted
[MRG+1] Rename all occurrences of size_ngh to n_neighbors for consistency with scikit-learn (#109)
* Rename all occurrences of size_ngh to n_neighbors for consistency with scikit-learn. * Implement deprecation for smote_enn and enn * Add the changes in documentation * Make the changes in the base function * Minor comment fixes
1 parent db85c84 commit 3ee7147

19 files changed

+153
-49
lines changed

doc/whats_new.rst

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ Changelog
1414

1515
Bug fixes
1616
~~~~~~~~~
17-
1817
- Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the right samples during under-sampling for method 3. By `Guillaume Lemaitre`_.
1918
- Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the `random_state` generation. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
2019
- Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`: added an additional stopping criterion to prevent the minority class from becoming a majority class or a class from disappearing. By `Guillaume Lemaitre`_.
@@ -38,6 +37,18 @@ Enhancement
3837
- Added support for bumpversion. By `Guillaume Lemaitre`_.
3938
- Validate the type of target in binary samplers. A warning is raised for the moment. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
4039

40+
New features
41+
~~~~~~~~~~~~
42+
43+
- Added AllKNN under sampling technique.
44+
- Added support for bumpversion.
45+
46+
API changes summary
47+
~~~~~~~~~~~~~~~~~~~
48+
49+
- `size_ngh` has been deprecated in :class:`combine.SMOTEENN`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
50+
- `size_ngh` has been deprecated in :class:`under_sampling.EditedNearestNeighbours`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
51+
4152
Documentation changes
4253
~~~~~~~~~~~~~~~~~~~~~
4354

examples/plot_unbalanced_dataset.ipynb

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -215,25 +215,25 @@
215215
"NM3 = NearMiss(version=3)\n",
216216
"nm3x, nm3y = NM3.fit_sample(x, y)\n",
217217
"# 'Condensed Nearest Neighbour'\n",
218-
"CNN = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)\n",
218+
"CNN = CondensedNearestNeighbour(n_neighbors=51, n_seeds_S=51)\n",
219219
"cnnx, cnny = CNN.fit_sample(x, y)\n",
220220
"# 'One-Sided Selection'\n",
221-
"OSS = OneSidedSelection(size_ngh=51, n_seeds_S=51)\n",
221+
"OSS = OneSidedSelection(n_neighbors=51, n_seeds_S=51)\n",
222222
"ossx, ossy = OSS.fit_sample(x, y)\n",
223223
"# 'Neighboorhood Cleaning Rule'\n",
224-
"NCR = NeighbourhoodCleaningRule(size_ngh=51)\n",
224+
"NCR = NeighbourhoodCleaningRule(n_neighbors=51)\n",
225225
"ncrx, ncry = NCR.fit_sample(x, y) \n",
226226
"# 'Edited Neareast Neighbour'\n",
227-
"ENN = EditedNearestNeighbours(size_ngh=51)\n",
227+
"ENN = EditedNearestNeighbours(n_neighbors=51)\n",
228228
"ennx, enny = ENN.fit_sample(x, y)\n",
229229
"# 'Instance Hardness Threshold'\n",
230230
"IHT = InstanceHardnessThreshold()\n",
231231
"ihtx, ihty = IHT.fit_sample(x, y)\n",
232232
"# 'Repeated Edited Nearest Neighbour'\n",
233-
"RENN = RepeatedEditedNearestNeighbours(size_ngh=51)\n",
233+
"RENN = RepeatedEditedNearestNeighbours(n_neighbors=51)\n",
234234
"rennx, renny = RENN.fit_sample(x, y)\n",
235235
"# 'AllKNN'\n",
236-
"ALLK = AllKNN(size_ngh=51)\n",
236+
"ALLK = AllKNN(n_neighbors=51)\n",
237237
"allkx, allky = ALLK.fit_sample(x, y)\n",
238238
"\n",
239239
"# Apply PCA to be able to visualise the results\n",

imblearn/base.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,12 @@ def fit(self, X, y):
7474
if hasattr(self, 'ratio'):
7575
self._validate_ratio()
7676

77+
if hasattr(self, 'size_ngh'):
78+
self._validate_size_ngh_deprecation()
79+
7780
self.logger.info('Compute classes statistics ...')
7881

79-
# # Raise an error if there is only one class
82+
# Raise an error if there is only one class
8083
# if uniques.size == 1:
8184
# raise RuntimeError("Only one class detected, aborting...")
8285
# Raise a warning for the moment to be compatible with BaseEstimator
@@ -149,6 +152,9 @@ def sample(self, X, y):
149152
if hasattr(self, 'ratio'):
150153
self._validate_ratio()
151154

155+
if hasattr(self, 'size_ngh'):
156+
self._validate_size_ngh_deprecation()
157+
152158
return self._sample(X, y)
153159

154160
def fit_sample(self, X, y):
@@ -190,6 +196,15 @@ def _validate_ratio(self):
190196
else:
191197
raise ValueError('Unknown parameter type for ratio.')
192198

199+
def _validate_size_ngh_deprecation(self):
200+
"Private function to warn about the deprecation about size_ngh."
201+
202+
# Announce deprecation if necessary
203+
if self.size_ngh is not None:
204+
warnings.warn('`size_ngh` will be replaced in version 0.4. Use'
205+
' `n_neighbors` instead.', DeprecationWarning)
206+
self.n_neighbors = self.size_ngh
207+
193208
@abstractmethod
194209
def _sample(self, X, y):
195210
"""Resample the dataset.

imblearn/combine/smote_enn.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""Class to perform over-sampling using SMOTE and cleaning using ENN."""
22
from __future__ import division, print_function
33

4+
import warnings
5+
46
from ..base import BaseBinarySampler
57
from ..over_sampling import SMOTE
68
from ..under_sampling import EditedNearestNeighbours
@@ -40,7 +42,14 @@ class SMOTEENN(BaseBinarySampler):
4042
The type of SMOTE algorithm to use one of the following
4143
options: 'regular', 'borderline1', 'borderline2', 'svm'.
4244
43-
size_ngh : int, optional (default=3)
45+
size_ngh : int, optional (default=None)
46+
Size of the neighbourhood to consider to compute the average
47+
distance to the minority point samples.
48+
49+
NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
50+
Use ``n_neighbors`` instead.
51+
52+
n_neighbors : int, optional (default=3)
4453
Size of the neighbourhood to consider to compute the average
4554
distance to the minority point samples.
4655
@@ -103,7 +112,8 @@ class SMOTEENN(BaseBinarySampler):
103112

104113
def __init__(self, ratio='auto', random_state=None,
105114
k=5, m=10, out_step=0.5, kind_smote='regular',
106-
size_ngh=3, kind_enn='all', n_jobs=-1, **kwargs):
115+
size_ngh=None, n_neighbors=3, kind_enn='all', n_jobs=-1,
116+
**kwargs):
107117

108118
super(SMOTEENN, self).__init__(ratio=ratio)
109119
self.random_state = random_state
@@ -112,6 +122,7 @@ def __init__(self, ratio='auto', random_state=None,
112122
self.out_step = out_step
113123
self.kind_smote = kind_smote
114124
self.size_ngh = size_ngh
125+
self.n_neighbors = n_neighbors
115126
self.kind_enn = kind_enn
116127
self.n_jobs = n_jobs
117128
self.kwargs = kwargs
@@ -121,6 +132,7 @@ def __init__(self, ratio='auto', random_state=None,
121132
**self.kwargs)
122133
self.enn = EditedNearestNeighbours(random_state=self.random_state,
123134
size_ngh=self.size_ngh,
135+
n_neighbors=self.n_neighbors,
124136
kind_sel=self.kind_enn,
125137
n_jobs=self.n_jobs)
126138

@@ -144,6 +156,12 @@ def fit(self, X, y):
144156

145157
super(SMOTEENN, self).fit(X, y)
146158

159+
# Announce deprecation if necessary
160+
if self.size_ngh is not None:
161+
warnings.warn('`size_ngh` will be replaced in version 0.4. Use'
162+
' `n_neighbors` instead.', DeprecationWarning)
163+
self.n_neighbors = self.size_ngh
164+
147165
# Fit using SMOTE
148166
self.sm.fit(X, y)
149167

imblearn/combine/smote_tomek.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,6 @@ class SMOTETomek(BaseBinarySampler):
4242
The type of SMOTE algorithm to use one of the following
4343
options: 'regular', 'borderline1', 'borderline2', 'svm'
4444
45-
size_ngh : int, optional (default=3)
46-
Size of the neighbourhood to consider to compute the average
47-
distance to the minority point samples.
48-
4945
kind_sel : str, optional (default='all')
5046
Strategy to use in order to exclude samples.
5147

imblearn/under_sampling/condensed_nearest_neighbour.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
method."""
33
from __future__ import division, print_function
44

5+
import warnings
6+
57
from collections import Counter
68

79
import numpy as np
@@ -27,7 +29,14 @@ class CondensedNearestNeighbour(BaseMulticlassSampler):
2729
If None, the random number generator is the RandomState instance used
2830
by np.random.
2931
30-
size_ngh : int, optional (default=1)
32+
size_ngh : int, optional (default=None)
33+
Size of the neighbourhood to consider to compute the average
34+
distance to the minority point samples.
35+
36+
NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
37+
Use ``n_neighbors`` instead.
38+
39+
n_neighbors : int, optional (default=1)
3140
Size of the neighbourhood to consider to compute the average
3241
distance to the minority point samples.
3342
@@ -86,12 +95,14 @@ class CondensedNearestNeighbour(BaseMulticlassSampler):
8695
"""
8796

8897
def __init__(self, return_indices=False, random_state=None,
89-
size_ngh=1, n_seeds_S=1, n_jobs=-1, **kwargs):
98+
size_ngh=None, n_neighbors=1, n_seeds_S=1, n_jobs=-1,
99+
**kwargs):
90100
super(CondensedNearestNeighbour, self).__init__()
91101

92102
self.return_indices = return_indices
93103
self.random_state = random_state
94104
self.size_ngh = size_ngh
105+
self.n_neighbors = n_neighbors
95106
self.n_seeds_S = n_seeds_S
96107
self.n_jobs = n_jobs
97108
self.kwargs = kwargs
@@ -158,7 +169,7 @@ def _sample(self, X, y):
158169
S_y = y[y == key]
159170

160171
# Create a k-NN classifier
161-
knn = KNeighborsClassifier(n_neighbors=self.size_ngh,
172+
knn = KNeighborsClassifier(n_neighbors=self.n_neighbors,
162173
n_jobs=self.n_jobs,
163174
**self.kwargs)
164175

imblearn/under_sampling/edited_nearest_neighbours.py

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from collections import Counter
66

7+
import warnings
78
import numpy as np
89
from scipy.stats import mode
910
from sklearn.neighbors import NearestNeighbors
@@ -29,7 +30,14 @@ class EditedNearestNeighbours(BaseMulticlassSampler):
2930
If None, the random number generator is the RandomState instance used
3031
by np.random.
3132
32-
size_ngh : int, optional (default=3)
33+
size_ngh : int, optional (default=None)
34+
Size of the neighbourhood to consider to compute the average
35+
distance to the minority point samples.
36+
37+
NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
38+
Use ``n_neighbors`` instead.
39+
40+
n_neighbors : int, optional (default=3)
3341
Size of the neighbourhood to consider to compute the average
3442
distance to the minority point samples.
3543
@@ -91,11 +99,12 @@ class EditedNearestNeighbours(BaseMulticlassSampler):
9199
"""
92100

93101
def __init__(self, return_indices=False, random_state=None,
94-
size_ngh=3, kind_sel='all', n_jobs=-1):
102+
size_ngh=None, n_neighbors=3, kind_sel='all', n_jobs=-1):
95103
super(EditedNearestNeighbours, self).__init__()
96104
self.return_indices = return_indices
97105
self.random_state = random_state
98106
self.size_ngh = size_ngh
107+
self.n_neighbors = n_neighbors
99108
self.kind_sel = kind_sel
100109
self.n_jobs = n_jobs
101110

@@ -140,7 +149,7 @@ def _sample(self, X, y):
140149
idx_under = np.flatnonzero(y == self.min_c_)
141150

142151
# Create a k-NN to fit the whole data
143-
nn_obj = NearestNeighbors(n_neighbors=self.size_ngh + 1,
152+
nn_obj = NearestNeighbors(n_neighbors=self.n_neighbors + 1,
144153
n_jobs=self.n_jobs)
145154
# Fit the data
146155
nn_obj.fit(X)
@@ -217,7 +226,14 @@ class RepeatedEditedNearestNeighbours(BaseMulticlassSampler):
217226
If None, the random number generator is the RandomState instance used
218227
by np.random.
219228
220-
size_ngh : int, optional (default=3)
229+
size_ngh : int, optional (default=None)
230+
Size of the neighbourhood to consider to compute the average
231+
distance to the minority point samples.
232+
233+
NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
234+
Use ``n_neighbors`` instead.
235+
236+
n_neighbors : int, optional (default=3)
221237
Size of the neighbourhood to consider to compute the average
222238
distance to the minority point samples.
223239
@@ -283,18 +299,20 @@ class RepeatedEditedNearestNeighbours(BaseMulticlassSampler):
283299
"""
284300

285301
def __init__(self, return_indices=False, random_state=None,
286-
size_ngh=3, max_iter=100, kind_sel='all', n_jobs=-1):
302+
size_ngh=None, n_neighbors=3, max_iter=100, kind_sel='all',
303+
n_jobs=-1):
287304
super(RepeatedEditedNearestNeighbours, self).__init__()
288305
self.return_indices = return_indices
289306
self.random_state = random_state
290307
self.size_ngh = size_ngh
308+
self.n_neighbors = n_neighbors
291309
self.kind_sel = kind_sel
292310
self.n_jobs = n_jobs
293311
self.max_iter = max_iter
294312
self.enn_ = EditedNearestNeighbours(
295313
return_indices=self.return_indices,
296314
random_state=self.random_state,
297-
size_ngh=self.size_ngh,
315+
n_neighbors=self.n_neighbors,
298316
kind_sel=self.kind_sel,
299317
n_jobs=self.n_jobs)
300318

@@ -441,7 +459,14 @@ class AllKNN(BaseMulticlassSampler):
441459
If None, the random number generator is the RandomState instance used
442460
by np.random.
443461
444-
size_ngh : int, optional (default=3)
462+
size_ngh : int, optional (default=None)
463+
Size of the neighbourhood to consider to compute the average
464+
distance to the minority point samples.
465+
466+
NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4
467+
Use ``n_neighbors`` instead.
468+
469+
n_neighbors : int, optional (default=3)
445470
Size of the neighbourhood to consider to compute the average
446471
distance to the minority point samples.
447472
@@ -503,17 +528,18 @@ class AllKNN(BaseMulticlassSampler):
503528
"""
504529

505530
def __init__(self, return_indices=False, random_state=None,
506-
size_ngh=3, kind_sel='all', n_jobs=-1):
531+
size_ngh=None, n_neighbors=3, kind_sel='all', n_jobs=-1):
507532
super(AllKNN, self).__init__()
508533
self.return_indices = return_indices
509534
self.random_state = random_state
510535
self.size_ngh = size_ngh
536+
self.n_neighbors = n_neighbors
511537
self.kind_sel = kind_sel
512538
self.n_jobs = n_jobs
513539
self.enn_ = EditedNearestNeighbours(
514540
return_indices=self.return_indices,
515541
random_state=self.random_state,
516-
size_ngh=self.size_ngh,
542+
n_neighbors=self.n_neighbors,
517543
kind_sel=self.kind_sel,
518544
n_jobs=self.n_jobs)
519545

@@ -572,10 +598,11 @@ def _sample(self, X, y):
572598
if self.return_indices:
573599
idx_under = np.arange(X.shape[0], dtype=int)
574600

575-
for curr_size_ngh in range(1, self.size_ngh + 1):
601+
for curr_size_ngh in range(1, self.n_neighbors + 1):
576602
self.logger.debug('Apply ENN size_ngh #%s', curr_size_ngh)
577603
# updating ENN size_ngh
578604
self.enn_.size_ngh = curr_size_ngh
605+
579606
if self.return_indices:
580607
X_enn, y_enn, idx_enn = self.enn_.fit_sample(X_, y_)
581608
else:

0 commit comments

Comments
 (0)