Skip to content

Commit c687bc3

Browse files
author
Guillaume Lemaitre
committed
Add adasyn
1 parent 2c4a363 commit c687bc3

File tree

10 files changed

+438
-2
lines changed

10 files changed

+438
-2
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ Below is a list of the methods currently implemented in this module.
7979
2. [SMOTE - Synthetic Minority Over-sampling Technique](#ref8)
8080
3. [bSMOTE(1 & 2) - Borderline SMOTE of types 1 and 2](#ref9)
8181
4. [SVM SMOTE - Support Vectors SMOTE](#ref10)
82+
5. [ADASYN - Adaptive synthetic sampling approach for imbalanced learning](#ref15)
8283

8384
* Over-sampling followed by under-sampling
8485
1. [SMOTE + Tomek links](#ref12)
@@ -121,4 +122,6 @@ References:
121122

122123
<a name="ref13"></a>[13]: X. Y. Liu, J. Wu and Z. H. Zhou, [“Exploratory Undersampling for Class-Imbalance Learning,”](http://cse.seu.edu.cn/people/xyliu/publication/tsmcb09.pdf) in IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550, April 2009.
123124

124-
<a name="ref14"></a>[14]:I. Tomek, [“An Experiment with the Edited Nearest-Neighbor Rule,”](http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4309523) IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6), pp. 448-452, June 1976.
125+
<a name="ref14"></a>[14]: I. Tomek, [“An Experiment with the Edited Nearest-Neighbor Rule,”](http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4309523) IEEE Transactions on Systems, Man, and Cybernetics, vol. 6(6), pp. 448-452, June 1976.
126+
127+
<a name="ref15"></a>[15]: He, Haibo, Yang Bai, Edwardo A. Garcia, and Shutao Li. [“ADASYN: Adaptive synthetic sampling approach for imbalanced learning,”](http://140.123.102.14:8080/reportSys/file/paper/manto/manto_6_paper.pdf) In IEEE International Joint Conference on Neural Networks (IEEE World Congress on Computational Intelligence), pp. 1322-1328, 2008.

doc/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ Classes
4848
.. autosummary::
4949
:toctree: generated/
5050

51+
over_sampling.ADASYN
5152
over_sampling.RandomOverSampler
5253
over_sampling.SMOTE
5354

examples/over-sampling/plot_adasyn.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""
2+
======
3+
ADASYN
4+
======
5+
6+
An illustration of the Adaptive Synthetic Sampling Approach for Imbalanced
7+
Learning ADASYN method.
8+
9+
"""
10+
11+
print(__doc__)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Colours shared by every scatter plot
almost_black = '#262626'
palette = sns.color_palette()

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from imblearn.over_sampling import ADASYN

# Build an imbalanced two-class toy dataset
dataset_params = dict(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                      n_informative=3, n_redundant=1, flip_y=0,
                      n_features=20, n_clusters_per_class=1,
                      n_samples=5000, random_state=10)
X, y = make_classification(**dataset_params)

# Project onto two principal components so the data can be drawn in 2D
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Over-sample the minority class with ADASYN and project the result with the
# PCA fitted on the original data
ada = ADASYN()
X_resampled, y_resampled = ada.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# One subplot for the original data, one for the resampled data
fig, (ax_original, ax_adasyn) = plt.subplots(1, 2)

# (class label, legend entry, face colour) for each class to draw
class_styles = ((0, "Class #0", palette[0]),
                (1, "Class #1", palette[2]))

panels = ((ax_original, X_vis, y),
          (ax_adasyn, X_res_vis, y_resampled))

for axis, points, labels in panels:
    for class_label, legend_name, face_colour in class_styles:
        mask = labels == class_label
        axis.scatter(points[mask, 0], points[mask, 1], label=legend_name,
                     alpha=0.5, edgecolor=almost_black,
                     facecolor=face_colour, linewidth=0.15)

ax_original.set_title('Original set')
ax_adasyn.set_title('ADASYN')

plt.show()

imblearn/over_sampling/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
from .over_sampler import OverSampler
77
from .random_over_sampler import RandomOverSampler
88
from .smote import SMOTE
9+
from .adasyn import ADASYN
910

1011
__all__ = ['OverSampler',
1112
'RandomOverSampler',
12-
'SMOTE']
13+
'SMOTE',
14+
'ADASYN']

imblearn/over_sampling/adasyn.py

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
"""Class to perform random over-sampling."""
2+
from __future__ import print_function
3+
from __future__ import division
4+
5+
import numpy as np
6+
7+
from collections import Counter
8+
9+
from sklearn.neighbors import NearestNeighbors
10+
from sklearn.utils import check_X_y
11+
12+
from .over_sampler import OverSampler
13+
14+
15+
class ADASYN(OverSampler):
    """Perform over-sampling using ADASYN.

    Perform over-sampling using Adaptive Synthetic Sampling Approach for
    Imbalanced Learning.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int or None, optional (default=None)
        Seed for random number generation.

    verbose : bool, optional (default=True)
        Whether or not to print information about the processing.

    k : int, optional (default=5)
        Number of nearest neighbours used to construct synthetic samples.

    n_jobs : int, optional (default=-1)
        Number of threads to run the algorithm when it is possible.

    Attributes
    ----------
    ratio : str or float
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int or None
        Seed for random number generation.

    min_c_ : str or int
        The identifier of the minority class.

    maj_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    X_shape_ : tuple of int
        Shape of the data `X` during fitting.

    Notes
    -----
    Does not support multi-class.

    The implementation is based on [1]_.

    References
    ----------
    .. [1] He, Haibo, Yang Bai, Edwardo A. Garcia, and Shutao Li. "ADASYN:
       Adaptive synthetic sampling approach for imbalanced learning," In IEEE
       International Joint Conference on Neural Networks (IEEE World Congress
       on Computational Intelligence), pp. 1322-1328, 2008.

    """

    def __init__(self, ratio='auto', random_state=None, verbose=True, k=5,
                 n_jobs=-1):
        """Initialize this object and its instance variables.

        Parameters
        ----------
        ratio : str or float, optional (default='auto')
            If 'auto', the ratio will be defined automatically to balance
            the dataset. Otherwise, the ratio is defined as the number
            of samples in the minority class over the number of samples
            in the majority class.

        random_state : int or None, optional (default=None)
            Seed for random number generation.

        verbose : bool, optional (default=True)
            Whether or not to print information about the processing.

        k : int, optional (default=5)
            Number of nearest neighbours used to construct synthetic
            samples.

        n_jobs : int, optional (default=-1)
            Number of threads to run the algorithm when it is possible.

        Returns
        -------
        None

        """
        super(ADASYN, self).__init__(ratio=ratio,
                                     random_state=random_state,
                                     verbose=verbose)
        self.k = k
        self.n_jobs = n_jobs
        # k + 1 neighbours are requested because each queried sample is part
        # of the fitted data and comes back as its own first neighbour.
        self.nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1,
                                                  n_jobs=self.n_jobs)

    def fit(self, X, y):
        """Find the class statistics before performing sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(ADASYN, self).fit(X, y)

        return self

    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The original samples followed by the synthetic minority samples.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        Raises
        ------
        RuntimeError
            If none of the minority samples has a majority-class sample
            among its `k` nearest neighbours, in which case the ADASYN
            density distribution is undefined.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(ADASYN, self).sample(X, y)

        # Define the number of sample to create
        # We handle only two classes problem for the moment.
        if self.ratio == 'auto':
            num_samples = (self.stats_c_[self.maj_c_] -
                           self.stats_c_[self.min_c_])
        else:
            num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) -
                              self.stats_c_[self.min_c_])

        # Start by separating minority class features and target values.
        X_min = X[y == self.min_c_]

        # Print if verbose is true
        if self.verbose:
            print('Finding the {} nearest neighbours...'.format(self.k))

        # Look for k-th nearest neighbours, excluding, of course, the
        # point itself.
        # NOTE(review): the neighbours are searched in the whole dataset, so
        # a synthetic point may be interpolated towards a majority sample;
        # the reference paper picks minority neighbours - confirm intent.
        self.nearest_neighbour.fit(X)
        _, ind_nn = self.nearest_neighbour.kneighbors(X_min)

        # Compute the ratio of majority samples next to minority samples
        ratio_nn = np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) / self.k
        ratio_total = np.sum(ratio_nn)
        if ratio_total == 0:
            # Normalizing would divide by zero and yield NaN counts.
            raise RuntimeError('No minority sample has a majority sample'
                               ' among its nearest neighbours: the ADASYN'
                               ' density distribution is undefined for this'
                               ' dataset.')
        # Normalize the ratio so that it sums to one
        ratio_nn = ratio_nn / ratio_total

        # Compute the number of sample to be generated
        num_samples_nn = np.round(ratio_nn * num_samples).astype(int)

        # Seed a dedicated generator ONCE. Re-seeding inside the loop would
        # replay the same random draws for every minority sample, and
        # seeding the global NumPy generator would affect unrelated code.
        rng = np.random.RandomState(self.random_state)

        # Collect the synthetic samples and stack them once at the end
        # instead of growing the output array one row at a time.
        generated = []
        for x_i, x_i_nn, num_sample_i in zip(X_min, ind_nn, num_samples_nn):
            # Pick-up the neighbors wanted (column 0 is the sample itself)
            nn_zs = rng.randint(1, high=self.k + 1, size=num_sample_i)
            steps = rng.uniform(size=(num_sample_i, 1))

            # Interpolate between the sample and each selected neighbour:
            # x_new = x_i + step * (x_neighbour - x_i), with step in [0, 1)
            neighbours = X[x_i_nn[nn_zs], :]
            generated.append(x_i + steps * (neighbours - x_i))

        # Keep every original sample (majority AND minority) and append the
        # synthetic minority samples after them.
        X_resampled = np.vstack([X] + generated)
        num_generated = X_resampled.shape[0] - X.shape[0]
        y_resampled = np.hstack((y, np.repeat(self.min_c_, num_generated)))

        if self.verbose:
            print("Over-sampling performed: {}".format(Counter(y_resampled)))

        return X_resampled, y_resampled
1.3 MB
Binary file not shown.
978 KB
Binary file not shown.
66.4 KB
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)