12
12
# Authors: Guillaume Lemaitre <[email protected] >
13
13
# License: MIT
14
14
15
- from collections import Counter
16
-
17
- import numpy as np
18
- import matplotlib .pyplot as plt
19
-
20
- from sklearn .datasets import load_iris
21
-
22
- from imblearn .datasets import make_imbalance
23
-
24
- from imblearn .over_sampling import RandomOverSampler
25
- from imblearn .under_sampling import RandomUnderSampler
26
- from imblearn .under_sampling import TomekLinks
27
-
15
+ # %%
28
16
print (__doc__ )
17
+ import seaborn as sns
29
18
19
+ sns .set_context ("poster" )
30
20
31
- def plot_pie (y ):
32
- target_stats = Counter (y )
33
- labels = list (target_stats .keys ())
34
- sizes = list (target_stats .values ())
35
- explode = tuple ([0.1 ] * len (target_stats ))
36
-
37
- def make_autopct (values ):
38
- def my_autopct (pct ):
39
- total = sum (values )
40
- val = int (round (pct * total / 100.0 ))
41
- return f"{ pct :.2f} % ({ val :d} )"
42
-
43
- return my_autopct
44
-
45
- fig , ax = plt .subplots ()
46
- ax .pie (
47
- sizes ,
48
- explode = explode ,
49
- labels = labels ,
50
- shadow = True ,
51
- autopct = make_autopct (sizes ),
52
- )
53
- ax .axis ("equal" )
54
-
55
-
56
- ###############################################################################
21
+ # %% [markdown]
22
+ # Create an imbalanced dataset
23
+ # ----------------------------
24
+ #
57
25
# First, we will create an imbalanced data set from the iris data set.
58
26
59
- iris = load_iris ()
27
+ # %%
28
+ from sklearn .datasets import load_iris
29
+ from imblearn .datasets import make_imbalance
60
30
61
- print (f"Information of the original iris data set: \n { Counter (iris .target )} " )
62
- plot_pie (iris .target )
31
+ iris = load_iris (as_frame = True )
63
32
64
33
sampling_strategy = {0 : 10 , 1 : 20 , 2 : 47 }
65
34
X , y = make_imbalance (iris .data , iris .target , sampling_strategy = sampling_strategy )
66
35
67
- print (
68
- f"Information of the iris data set after making it"
69
- f" imbalanced using a dict: \n sampling_strategy={ sampling_strategy } \n "
70
- f"y: { Counter (y )} "
71
- )
72
- plot_pie (y )
36
+ # %%
37
+ import matplotlib .pyplot as plt
73
38
74
- ###############################################################################
75
- # Using ``sampling_strategy`` in resampling algorithms
76
- ###############################################################################
39
+ fig , axs = plt .subplots (ncols = 2 , figsize = (10 , 5 ))
40
+ autopct = "%.2f"
41
+ iris .target .value_counts ().plot .pie (autopct = autopct , ax = axs [0 ])
42
+ axs [0 ].set_title ("Original" )
43
+ y .value_counts ().plot .pie (autopct = autopct , ax = axs [1 ])
44
+ axs [1 ].set_title ("Imbalanced" )
45
+ fig .tight_layout ()
77
46
78
- ###############################################################################
79
- # ``sampling_strategy`` as a ``float``
80
- # ....................................
47
+ # %% [markdown]
48
+ # Using ``sampling_strategy`` in resampling algorithms
49
+ # ====================================================
81
50
#
82
- # ``sampling_strategy`` can be given a ``float``. For **under-sampling
51
+ # `sampling_strategy` as a `float`
52
+ # --------------------------------
53
+ #
54
+ # `sampling_strategy` can be given a `float`. For **under-sampling
83
55
# methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by
84
56
# :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where :math:`N_{rM}` and
85
57
# :math:`N_{m}` are the number of samples in the majority class after
86
58
# resampling and the number of samples in the minority class, respectively.
87
59
60
+ # %%
61
+ import numpy as np
62
+
88
63
# select only 2 classes since the ratio makes sense in this case
89
64
binary_mask = np .bitwise_or (y == 0 , y == 2 )
90
65
binary_y = y [binary_mask ]
91
66
binary_X = X [binary_mask ]
92
67
93
- sampling_strategy = 0.8
68
+ # %%
69
+ from imblearn .under_sampling import RandomUnderSampler
94
70
71
+ sampling_strategy = 0.8
95
72
rus = RandomUnderSampler (sampling_strategy = sampling_strategy )
96
73
X_res , y_res = rus .fit_resample (binary_X , binary_y )
97
- print (
98
- f"Information of the iris data set after making it "
99
- f"balanced using a float and an under-sampling method: \n "
100
- f"sampling_strategy={ sampling_strategy } \n y: { Counter (y_res )} "
101
- )
102
- plot_pie (y_res )
103
-
104
- ###############################################################################
74
+ ax = y_res .value_counts ().plot .pie (autopct = autopct )
75
+ _ = ax .set_title ("Under-sampling" )
76
+
77
+ # %% [markdown]
105
78
# For **over-sampling methods**, it corresponds to the ratio
106
79
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
107
80
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
108
81
# minority class after resampling and the number of samples in the majority
109
82
# class, respectively.
110
83
84
+ # %%
85
+ from imblearn .over_sampling import RandomOverSampler
86
+
111
87
ros = RandomOverSampler (sampling_strategy = sampling_strategy )
112
88
X_res , y_res = ros .fit_resample (binary_X , binary_y )
113
- print (
114
- f"Information of the iris data set after making it "
115
- f"balanced using a float and an over-sampling method: \n "
116
- f"sampling_strategy={ sampling_strategy } \n y: { Counter (y_res )} "
117
- )
118
- plot_pie (y_res )
119
-
120
- ###############################################################################
121
- # ``sampling_strategy`` has a ``str``
122
- # ...................................
89
+ ax = y_res .value_counts ().plot .pie (autopct = autopct )
90
+ _ = ax .set_title ("Over-sampling" )
91
+
92
+ # %% [markdown]
93
+ # `sampling_strategy` as a `str`
94
+ # -------------------------------
123
95
#
124
- # `` sampling_strategy` ` can be given as a string which specify the class
96
+ # `sampling_strategy` can be given as a string which specifies the class
125
97
# targeted by the resampling. With under- and over-sampling, the number of
126
98
# samples will be equalized.
127
99
#
128
100
# Note that we are using multiple classes from now on.
129
101
102
+ # %%
130
103
sampling_strategy = "not minority"
131
104
105
+ fig , axs = plt .subplots (ncols = 2 , figsize = (10 , 5 ))
132
106
rus = RandomUnderSampler (sampling_strategy = sampling_strategy )
133
107
X_res , y_res = rus .fit_resample (X , y )
134
- print (
135
- f"Information of the iris data set after making it "
136
- f"balanced by under-sampling: \n sampling_strategy={ sampling_strategy } \n "
137
- f" y: { Counter (y_res )} "
138
- )
139
- plot_pie (y_res )
108
+ y_res .value_counts ().plot .pie (autopct = autopct , ax = axs [0 ])
109
+ axs [0 ].set_title ("Under-sampling" )
140
110
141
111
sampling_strategy = "not majority"
142
-
143
112
ros = RandomOverSampler (sampling_strategy = sampling_strategy )
144
113
X_res , y_res = ros .fit_resample (X , y )
145
- print (
146
- f"Information of the iris data set after making it "
147
- f"balanced by over-sampling: \n sampling_strategy={ sampling_strategy } \n "
148
- f"y: { Counter (y_res )} "
149
- )
150
- plot_pie (y_res )
151
-
152
- ###############################################################################
114
+ y_res .value_counts ().plot .pie (autopct = autopct , ax = axs [1 ])
115
+ axs [1 ].set_title ("Over-sampling" )
116
+
117
+ # %% [markdown]
153
118
# With **cleaning method**, the number of samples in each class will not be
154
119
# equalized even if targeted.
155
120
121
+ # %%
122
+ from imblearn .under_sampling import TomekLinks
123
+
156
124
sampling_strategy = "not minority"
157
125
tl = TomekLinks (sampling_strategy = sampling_strategy )
158
126
X_res , y_res = tl .fit_resample (X , y )
159
- print (
160
- f"Information of the iris data set after making it "
161
- f"balanced by cleaning sampling: \n sampling_strategy={ sampling_strategy } \n "
162
- f"y: { Counter (y_res )} "
163
- )
164
- plot_pie (y_res )
165
-
166
- ###############################################################################
167
- # ``sampling_strategy`` as a ``dict``
168
- # ...................................
127
+ ax = y_res .value_counts ().plot .pie (autopct = autopct )
128
+ _ = ax .set_title ("Cleaning" )
129
+
130
+ # %% [markdown]
131
+ # `sampling_strategy` as a `dict`
132
+ # -------------------------------
169
133
#
170
- # When `` sampling_strategy`` is a `` dict` `, the keys correspond to the targeted
134
+ # When `sampling_strategy` is a `dict`, the keys correspond to the targeted
171
135
# classes. The values correspond to the desired number of samples for each
172
136
# targeted class. This is working for both **under- and over-sampling**
173
- # algorithms but not for the **cleaning algorithms**. Use a `` list` ` instead.
137
+ # algorithms but not for the **cleaning algorithms**. Use a `list` instead.
174
138
139
+ # %%
140
+ fig , axs = plt .subplots (ncols = 2 , figsize = (10 , 5 ))
175
141
176
142
sampling_strategy = {0 : 10 , 1 : 15 , 2 : 20 }
177
-
178
143
rus = RandomUnderSampler (sampling_strategy = sampling_strategy )
179
144
X_res , y_res = rus .fit_resample (X , y )
180
- print (
181
- f"Information of the iris data set after making it "
182
- f"balanced by under-sampling: \n sampling_strategy={ sampling_strategy } \n "
183
- f"y: { Counter (y_res )} "
184
- )
185
- plot_pie (y_res )
145
+ y_res .value_counts ().plot .pie (autopct = autopct , ax = axs [0 ])
146
+ axs [0 ].set_title ("Under-sampling" )
186
147
187
148
sampling_strategy = {0 : 25 , 1 : 35 , 2 : 47 }
188
-
189
149
ros = RandomOverSampler (sampling_strategy = sampling_strategy )
190
150
X_res , y_res = ros .fit_resample (X , y )
191
- print (
192
- f"Information of the iris data set after making it "
193
- f"balanced by over-sampling: \n sampling_strategy={ sampling_strategy } \n "
194
- f"y: { Counter (y_res )} "
195
- )
196
- plot_pie (y_res )
197
-
198
- ###############################################################################
199
- # ``sampling_strategy`` as a ``list``
200
- # ...................................
151
+ y_res .value_counts ().plot .pie (autopct = autopct , ax = axs [1 ])
152
+ axs [1 ].set_title ("Over-sampling" )
153
+
154
+ # %% [markdown]
155
+ # `sampling_strategy` as a `list`
156
+ # -------------------------------
201
157
#
202
- # When `` sampling_strategy`` is a `` list` `, the list contains the targeted
158
+ # When `sampling_strategy` is a `list`, the list contains the targeted
203
159
# classes. It is used only for **cleaning methods** and raises an error
204
160
# otherwise.
205
161
162
+ # %%
206
163
sampling_strategy = [0 , 1 , 2 ]
207
164
tl = TomekLinks (sampling_strategy = sampling_strategy )
208
165
X_res , y_res = tl .fit_resample (X , y )
209
- print (
210
- f"Information of the iris data set after making it "
211
- f"balanced by cleaning sampling: \n sampling_strategy={ sampling_strategy } "
212
- f"\n y: { Counter (y_res )} "
213
- )
214
- plot_pie (y_res )
215
-
216
- ###############################################################################
217
- # ``sampling_strategy`` as a callable
218
- # ...................................
166
+ ax = y_res .value_counts ().plot .pie (autopct = autopct )
167
+ _ = ax .set_title ("Cleaning" )
168
+
169
+ # %% [markdown]
170
+ # `sampling_strategy` as a callable
171
+ # ---------------------------------
219
172
#
220
- # When callable, function taking ``y`` and returns a `` dict` `. The keys
173
+ # When callable, it is a function taking `y` and returning a `dict`. The keys
221
174
# correspond to the targeted classes. The values correspond to the desired
222
175
# number of samples for each class.
223
176
224
177
178
+ # %%
225
179
def ratio_multiplier (y ):
180
+ from collections import Counter
181
+
226
182
multiplier = {1 : 0.7 , 2 : 0.95 }
227
183
target_stats = Counter (y )
228
184
for key , value in target_stats .items ():
@@ -232,11 +188,6 @@ def ratio_multiplier(y):
232
188
233
189
234
190
X_res , y_res = RandomUnderSampler (sampling_strategy = ratio_multiplier ).fit_resample (X , y )
235
-
236
- print (
237
- f"Information of the iris data set after balancing using a callable"
238
- f" mode:\n ratio={ ratio_multiplier } \n y: { Counter (y_res )} "
239
- )
240
- plot_pie (y_res )
241
-
191
+ ax = y_res .value_counts ().plot .pie (autopct = autopct )
192
+ ax .set_title ("Under-sampling" )
242
193
plt .show ()
0 commit comments