Skip to content

Commit 2150e67

Browse files
authored
DOC improve style of some examples (#813)
1 parent 9f4173d commit 2150e67

File tree

8 files changed

+354
-440
lines changed

8 files changed

+354
-440
lines changed

build_tools/circle/build_doc.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ conda create -n $CONDA_ENV_NAME --yes --quiet python=3.8
101101
source activate $CONDA_ENV_NAME
102102

103103
conda install --yes pip numpy scipy joblib pillow matplotlib memory_profiler \
104-
sphinx pandas tensorflow=2
104+
sphinx pandas tensorflow=2 seaborn
105105
pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn
106106
pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git
107107
pip install -U git+https://github.com/numpy/numpydoc.git

doc/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@
160160
"matplotlib": ("https://matplotlib.org/", None),
161161
"sklearn": ("http://scikit-learn.org/stable", None),
162162
"pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None),
163+
"seaborn": ("https://seaborn.pydata.org/", None),
163164
}
164165

165166
# -- Options for sphinx-gallery -----------------------------------------------

examples/api/plot_sampling_strategy_usage.py

Lines changed: 90 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -12,217 +12,173 @@
1212
# Authors: Guillaume Lemaitre <[email protected]>
1313
# License: MIT
1414

15-
from collections import Counter
16-
17-
import numpy as np
18-
import matplotlib.pyplot as plt
19-
20-
from sklearn.datasets import load_iris
21-
22-
from imblearn.datasets import make_imbalance
23-
24-
from imblearn.over_sampling import RandomOverSampler
25-
from imblearn.under_sampling import RandomUnderSampler
26-
from imblearn.under_sampling import TomekLinks
27-
15+
# %%
2816
print(__doc__)
17+
import seaborn as sns
2918

19+
sns.set_context("poster")
3020

31-
def plot_pie(y):
32-
target_stats = Counter(y)
33-
labels = list(target_stats.keys())
34-
sizes = list(target_stats.values())
35-
explode = tuple([0.1] * len(target_stats))
36-
37-
def make_autopct(values):
38-
def my_autopct(pct):
39-
total = sum(values)
40-
val = int(round(pct * total / 100.0))
41-
return f"{pct:.2f}% ({val:d})"
42-
43-
return my_autopct
44-
45-
fig, ax = plt.subplots()
46-
ax.pie(
47-
sizes,
48-
explode=explode,
49-
labels=labels,
50-
shadow=True,
51-
autopct=make_autopct(sizes),
52-
)
53-
ax.axis("equal")
54-
55-
56-
###############################################################################
21+
# %% [markdown]
22+
# Create an imbalanced dataset
23+
# ----------------------------
24+
#
5725
# First, we will create an imbalanced data set from the iris data set.
5826

59-
iris = load_iris()
27+
# %%
28+
from sklearn.datasets import load_iris
29+
from imblearn.datasets import make_imbalance
6030

61-
print(f"Information of the original iris data set: \n {Counter(iris.target)}")
62-
plot_pie(iris.target)
31+
iris = load_iris(as_frame=True)
6332

6433
sampling_strategy = {0: 10, 1: 20, 2: 47}
6534
X, y = make_imbalance(iris.data, iris.target, sampling_strategy=sampling_strategy)
6635

67-
print(
68-
f"Information of the iris data set after making it"
69-
f" imbalanced using a dict: \n sampling_strategy={sampling_strategy} \n "
70-
f"y: {Counter(y)}"
71-
)
72-
plot_pie(y)
36+
# %%
37+
import matplotlib.pyplot as plt
7338

74-
###############################################################################
75-
# Using ``sampling_strategy`` in resampling algorithms
76-
###############################################################################
39+
fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
40+
autopct = "%.2f"
41+
iris.target.value_counts().plot.pie(autopct=autopct, ax=axs[0])
42+
axs[0].set_title("Original")
43+
y.value_counts().plot.pie(autopct=autopct, ax=axs[1])
44+
axs[1].set_title("Imbalanced")
45+
fig.tight_layout()
7746

78-
###############################################################################
79-
# ``sampling_strategy`` as a ``float``
80-
# ....................................
47+
# %% [markdown]
48+
# Using ``sampling_strategy`` in resampling algorithms
49+
# ====================================================
8150
#
82-
# ``sampling_strategy`` can be given a ``float``. For **under-sampling
51+
# `sampling_strategy` as a `float`
52+
# --------------------------------
53+
#
54+
# `sampling_strategy` can be given a `float`. For **under-sampling
8355
# methods**, it corresponds to the ratio :math:`\alpha_{us}` defined by
8456
# :math:`N_{rM} = \alpha_{us} \times N_{m}` where :math:`N_{rM}` and
8557
# :math:`N_{m}` are the number of samples in the majority class after
8658
# resampling and the number of samples in the minority class, respectively.
8759

60+
# %%
61+
import numpy as np
62+
8863
# select only 2 classes since the ratio makes sense in this case
8964
binary_mask = np.bitwise_or(y == 0, y == 2)
9065
binary_y = y[binary_mask]
9166
binary_X = X[binary_mask]
9267

93-
sampling_strategy = 0.8
68+
# %%
69+
from imblearn.under_sampling import RandomUnderSampler
9470

71+
sampling_strategy = 0.8
9572
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
9673
X_res, y_res = rus.fit_resample(binary_X, binary_y)
97-
print(
98-
f"Information of the iris data set after making it "
99-
f"balanced using a float and an under-sampling method: \n "
100-
f"sampling_strategy={sampling_strategy} \n y: {Counter(y_res)}"
101-
)
102-
plot_pie(y_res)
103-
104-
###############################################################################
74+
ax = y_res.value_counts().plot.pie(autopct=autopct)
75+
_ = ax.set_title("Under-sampling")
76+
77+
# %% [markdown]
10578
# For **over-sampling methods**, it corresponds to the ratio
10679
# :math:`\alpha_{os}` defined by :math:`N_{rm} = \alpha_{os} \times N_{M}`
10780
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
10881
# minority class after resampling and the number of samples in the majority
10982
# class, respectively.
11083

84+
# %%
85+
from imblearn.over_sampling import RandomOverSampler
86+
11187
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
11288
X_res, y_res = ros.fit_resample(binary_X, binary_y)
113-
print(
114-
f"Information of the iris data set after making it "
115-
f"balanced using a float and an over-sampling method: \n "
116-
f"sampling_strategy={sampling_strategy} \n y: {Counter(y_res)}"
117-
)
118-
plot_pie(y_res)
119-
120-
###############################################################################
121-
# ``sampling_strategy`` has a ``str``
122-
# ...................................
89+
ax = y_res.value_counts().plot.pie(autopct=autopct)
90+
_ = ax.set_title("Over-sampling")
91+
92+
# %% [markdown]
93+
# `sampling_strategy` as a `str`
94+
# -------------------------------
12395
#
124-
# ``sampling_strategy`` can be given as a string which specify the class
96+
# `sampling_strategy` can be given as a string which specifies the class
12597
# targeted by the resampling. With under- and over-sampling, the number of
12698
# samples will be equalized.
12799
#
128100
# Note that we are using multiple classes from now on.
129101

102+
# %%
130103
sampling_strategy = "not minority"
131104

105+
fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
132106
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
133107
X_res, y_res = rus.fit_resample(X, y)
134-
print(
135-
f"Information of the iris data set after making it "
136-
f"balanced by under-sampling: \n sampling_strategy={sampling_strategy} \n"
137-
f" y: {Counter(y_res)}"
138-
)
139-
plot_pie(y_res)
108+
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[0])
109+
axs[0].set_title("Under-sampling")
140110

141111
sampling_strategy = "not majority"
142-
143112
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
144113
X_res, y_res = ros.fit_resample(X, y)
145-
print(
146-
f"Information of the iris data set after making it "
147-
f"balanced by over-sampling: \n sampling_strategy={sampling_strategy} \n "
148-
f"y: {Counter(y_res)}"
149-
)
150-
plot_pie(y_res)
151-
152-
###############################################################################
114+
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[1])
115+
axs[1].set_title("Over-sampling")
116+
117+
# %% [markdown]
153118
# With **cleaning method**, the number of samples in each class will not be
154119
# equalized even if targeted.
155120

121+
# %%
122+
from imblearn.under_sampling import TomekLinks
123+
156124
sampling_strategy = "not minority"
157125
tl = TomekLinks(sampling_strategy)
158126
X_res, y_res = tl.fit_resample(X, y)
159-
print(
160-
f"Information of the iris data set after making it "
161-
f"balanced by cleaning sampling: \n sampling_strategy={sampling_strategy} \n "
162-
f"y: {Counter(y_res)}"
163-
)
164-
plot_pie(y_res)
165-
166-
###############################################################################
167-
# ``sampling_strategy`` as a ``dict``
168-
# ...................................
127+
ax = y_res.value_counts().plot.pie(autopct=autopct)
128+
_ = ax.set_title("Cleaning")
129+
130+
# %% [markdown]
131+
# `sampling_strategy` as a `dict`
132+
# -------------------------------
169133
#
170-
# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted
134+
# When `sampling_strategy` is a `dict`, the keys correspond to the targeted
171135
# classes. The values correspond to the desired number of samples for each
172136
# targeted class. This works for both **under- and over-sampling**
173-
# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead.
137+
# algorithms but not for the **cleaning algorithms**. Use a `list` instead.
174138

139+
# %%
140+
fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
175141

176142
sampling_strategy = {0: 10, 1: 15, 2: 20}
177-
178143
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
179144
X_res, y_res = rus.fit_resample(X, y)
180-
print(
181-
f"Information of the iris data set after making it "
182-
f"balanced by under-sampling: \n sampling_strategy={sampling_strategy} \n "
183-
f"y: {Counter(y_res)}"
184-
)
185-
plot_pie(y_res)
145+
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[0])
146+
axs[0].set_title("Under-sampling")
186147

187148
sampling_strategy = {0: 25, 1: 35, 2: 47}
188-
189149
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
190150
X_res, y_res = ros.fit_resample(X, y)
191-
print(
192-
f"Information of the iris data set after making it "
193-
f"balanced by over-sampling: \n sampling_strategy={sampling_strategy} \n "
194-
f"y: {Counter(y_res)}"
195-
)
196-
plot_pie(y_res)
197-
198-
###############################################################################
199-
# ``sampling_strategy`` as a ``list``
200-
# ...................................
151+
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[1])
152+
axs[1].set_title("Under-sampling")
153+
154+
# %% [markdown]
155+
# `sampling_strategy` as a `list`
156+
# -------------------------------
201157
#
202-
# When ``sampling_strategy`` is a ``list``, the list contains the targeted
158+
# When `sampling_strategy` is a `list`, the list contains the targeted
203159
# classes. It is used only for **cleaning methods** and raises an error
204160
# otherwise.
205161

162+
# %%
206163
sampling_strategy = [0, 1, 2]
207164
tl = TomekLinks(sampling_strategy=sampling_strategy)
208165
X_res, y_res = tl.fit_resample(X, y)
209-
print(
210-
f"Information of the iris data set after making it "
211-
f"balanced by cleaning sampling: \n sampling_strategy={sampling_strategy} "
212-
f"\n y: {Counter(y_res)}"
213-
)
214-
plot_pie(y_res)
215-
216-
###############################################################################
217-
# ``sampling_strategy`` as a callable
218-
# ...................................
166+
ax = y_res.value_counts().plot.pie(autopct=autopct)
167+
_ = ax.set_title("Cleaning")
168+
169+
# %% [markdown]
170+
# `sampling_strategy` as a callable
171+
# ---------------------------------
219172
#
220-
# When callable, function taking ``y`` and returns a ``dict``. The keys
173+
# When callable, it is a function taking `y` and returning a `dict`. The keys
221174
# correspond to the targeted classes. The values correspond to the desired
222175
# number of samples for each class.
223176

224177

178+
# %%
225179
def ratio_multiplier(y):
180+
from collections import Counter
181+
226182
multiplier = {1: 0.7, 2: 0.95}
227183
target_stats = Counter(y)
228184
for key, value in target_stats.items():
@@ -232,11 +188,6 @@ def ratio_multiplier(y):
232188

233189

234190
X_res, y_res = RandomUnderSampler(sampling_strategy=ratio_multiplier).fit_resample(X, y)
235-
236-
print(
237-
f"Information of the iris data set after balancing using a callable"
238-
f" mode:\n ratio={ratio_multiplier} \n y: {Counter(y_res)}"
239-
)
240-
plot_pie(y_res)
241-
191+
ax = y_res.value_counts().plot.pie(autopct=autopct)
192+
ax.set_title("Under-sampling")
242193
plt.show()

0 commit comments

Comments
 (0)