Commit 71b1e87

DOC improve style of examples bis (#814)
1 parent 2150e67 commit 71b1e87

14 files changed: +957 -727 lines

examples/applications/plot_over_sampling_benchmark_lfw.py

Lines changed: 114 additions & 81 deletions
@@ -8,111 +8,144 @@
 methods are used in conjunction with a 3NN classifier in order
 to examine the improvement of the classifier's output quality
 by using an over-sampler.
-
 """
 
 # Authors: Christos Aridas
 #          Guillaume Lemaitre <[email protected]>
 # License: MIT
 
-import matplotlib.pyplot as plt
-import numpy as np
-from scipy import interp
-from sklearn import datasets, neighbors
-from sklearn.metrics import auc, roc_curve
-from sklearn.model_selection import StratifiedKFold
-
-from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
-from imblearn.pipeline import make_pipeline
-
+# %%
 print(__doc__)
 
-LW = 2
-RANDOM_STATE = 42
+import seaborn as sns
 
+sns.set_context("poster")
 
-class DummySampler:
-    def sample(self, X, y):
-        return X, y
+# %% [markdown]
+# Load the dataset
+# ----------------
+#
+# We will use a dataset containing images of known persons and we will
+# build a model to recognize the person in each image. We make this problem
+# binary by keeping only the pictures of George W. Bush and Bill Clinton.
 
-    def fit(self, X, y):
-        return self
+# %%
+import numpy as np
+from sklearn.datasets import fetch_lfw_people
+
+data = fetch_lfw_people()
+george_bush_id = 1871  # Photos of George W. Bush
+bill_clinton_id = 531  # Photos of Bill Clinton
+classes = [george_bush_id, bill_clinton_id]
+classes_name = np.array(["B. Clinton", "G.W. Bush"], dtype=np.object)
+
+# %%
+mask_photos = np.isin(data.target, classes)
+X, y = data.data[mask_photos], data.target[mask_photos]
+y = (y == george_bush_id).astype(np.int8)
+y = classes_name[y]
+
+# %% [markdown]
+# We can check the ratio between the two classes.
+
+# %%
+import pandas as pd
+
+class_distribution = pd.Series(y).value_counts(normalize=True)
+ax = class_distribution.plot.barh()
+ax.set_title("Class distribution")
+pos_label = class_distribution.idxmin()
+print(f"The positive label considered as the minority class is {pos_label}")
+
+# %% [markdown]
+# We see that we have an imbalanced classification problem with ~95% of the
+# data belonging to the class G.W. Bush.
+#
+# Compare over-sampling approaches
+# --------------------------------
+#
+# We will use different over-sampling approaches together with a kNN classifier
+# to check whether we can recognize the 2 presidents. The evaluation will be
+# performed through cross-validation and we will plot the mean ROC curve.
+#
+# We will create different pipelines and evaluate them.
+
+# %%
+from imblearn import FunctionSampler
+from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE
+from imblearn.pipeline import make_pipeline
+from sklearn.neighbors import KNeighborsClassifier
 
-    def fit_resample(self, X, y):
-        return self.sample(X, y)
+classifier = KNeighborsClassifier(n_neighbors=3)
 
+pipeline = [
+    make_pipeline(FunctionSampler(), classifier),
+    make_pipeline(RandomOverSampler(random_state=42), classifier),
+    make_pipeline(ADASYN(random_state=42), classifier),
+    make_pipeline(SMOTE(random_state=42), classifier),
+]
 
-cv = StratifiedKFold(n_splits=3)
+# %%
+from sklearn.model_selection import StratifiedKFold
 
-# Load the dataset
-data = datasets.fetch_lfw_people()
-majority_person = 1871  # 530 photos of George W Bush
-minority_person = 531  # 29 photos of Bill Clinton
-majority_idxs = np.flatnonzero(data.target == majority_person)
-minority_idxs = np.flatnonzero(data.target == minority_person)
-idxs = np.hstack((majority_idxs, minority_idxs))
-
-X = data.data[idxs]
-y = data.target[idxs]
-y[y == majority_person] = 0
-y[y == minority_person] = 1
-
-classifier = ["3NN", neighbors.KNeighborsClassifier(3)]
-
-samplers = [
-    ["Standard", DummySampler()],
-    ["ADASYN", ADASYN(random_state=RANDOM_STATE)],
-    ["ROS", RandomOverSampler(random_state=RANDOM_STATE)],
-    ["SMOTE", SMOTE(random_state=RANDOM_STATE)],
-]
+cv = StratifiedKFold(n_splits=3)
 
-pipelines = [
-    [
-        f"{sampler[0]}-{classifier[0]}",
-        make_pipeline(sampler[1], classifier[1]),
-    ]
-    for sampler in samplers
-]
+# %% [markdown]
+# We will compute the mean ROC curve for each pipeline using the different splits
+# provided by the :class:`~sklearn.model_selection.StratifiedKFold`
+# cross-validation.
 
-fig = plt.figure()
-ax = fig.add_subplot(1, 1, 1)
+# %%
+import matplotlib.pyplot as plt
+from sklearn.metrics import RocCurveDisplay, roc_curve, auc
 
-for name, pipeline in pipelines:
-    mean_tpr = 0.0
-    mean_fpr = np.linspace(0, 1, 100)
+disp = []
+for model in pipeline:
+    # compute the mean fpr/tpr to get the mean ROC curve
+    mean_tpr, mean_fpr = 0.0, np.linspace(0, 1, 100)
     for train, test in cv.split(X, y):
-        probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
-        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
-        mean_tpr += interp(mean_fpr, fpr, tpr)
+        model.fit(X[train], y[train])
+        y_proba = model.predict_proba(X[test])
+
+        pos_label_idx = np.flatnonzero(model.classes_ == pos_label)[0]
+        fpr, tpr, thresholds = roc_curve(
+            y[test], y_proba[:, pos_label_idx], pos_label=pos_label
+        )
+        mean_tpr += np.interp(mean_fpr, fpr, tpr)
         mean_tpr[0] = 0.0
-        roc_auc = auc(fpr, tpr)
 
     mean_tpr /= cv.get_n_splits(X, y)
     mean_tpr[-1] = 1.0
     mean_auc = auc(mean_fpr, mean_tpr)
-    plt.plot(
-        mean_fpr,
-        mean_tpr,
-        linestyle="--",
-        label=f"{name} (area = {mean_auc:.2f})",
-        lw=LW,
-    )
-
-plt.plot([0, 1], [0, 1], linestyle="--", lw=LW, color="k", label="Luck")
-
-# make nice plotting
-ax.spines["top"].set_visible(False)
-ax.spines["right"].set_visible(False)
-ax.get_xaxis().tick_bottom()
-ax.get_yaxis().tick_left()
-ax.spines["left"].set_position(("outward", 10))
-ax.spines["bottom"].set_position(("outward", 10))
-plt.xlim([0, 1])
-plt.ylim([0, 1])
-plt.xlabel("False Positive Rate")
-plt.ylabel("True Positive Rate")
-plt.title("Receiver operating characteristic example")
 
-plt.legend(loc="lower right")
+    # Create a display that we will reuse to make the aggregated plots for
+    # all methods
+    disp.append(
+        RocCurveDisplay(
+            fpr=mean_fpr,
+            tpr=mean_tpr,
+            roc_auc=mean_auc,
+            estimator_name=f"{model[0].__class__.__name__}",
+        )
+    )
 
+# %% [markdown]
+# In the previous cell, we created the different mean ROC curves and we can plot
+# them on the same figure.
+
+# %%
+fig, ax = plt.subplots(figsize=(9, 9))
+for d in disp:
+    d.plot(ax=ax, linestyle="--")
+ax.plot([0, 1], [0, 1], linestyle="--", color="k")
+ax.axis("square")
+fig.suptitle("Comparison of over-sampling methods with a 3NN classifier")
+ax.set_xlim([0, 1])
+ax.set_ylim([0, 1])
+sns.despine(offset=10, ax=ax)
 plt.show()
+
+# %% [markdown]
+# We see that for this task, methods that generate new samples with some
+# interpolation (i.e. ADASYN and SMOTE) perform better than random
+# over-sampling or no resampling.
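
Note: the rewritten example evaluates each resampler by averaging per-fold ROC curves on a common FPR grid and wrapping the result in a RocCurveDisplay, with FunctionSampler() acting as the pass-through "no resampling" baseline that replaces the old DummySampler class. The snippet below is a minimal, self-contained sketch of that same pattern; the make_classification data, the 95/5 class weights and the reduced list of two pipelines are illustrative assumptions, not part of the example itself.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import RocCurveDisplay, auc, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

from imblearn import FunctionSampler  # with no arguments: identity resampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# Illustrative imbalanced data (~95% negatives), standing in for the LFW photos.
X, y = make_classification(
    n_samples=1000, weights=[0.95, 0.05], class_sep=0.8, random_state=0
)
classifier = KNeighborsClassifier(n_neighbors=3)
pipelines = [
    make_pipeline(FunctionSampler(), classifier),      # "no resampling" baseline
    make_pipeline(SMOTE(random_state=0), classifier),  # over-sampling by interpolation
]
cv = StratifiedKFold(n_splits=3)

displays = []
for model in pipelines:
    mean_tpr, mean_fpr = 0.0, np.linspace(0, 1, 100)
    for train, test in cv.split(X, y):
        model.fit(X[train], y[train])
        y_proba = model.predict_proba(X[test])
        pos_label_idx = np.flatnonzero(model.classes_ == 1)[0]
        fpr, tpr, _ = roc_curve(y[test], y_proba[:, pos_label_idx], pos_label=1)
        # interpolate each fold's curve on the common FPR grid before averaging
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    displays.append(
        RocCurveDisplay(
            fpr=mean_fpr,
            tpr=mean_tpr,
            roc_auc=auc(mean_fpr, mean_tpr),
            estimator_name=model[0].__class__.__name__,
        )
    )

Each stored display can then be drawn on a shared axis with its plot(ax=ax) method, which is how the example overlays the mean curves of all the resamplers in a single figure.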

examples/applications/plot_topic_classication.py

Lines changed: 41 additions & 35 deletions
@@ -7,36 +7,27 @@
 
 Note that for this example, the data are slightly imbalanced but it can happen
 that for some data sets, the imbalance ratio is more significant.
-
 """
 
 # Authors: Guillaume Lemaitre <[email protected]>
 # License: MIT
 
-from collections import Counter
-
-from sklearn.datasets import fetch_20newsgroups
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.pipeline import make_pipeline
-
-from imblearn.under_sampling import RandomUnderSampler
-from imblearn.pipeline import make_pipeline as make_pipeline_imb
-from imblearn.metrics import classification_report_imbalanced
-
+# %%
 print(__doc__)
 
-###############################################################################
+# %% [markdown]
 # Setting the data set
-###############################################################################
-
-###############################################################################
+# --------------------
+#
 # We use a part of the 20 newsgroups data set by loading 4 topics. Using the
 # scikit-learn loader, the data are split into a training and a testing set.
 #
 # Note that class \#3 is the minority class and has about half as many samples
 # as the majority class.
 
+# %%
+from sklearn.datasets import fetch_20newsgroups
+
 categories = [
     "alt.atheism",
     "talk.religion.misc",
@@ -52,49 +43,64 @@
 y_train = newsgroups_train.target
 y_test = newsgroups_test.target
 
+# %%
+from collections import Counter
+
 print(f"Training class distributions summary: {Counter(y_train)}")
 print(f"Test class distributions summary: {Counter(y_test)}")
 
-###############################################################################
+# %% [markdown]
 # The usual scikit-learn pipeline
-###############################################################################
-
-###############################################################################
+# -------------------------------
+#
 # You might usually use a scikit-learn pipeline combining the TF-IDF
 # vectorizer with a multinomial naive Bayes classifier. A classification
 # report summarizes the results on the testing set.
 #
 # As expected, the recall of class \#3 is low, mainly due to the class
 # imbalance.
 
-pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())
-pipe.fit(X_train, y_train)
-y_pred = pipe.predict(X_test)
+# %%
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import make_pipeline
+
+model = make_pipeline(TfidfVectorizer(), MultinomialNB())
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+
+# %%
+from imblearn.metrics import classification_report_imbalanced
 
 print(classification_report_imbalanced(y_test, y_pred))
 
-###############################################################################
+# %% [markdown]
 # Balancing the class before classification
-###############################################################################
-
-###############################################################################
+# -----------------------------------------
+#
 # To improve the prediction of class \#3, it could be interesting to apply
 # balancing before training the naive Bayes classifier. Therefore, we will
-# use a ``RandomUnderSampler`` to equalize the number of samples in all the
-# classes before the training.
+# use a :class:`~imblearn.under_sampling.RandomUnderSampler` to equalize the
+# number of samples in all the classes before the training.
 #
-# It is also important to note that we are using the ``make_pipeline`` function
-# implemented in imbalanced-learn to properly handle the samplers.
+# It is also important to note that we are using the
+# :class:`~imblearn.pipeline.make_pipeline` function implemented in
+# imbalanced-learn to properly handle the samplers.
+
+# %%
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.pipeline import make_pipeline as make_pipeline_imb
 
-pipe = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())
+model = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())
 
-pipe.fit(X_train, y_train)
-y_pred = pipe.predict(X_test)
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
 
-###############################################################################
+# %% [markdown]
 # Although the results are almost identical, it can be seen that the resampling
 # corrected the poor recall of class \#3 at the cost of reducing
 # the metrics for the other classes. However, the overall results are
 # slightly better.
 
+# %%
 print(classification_report_imbalanced(y_test, y_pred))
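
Note: the key point in this second example is that the under-sampler lives inside an imbalanced-learn pipeline, so resampling happens only at fit time and the test set is left untouched. Below is a minimal sketch of that pattern, using a tiny made-up corpus instead of the 20 newsgroups download; the documents, labels and the evaluation on the training data are purely illustrative assumptions to keep the snippet self-contained.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import RandomUnderSampler

# Toy corpus: class 1 is the minority class.
docs = [
    "the rocket reached orbit",
    "satellites relay the signal",
    "the telescope imaged a galaxy",
    "the launch was postponed",
    "a sermon about charity",
    "a reading from scripture",
]
labels = [0, 0, 0, 0, 1, 1]

# The sampler sits between the vectorizer and the classifier, so the random
# under-sampling is applied to the TF-IDF matrix during fit only.
model = make_pipeline_imb(
    TfidfVectorizer(), RandomUnderSampler(random_state=0), MultinomialNB()
)
model.fit(docs, labels)

# Reported on the training documents only to exercise the API; the example
# itself evaluates on the held-out 20 newsgroups test split.
print(classification_report_imbalanced(labels, model.predict(docs)))

Swapping in scikit-learn's own make_pipeline would fail here, since samplers implement fit_resample rather than transform; that is exactly why the example imports the imbalanced-learn variant under the make_pipeline_imb alias.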
