methods are used in conjunction with a 3NN classifier in order
to examine the improvement of the classifier's output quality
by using an over-sampler.
"""

# Authors: Christos Aridas
#          Guillaume Lemaitre <[email protected]>
# License: MIT

# %%
print(__doc__)

import seaborn as sns

sns.set_context("poster")

# %% [markdown]
# Load the dataset
# ----------------
#
# We will use a dataset containing images of known people, and we will build a
# model to recognize the person shown in each image. We will make this a binary
# problem by keeping only the pictures of George W. Bush and Bill Clinton.

# %%
import numpy as np
from sklearn.datasets import fetch_lfw_people

data = fetch_lfw_people()
george_bush_id = 1871  # Photos of George W. Bush
bill_clinton_id = 531  # Photos of Bill Clinton
classes = [george_bush_id, bill_clinton_id]
classes_name = np.array(["B. Clinton", "G.W. Bush"], dtype=object)

# %%
mask_photos = np.isin(data.target, classes)
X, y = data.data[mask_photos], data.target[mask_photos]
y = (y == george_bush_id).astype(np.int8)
y = classes_name[y]

# %% [markdown]
# We can check the ratio between the two classes.

# %%
import pandas as pd

class_distribution = pd.Series(y).value_counts(normalize=True)
ax = class_distribution.plot.barh()
ax.set_title("Class distribution")
pos_label = class_distribution.idxmin()
print(f"The positive label, treated as the minority class, is {pos_label}")

# %% [markdown]
# We see that we have an imbalanced classification problem with ~95% of the
# data belonging to the class G.W. Bush.
#
# Compare over-sampling approaches
# --------------------------------
#
# We will try several over-sampling approaches, each paired with a 3NN
# classifier, to check whether we can recognize the two presidents. The
# evaluation is performed through cross-validation, and we plot the mean ROC
# curve of each method.
#
# We first create the different pipelines to evaluate.

# %%
from imblearn import FunctionSampler
from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=3)

pipelines = [
    make_pipeline(FunctionSampler(), classifier),
    make_pipeline(RandomOverSampler(random_state=42), classifier),
    make_pipeline(ADASYN(random_state=42), classifier),
    make_pipeline(SMOTE(random_state=42), classifier),
]
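
# %% [markdown]
# As a quick sanity check (an addition to this walkthrough, not part of the
# original example), we can verify that `FunctionSampler()` with no function is
# an identity pass-through, which is why it serves as the "no resampling"
# baseline, while `SMOTE` balances the class counts with synthetic samples.

# %%
from collections import Counter

# FunctionSampler without a function returns the data unchanged.
X_id, y_id = FunctionSampler().fit_resample(X, y)
print(f"Identity baseline class counts: {Counter(y_id)}")

# SMOTE interpolates new minority samples until the classes are balanced.
X_sm, y_sm = SMOTE(random_state=42).fit_resample(X, y)
print(f"After SMOTE, class counts: {Counter(y_sm)}")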

# %%
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=3)
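
# %% [markdown]
# As an illustrative aside (not part of the original example), we can confirm
# that the stratified splits preserve the ~95%/5% class ratio in every fold;
# a plain `KFold` could leave a test fold with almost no Clinton pictures.

# %%
for fold_idx, (train, test) in enumerate(cv.split(X, y)):
    # Fraction of the minority class within each test fold.
    ratio = np.mean(y[test] == pos_label)
    print(f"Fold {fold_idx}: minority class ratio in the test fold = {ratio:.3f}")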

# %% [markdown]
# We will compute the mean ROC curve of each pipeline across the different
# splits provided by the :class:`~sklearn.model_selection.StratifiedKFold`
# cross-validation.

# %%
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay, auc, roc_curve

disp = []
for model in pipelines:
    # Accumulate the TPR on a fixed FPR grid so that the per-fold ROC curves
    # can be averaged into a single mean ROC curve.
    mean_tpr, mean_fpr = 0.0, np.linspace(0, 1, 100)
    for train, test in cv.split(X, y):
        model.fit(X[train], y[train])
        y_proba = model.predict_proba(X[test])

        pos_label_idx = np.flatnonzero(model.classes_ == pos_label)[0]
        fpr, tpr, thresholds = roc_curve(
            y[test], y_proba[:, pos_label_idx], pos_label=pos_label
        )
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0

    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    # Store one display per method; we will reuse them to draw all the mean
    # ROC curves on a single set of axes.
    disp.append(
        RocCurveDisplay(
            fpr=mean_fpr,
            tpr=mean_tpr,
            roc_auc=mean_auc,
            estimator_name=model[0].__class__.__name__,
        )
    )
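
# %% [markdown]
# A note on the design choice above (this remark is an addition to the original
# example): we build each `RocCurveDisplay` from pre-computed arrays because
# the curve is averaged over folds. For a single train/test split, recent
# scikit-learn versions (>= 1.0) offer the more direct
# `RocCurveDisplay.from_estimator`; a minimal sketch:

# %%
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
RocCurveDisplay.from_estimator(
    pipelines[0].fit(X_train, y_train), X_test, y_test, pos_label=pos_label
)
plt.show()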

# %% [markdown]
# We can now plot all the mean ROC curves computed above on the same axes.

# %%
fig, ax = plt.subplots(figsize=(9, 9))
for d in disp:
    d.plot(ax=ax, linestyle="--")
ax.plot([0, 1], [0, 1], linestyle="--", color="k")
ax.axis("square")
fig.suptitle("Comparison of over-sampling methods with a 3NN classifier")
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
sns.despine(offset=10, ax=ax)
plt.show()

# %% [markdown]
# We see that, for this task, the methods that generate new samples by
# interpolation (i.e. ADASYN and SMOTE) perform better than random
# over-sampling or no resampling.
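
# %% [markdown]
# As a complementary check (an addition to this walkthrough, not a result from
# the original example), the same pipelines can be compared with a single
# threshold-based metric such as balanced accuracy:

# %%
from sklearn.model_selection import cross_val_score

for model in pipelines:
    scores = cross_val_score(model, X, y, cv=cv, scoring="balanced_accuracy")
    sampler_name = model[0].__class__.__name__
    print(
        f"{sampler_name}: mean balanced accuracy = "
        f"{scores.mean():.3f} +/- {scores.std():.3f}"
    )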