[MRG] Benchmark over-sampling methods in a face regognition task (#198)

chkoar · glemaitre · commit 25ee4fa473c5 · 2016-12-18T17:10:18.000+01:00
* Benchmark over-sampling methods using a 3NN classifier

* fulfil the review
diff --git a/examples/over-sampling/plot_over_sampling_benchmark_3nn.py b/examples/over-sampling/plot_over_sampling_benchmark_3nn.py
@@ -0,0 +1,97 @@
+"""
+=============================================================
+Benchmark over-sampling methods in a face regognition task
+=============================================================
+In this face recognition example two faces are used from the LFW 
+(Faces in the Wild) dataset. Several implemented over-sampling 
+methods are used in conjunction with a 3NN classifier in order 
+to examine the improvement of the classifier's output quality 
+by using an over-sampler.
+"""
+print(__doc__)
+
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy import interp
+from sklearn import datasets, neighbors
+from sklearn.metrics import auc, roc_curve
+from sklearn.model_selection import StratifiedKFold
+
+from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
+from imblearn.pipeline import make_pipeline
+
+LW = 2
+RANDOM_STATE = 42
+
+
+class DummySampler(object):
+
+    def sample(self, X, y):
+        return X, y
+
+    def fit(self, X, y):
+        return self
+
+    def fit_sample(self, X, y):
+        return self.sample(X, y)
+
+
+cv = StratifiedKFold(n_splits=3)
+
+# Load the dataset
+data = datasets.fetch_lfw_people()
+majority_person = 1871  # 530 photos of George W Bush
+minority_person = 531  # 29 photos of Bill Clinton
+majority_idxs = np.flatnonzero(data.target == majority_person)
+minority_idxs = np.flatnonzero(data.target == minority_person)
+idxs = np.hstack((majority_idxs, minority_idxs))
+
+X = data.data[idxs]
+y = data.target[idxs]
+y[y == majority_person] = 0
+y[y == minority_person] = 1
+
+
+classifier = ['3NN', neighbors.KNeighborsClassifier(3)]
+
+samplers = [
+    ['Standard', DummySampler()],
+    ['ADASYN', ADASYN(random_state=RANDOM_STATE)],
+    ['ROS', RandomOverSampler(random_state=RANDOM_STATE)],
+    ['SMOTE', SMOTE(random_state=RANDOM_STATE)],
+]
+
+pipelines = [
+    ['{}-{}'.format(sampler[0], classifier[0]),
+     make_pipeline(sampler[1], classifier[1])]
+    for sampler in samplers
+]
+
+
+for name, pipeline in pipelines:
+    mean_tpr = 0.0
+    mean_fpr = np.linspace(0, 1, 100)
+    for train, test in cv.split(X, y):
+        probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
+        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
+        mean_tpr += interp(mean_fpr, fpr, tpr)
+        mean_tpr[0] = 0.0
+        roc_auc = auc(fpr, tpr)
+
+    mean_tpr /= cv.get_n_splits(X, y)
+    mean_tpr[-1] = 1.0
+    mean_auc = auc(mean_fpr, mean_tpr)
+    plt.plot(mean_fpr, mean_tpr, linestyle='--',
+             label='{} (area = %0.2f)'.format(name) % mean_auc, lw=LW)
+
+    plt.xlim([-0.05, 1.05])
+    plt.ylim([-0.05, 1.05])
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title('Receiver operating characteristic example')
+    plt.legend(loc="lower right")
+
+plt.plot([0, 1], [0, 1], linestyle='--', lw=LW, color='k',
+         label='Luck')
+
+plt.show()