Improve testing of instance hardness threshold

Guillaume Lemaitre · Guillaume Lemaitre · commit e6e9a2894786 · 2016-06-24T19:13:50.000+02:00
diff --git a/unbalanced_dataset/ensemble/balance_cascade.py b/unbalanced_dataset/ensemble/balance_cascade.py
@@ -135,7 +135,12 @@ def __init__(self, ratio='auto', return_indices=False, random_state=None,
                                              verbose=verbose,
                                              random_state=random_state)
         # Define the classifier to use
-        self.classifier = classifier
+        list_classifier = ('knn', 'decision-tree', 'random-forest', 'adaboost',
+                          'gradient-boosting', 'linear-svm')
+        if classifier in list_classifier:
+            self.classifier = classifier
+        else:
+            raise NotImplementedError
         self.n_max_subset = n_max_subset
         self.bootstrap = bootstrap
         self.kwargs = kwargs
@@ -223,8 +228,7 @@ def sample(self, X, y):
             classifier = LinearSVC(random_state=self.random_state,
                                    **self.kwargs)
         else:
-            raise RuntimeError('UnbalancedData.BalanceCascade: classifier '
-                               'not yet supported.')
+            raise NotImplementedError
 
         X_resampled = []
         y_resampled = []
diff --git a/unbalanced_dataset/ensemble/tests/test_balance_cascade.py b/unbalanced_dataset/ensemble/tests/test_balance_cascade.py
@@ -302,11 +302,9 @@ def test_init_wrong_classifier():
     ratio = 'auto'
     classifier = 'rnd'
 
-    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
-                        return_indices=True, classifier=classifier)
-
-    # Create the sampling object
-    assert_raises(RuntimeError, bc.fit_sample, X, Y)
+    assert_raises(NotImplementedError, BalanceCascade, ratio=ratio,
+                  random_state=RND_SEED, return_indices=True,
+                  classifier=classifier)
 
 
 def test_fit_sample_auto_early_stop():
diff --git a/unbalanced_dataset/under_sampling/instance_hardness_threshold.py b/unbalanced_dataset/under_sampling/instance_hardness_threshold.py
@@ -132,13 +132,13 @@ def __init__(self, estimator='linear-svm', ratio='auto',
             random_state=random_state,
             verbose=verbose)
 
-        # if not hasattr(estimator, 'predict_proba'):
-        #     raise ValueError('Estimator does not have predict_proba method.')
-        # else:
-        #     self.estimator = estimator
-
         # Define the estimator to use
-        self.estimator = estimator
+        list_estimator = ('knn', 'decision-tree', 'random-forest', 'adaboost',
+                          'gradient-boosting', 'linear-svm')
+        if estimator in list_estimator:
+            self.estimator = estimator
+        else:
+            raise NotImplementedError
         self.kwargs = kwargs
         self.cv = cv
         self.n_jobs = n_jobs
@@ -200,7 +200,6 @@ def sample(self, X, y):
         if self.estimator == 'knn':
             from sklearn.neighbors import KNeighborsClassifier
             estimator = KNeighborsClassifier(
-                random_state=self.random_state,
                 **self.kwargs)
         elif self.estimator == 'decision-tree':
             from sklearn.tree import DecisionTreeClassifier
@@ -227,8 +226,7 @@ def sample(self, X, y):
             estimator = SVC(probability=True,
                             random_state=self.random_state, **self.kwargs)
         else:
-            raise ValueError('UnbalancedData.BalanceCascade: classifier '
-                             'not yet supported.')
+            raise NotImplementedError
 
         # Create the different folds
         skf = StratifiedKFold(y, n_folds=self.cv, shuffle=False,
diff --git a/unbalanced_dataset/under_sampling/tests/test_instance_hardness_threshold.py b/unbalanced_dataset/under_sampling/tests/test_instance_hardness_threshold.py
@@ -54,15 +54,14 @@ def test_iht_bad_ratio():
                   ratio=ratio)
 
 
-# def test_iht_estimator_no_proba():
-#     """Test either if an error is raised when the estimator does not have
-#     predict_proba function"""
+def test_iht_wrong_estimator():
+    """Test that an error is raised when the estimator is unknown"""
 
-#     # Resample the data
-#     ratio = 0.5
-#     est = 'linear-svm'
-#     assert_raises(ValueError, InstanceHardnessThreshold, est, ratio=ratio,
-#                   random_state=RND_SEED)
+    # Resample the data
+    ratio = 0.5
+    est = 'rnd'
+    assert_raises(NotImplementedError, InstanceHardnessThreshold, est,
+                  ratio=ratio, random_state=RND_SEED)
 
 def test_iht_init():
     """Test the initialisation of the object"""
@@ -174,3 +173,94 @@ def test_iht_fit_sample_half():
     y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_05.npy'))
     assert_array_equal(X_resampled, X_gt)
     assert_array_equal(y_resampled, y_gt)
+
+
+def test_iht_fit_sample_knn():
+    """Test the fit sample routine with knn"""
+
+    # Resample the data
+    est = 'knn'
+    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
+    X_resampled, y_resampled = iht.fit_sample(X, Y)
+
+    currdir = os.path.dirname(os.path.abspath(__file__))
+    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_knn.npy'))
+    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_knn.npy'))
+    assert_array_equal(X_resampled, X_gt)
+    assert_array_equal(y_resampled, y_gt)
+
+
+def test_iht_fit_sample_decision_tree():
+    """Test the fit sample routine with decision-tree"""
+
+    # Resample the data
+    est = 'decision-tree'
+    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
+    X_resampled, y_resampled = iht.fit_sample(X, Y)
+
+    currdir = os.path.dirname(os.path.abspath(__file__))
+    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_dt.npy'))
+    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_dt.npy'))
+    assert_array_equal(X_resampled, X_gt)
+    assert_array_equal(y_resampled, y_gt)
+
+
+def test_iht_fit_sample_random_forest():
+    """Test the fit sample routine with random forest"""
+
+    # Resample the data
+    est = 'random-forest'
+    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
+    X_resampled, y_resampled = iht.fit_sample(X, Y)
+
+    currdir = os.path.dirname(os.path.abspath(__file__))
+    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_rf.npy'))
+    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_rf.npy'))
+    assert_array_equal(X_resampled, X_gt)
+    assert_array_equal(y_resampled, y_gt)
+
+
+def test_iht_fit_sample_adaboost():
+    """Test the fit sample routine with adaboost"""
+
+    # Resample the data
+    est = 'adaboost'
+    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
+    X_resampled, y_resampled = iht.fit_sample(X, Y)
+
+    currdir = os.path.dirname(os.path.abspath(__file__))
+    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_adb.npy'))
+    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_adb.npy'))
+    assert_array_equal(X_resampled, X_gt)
+    assert_array_equal(y_resampled, y_gt)
+
+
+
+def test_iht_fit_sample_gradient_boosting():
+    """Test the fit sample routine with gradient boosting"""
+
+    # Resample the data
+    est = 'gradient-boosting'
+    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
+    X_resampled, y_resampled = iht.fit_sample(X, Y)
+
+    currdir = os.path.dirname(os.path.abspath(__file__))
+    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_gb.npy'))
+    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_gb.npy'))
+    assert_array_equal(X_resampled, X_gt)
+    assert_array_equal(y_resampled, y_gt)
+
+
+def test_iht_fit_sample_linear_svm():
+    """Test the fit sample routine with linear SVM"""
+
+    # Resample the data
+    est = 'linear-svm'
+    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
+    X_resampled, y_resampled = iht.fit_sample(X, Y)
+
+    currdir = os.path.dirname(os.path.abspath(__file__))
+    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_svm.npy'))
+    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_svm.npy'))
+    assert_array_equal(X_resampled, X_gt)
+    assert_array_equal(y_resampled, y_gt)