ENH accept string labels in classifier (#718)

glemaitre · web-flow · commit 1b8cd470c1b1 · 2020-06-09T01:14:28.000+02:00
diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst
@@ -26,3 +26,14 @@ Bug fixes
 - Change the default value `min_samples_leaf` to be consistent with
   scikit-learn.
   :pr:`711` by :user:`zerolfx <zerolfx>`.
+
+Enhancements
+............
+
+- The classifiers implemented in imbalanced-learn,
+  :class:`imblearn.ensemble.BalancedBaggingClassifier`,
+  :class:`imblearn.ensemble.BalancedRandomForestClassifier`,
+  :class:`imblearn.ensemble.EasyEnsembleClassifier`, and
+  :class:`imblearn.ensemble.RUSBoostClassifier`, accept `sampling_strategy`
+  with the same keys as in `y` without the need to encode `y` in advance.
+  :pr:`718` by :user:`Guillaume Lemaitre <glemaitre>`.
diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py
@@ -15,7 +15,7 @@
 from ..pipeline import Pipeline
 from ..under_sampling import RandomUnderSampler
 from ..under_sampling.base import BaseUnderSampler
-from ..utils import Substitution, check_target_type
+from ..utils import Substitution, check_target_type, check_sampling_strategy
 from ..utils._docstring import _n_jobs_docstring
 from ..utils._docstring import _random_state_docstring
 
@@ -208,6 +208,19 @@ def __init__(
         self.sampling_strategy = sampling_strategy
         self.replacement = replacement
 
+    def _validate_y(self, y):
+        """Validate `y`; re-key a dict `sampling_strategy` by class index."""
+        encoded_target = super()._validate_y(y)
+        strategy = self.sampling_strategy
+        if isinstance(strategy, dict):
+            checked = check_sampling_strategy(strategy, y, 'under-sampling')
+            strategy = {
+                np.where(self.classes_ == label)[0][0]: n_samples
+                for label, n_samples in checked.items()
+            }
+        self._sampling_strategy = strategy
+        return encoded_target
+
     def _validate_estimator(self, default=DecisionTreeClassifier()):
         """Check the estimator and the n_estimator attribute, set the
         `base_estimator_` attribute."""
@@ -233,7 +246,7 @@ def _validate_estimator(self, default=DecisionTreeClassifier()):
                 (
                     "sampler",
                     RandomUnderSampler(
-                        sampling_strategy=self.sampling_strategy,
+                        sampling_strategy=self._sampling_strategy,
                         replacement=self.replacement,
                     ),
                 ),
diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py
@@ -14,7 +14,7 @@
 
 from ..under_sampling import RandomUnderSampler
 from ..under_sampling.base import BaseUnderSampler
-from ..utils import Substitution, check_target_type
+from ..utils import Substitution, check_target_type, check_sampling_strategy
 from ..utils._docstring import _n_jobs_docstring
 from ..utils._docstring import _random_state_docstring
 from ..pipeline import Pipeline
@@ -152,6 +152,19 @@ def __init__(
         self.sampling_strategy = sampling_strategy
         self.replacement = replacement
 
+    def _validate_y(self, y):
+        """Validate `y`; re-key a dict `sampling_strategy` by class index."""
+        encoded_target = super()._validate_y(y)
+        strategy = self.sampling_strategy
+        if isinstance(strategy, dict):
+            checked = check_sampling_strategy(strategy, y, 'under-sampling')
+            strategy = {
+                np.where(self.classes_ == label)[0][0]: n_samples
+                for label, n_samples in checked.items()
+            }
+        self._sampling_strategy = strategy
+        return encoded_target
+
     def _validate_estimator(self, default=AdaBoostClassifier()):
         """Check the estimator and the n_estimator attribute, set the
         `base_estimator_` attribute."""
@@ -177,7 +190,7 @@ def _validate_estimator(self, default=AdaBoostClassifier()):
                 (
                     "sampler",
                     RandomUnderSampler(
-                        sampling_strategy=self.sampling_strategy,
+                        sampling_strategy=self._sampling_strategy,
                         replacement=self.replacement,
                     ),
                 ),
diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py
@@ -33,6 +33,7 @@
 from ..utils import Substitution
 from ..utils._docstring import _n_jobs_docstring
 from ..utils._docstring import _random_state_docstring
+from ..utils._validation import check_sampling_strategy
 
 MAX_INT = np.iinfo(np.int32).max
 
@@ -364,7 +365,7 @@ def _validate_estimator(self, default=DecisionTreeClassifier()):
             self.base_estimator_ = clone(default)
 
         self.base_sampler_ = RandomUnderSampler(
-            sampling_strategy=self.sampling_strategy,
+            sampling_strategy=self._sampling_strategy,
             replacement=self.replacement,
         )
 
@@ -447,10 +448,20 @@ def fit(self, X, y, sample_weight=None):
 
         self.n_outputs_ = y.shape[1]
 
-        y, expanded_class_weight = self._validate_y_class_weight(y)
+        y_encoded, expanded_class_weight = self._validate_y_class_weight(y)
 
         if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
-            y = np.ascontiguousarray(y, dtype=DOUBLE)
+            y_encoded = np.ascontiguousarray(y_encoded, dtype=DOUBLE)
+
+        if isinstance(self.sampling_strategy, dict):
+            self._sampling_strategy = {
+                np.where(self.classes_[0] == key)[0][0]: value
+                for key, value in check_sampling_strategy(
+                    self.sampling_strategy, y, 'under-sampling',
+                ).items()
+            }
+        else:
+            self._sampling_strategy = self.sampling_strategy
 
         if expanded_class_weight is not None:
             if sample_weight is not None:
@@ -523,7 +534,7 @@ def fit(self, X, y, sample_weight=None):
                     t,
                     self,
                     X,
-                    y,
+                    y_encoded,
                     sample_weight,
                     i,
                     len(trees),
@@ -548,7 +559,7 @@ def fit(self, X, y, sample_weight=None):
             )
 
         if self.oob_score:
-            self._set_oob_score(X, y)
+            self._set_oob_score(X, y_encoded)
 
         # Decapsulate classes_ attributes
         if hasattr(self, "classes_") and self.n_outputs_ == 1:
diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
@@ -28,7 +28,7 @@
 
 
 class ArraysTransformer:
-    """A class to convert sampler ouput arrays to their orinal types."""
+    """A class to convert sampler output arrays to their original types."""
 
     def __init__(self, X, y):
         self.x_props = self._gets_props(X)
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
@@ -18,6 +18,7 @@
 
 from sklearn.base import clone
 from sklearn.datasets import (
+    fetch_openml,
     make_classification,
     make_multilabel_classification,
 )  # noqa
@@ -30,6 +31,7 @@
 from sklearn.utils._testing import assert_raises_regex
 from sklearn.utils.multiclass import type_of_target
 
+from imblearn.datasets import make_imbalance
 from imblearn.over_sampling.base import BaseOverSampler
 from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler
 
@@ -65,6 +67,7 @@ def _yield_sampler_checks(sampler):
 
 def _yield_classifier_checks(classifier):
     yield check_classifier_on_multilabel_or_multioutput_targets
+    yield check_classifiers_with_encoded_labels  # string-label keys (#709)
 
 
 def _yield_all_checks(estimator):
@@ -376,3 +379,24 @@ def check_classifier_on_multilabel_or_multioutput_targets(name, estimator):
     msg = "Multilabel and multioutput targets are not supported."
     with pytest.raises(ValueError, match=msg):
         estimator.fit(X, y)
+
+
+def check_classifiers_with_encoded_labels(name, classifier):
+    """Check that string class labels work as `sampling_strategy` keys.
+
+    Non-regression test for #709:
+    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/709
+    """
+    pytest.importorskip("pandas")
+    df, y = fetch_openml("iris", version=1, as_frame=True, return_X_y=True)
+    df, y = make_imbalance(df, y, sampling_strategy={
+        "Iris-setosa": 30, "Iris-versicolor": 20, "Iris-virginica": 50,
+    })
+    # Clone so that `set_params` does not mutate the caller's estimator.
+    classifier = clone(classifier).set_params(
+        sampling_strategy={"Iris-setosa": 20, "Iris-virginica": 20}
+    )
+    classifier.fit(df, y)
+    expected_classes = set(y.cat.categories.tolist())
+    assert set(classifier.classes_) == expected_classes
+    assert set(classifier.predict(df)) == expected_classes