iter

glemaitre · glemaitre · commit 1629b06bb32f · 2024-11-12T17:35:12.000+01:00
diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py
@@ -346,12 +346,19 @@ def base_estimator_(self):
 
     def _get_estimator(self):
         if self.estimator is None:
-            return AdaBoostClassifier(algorithm="SAMME")
+            if parse_version("1.4") <= sklearn_version < parse_version("1.6"):
+                return AdaBoostClassifier(algorithm="SAMME")
+            else:
+                return AdaBoostClassifier()
         return self.estimator
 
     # TODO: remove when minimum supported version of scikit-learn is 1.5
     @available_if(check_version_package("sklearn", "<", "1.6"))
     def _more_tags(self):
-        # This code should not be called for scikit-learn >= 1.6
-        # Therefore, get_tags corresponds to _safe_tags that returns a dict
-        return {"allow_nan": get_tags(self._get_estimator(), "allow_nan")}
+        return {"allow_nan": get_tags(self._get_estimator())["allow_nan"]}
+
+    @available_if(check_version_package("sklearn", ">=", "1.6"))
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = get_tags(self._get_estimator()).input_tags.allow_nan
+        return tags
diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py
@@ -5,6 +5,7 @@
 
 import numbers
 from copy import deepcopy
+from dataclasses import is_dataclass
 from warnings import warn
 
 import numpy as np
@@ -36,7 +37,7 @@
 from ..utils._docstring import _n_jobs_docstring, _random_state_docstring
 from ..utils._param_validation import Hidden, Interval, StrOptions
 from ..utils._validation import check_sampling_strategy
-from ..utils.fixes import _fit_context, check_version_package, validate_data
+from ..utils.fixes import _fit_context, check_version_package, get_tags, validate_data
 from ._common import _random_forest_classifier_parameter_constraints
 
 MAX_INT = np.iinfo(np.int32).max
@@ -78,7 +79,7 @@ def _local_parallel_build_trees(
         "bootstrap": bootstrap,
     }
 
-    if parse_version(sklearn_version.base_version) >= parse_version("1.4"):
+    if sklearn_version >= parse_version("1.4"):
         # TODO: remove when the minimum supported version of scikit-learn will be 1.4
         # support for missing values
         params_parallel_build_trees["missing_values_in_feature_mask"] = (
@@ -475,7 +476,7 @@ def __init__(
             "max_samples": max_samples,
         }
         # TODO: remove when the minimum supported version of scikit-learn will be 1.4
-        if parse_version(sklearn_version.base_version) >= parse_version("1.4"):
+        if sklearn_version >= parse_version("1.4"):
             # use scikit-learn support for monotonic constraints
             params_random_forest["monotonic_cst"] = monotonic_cst
         else:
@@ -595,12 +596,12 @@ def fit(self, X, y, sample_weight=None):
         if issparse(y):
             raise ValueError("sparse multilabel-indicator for y is not supported.")
 
-        # TODO: remove when the minimum supported version of scipy will be 1.4
-        # Support for missing values
-        if parse_version(sklearn_version.base_version) >= parse_version("1.4"):
-            ensure_all_finite = False
+        # TODO (1.6): simplify because we will only have dataclass tags
+        tags = get_tags(self)
+        if is_dataclass(tags):
+            ensure_all_finite = not tags.input_tags.allow_nan
         else:
-            ensure_all_finite = False
+            ensure_all_finite = not tags.get("allow_nan", False)
 
         X, y = validate_data(
             self,
@@ -884,4 +885,13 @@ def _compute_oob_predictions(self, X, y):
 
     @available_if(check_version_package("sklearn", "<", "1.6"))
     def _more_tags(self):
-        return {"multioutput": False, "multilabel": False}
+        allow_nan = sklearn_version >= parse_version("1.4")
+        return {"multioutput": False, "multilabel": False, "allow_nan": allow_nan}
+
+    @available_if(check_version_package("sklearn", ">=", "1.6"))
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.target_tags.multi_output = False
+        tags.classifier_tags.multi_label = False
+        tags.input_tags.allow_nan = sklearn_version >= parse_version("1.4")
+        return tags
diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py
@@ -10,6 +10,7 @@
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils import _safe_indexing
 from sklearn.utils.fixes import parse_version
+from sklearn.utils.metaestimators import available_if
 from sklearn.utils.validation import has_fit_parameter
 
 from ..base import _ParamsValidationMixin
@@ -18,8 +19,8 @@
 from ..under_sampling.base import BaseUnderSampler
 from ..utils import Substitution, check_target_type
 from ..utils._docstring import _random_state_docstring
-from ..utils._param_validation import Interval, StrOptions
-from ..utils.fixes import _fit_context
+from ..utils._param_validation import Hidden, Interval, StrOptions
+from ..utils.fixes import _fit_context, check_version_package
 from ._common import _adaboost_classifier_parameter_constraints
 
 sklearn_version = parse_version(sklearn.__version__)
@@ -58,16 +59,18 @@ class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier):
         ``learning_rate``. There is a trade-off between ``learning_rate`` and
         ``n_estimators``.
 
-    algorithm : {{'SAMME', 'SAMME.R'}}, default='SAMME.R'
+    algorithm : {{'SAMME', 'SAMME.R'}}, default='deprecated'
         If 'SAMME.R' then use the SAMME.R real boosting algorithm.
         ``base_estimator`` must support calculation of class probabilities.
         If 'SAMME' then use the SAMME discrete boosting algorithm.
         The SAMME.R algorithm typically converges faster than SAMME,
         achieving a lower test error with fewer boosting iterations.
 
         .. deprecated:: 0.12
-            `"SAMME.R"` is deprecated and will be removed in version 0.14.
-            '"SAMME"' will become the default.
+            `algorithm` is deprecated in 0.12 and will be removed in 0.14.
+            Depending on the `scikit-learn` version, the "SAMME.R" algorithm might not
+            be available. Refer to the documentation of
+            :class:`~sklearn.ensemble.AdaBoostClassifier` for more information.
 
     {sampling_strategy}
 
@@ -109,7 +112,7 @@ class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier):
         ensemble.
 
     feature_importances_ : ndarray of shape (n_features,)
-        The feature importances if supported by the ``base_estimator``.
+        The feature importances if supported by the ``estimator``.
 
     n_features_in_ : int
         Number of features in the input dataset.
@@ -167,6 +170,10 @@ class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier):
 
     _parameter_constraints.update(
         {
+            "algorithm": [
+                StrOptions({"SAMME", "SAMME.R"}),
+                Hidden(StrOptions({"deprecated"})),
+            ],
             "sampling_strategy": [
                 Interval(numbers.Real, 0, 1, closed="right"),
                 StrOptions({"auto", "majority", "not minority", "not majority", "all"}),
@@ -186,17 +193,17 @@ def __init__(
         *,
         n_estimators=50,
         learning_rate=1.0,
-        algorithm="SAMME.R",
+        algorithm="deprecated",
         sampling_strategy="auto",
         replacement=False,
         random_state=None,
     ):
         super().__init__(
             n_estimators=n_estimators,
             learning_rate=learning_rate,
-            algorithm=algorithm,
             random_state=random_state,
         )
+        self.algorithm = algorithm
         self.estimator = estimator
         self.sampling_strategy = sampling_strategy
         self.replacement = replacement
@@ -394,3 +401,7 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state):
             sample_weight *= np.exp(estimator_weight * incorrect * (sample_weight > 0))
 
         return sample_weight, estimator_weight, estimator_error
+
+    @available_if(check_version_package("sklearn", ">=", "1.6"))
+    def _boost(self, iboost, X, y, sample_weight, random_state):
+        return self._boost_discrete(iboost, X, y, sample_weight, random_state)
diff --git a/imblearn/utils/_test_common/instance_generator.py b/imblearn/utils/_test_common/instance_generator.py
@@ -10,11 +10,16 @@
 
 from sklearn import clone, config_context
 from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
 from sklearn.exceptions import SkipTestWarning
 from sklearn.utils._testing import SkipTest
 
 from imblearn.combine import SMOTEENN, SMOTETomek
-from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
+from imblearn.ensemble import (
+    BalancedBaggingClassifier,
+    BalancedRandomForestClassifier,
+    EasyEnsembleClassifier,
+)
 from imblearn.over_sampling import (
     ADASYN,
     BorderlineSMOTE,
@@ -42,6 +47,12 @@
     # estimator
     BalancedBaggingClassifier: dict(random_state=42),
     BalancedRandomForestClassifier: dict(random_state=42),
+    EasyEnsembleClassifier: [
+        # AdaBoostClassifier does not allow nan values
+        dict(random_state=42),
+        # DecisionTreeClassifier allows nan values
+        dict(estimator=DecisionTreeClassifier(random_state=42), random_state=42),
+    ],
     Pipeline: dict(
         steps=[("sampler", RandomUnderSampler()), ("logistic", LogisticRegression())]
     ),