From ed7e6fc7569570cfbd2ce0f923b9c55209836b3d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 10 Nov 2024 23:51:54 +0100 Subject: [PATCH 01/20] iter --- imblearn/base.py | 22 +- imblearn/ensemble/_bagging.py | 15 +- imblearn/ensemble/_easy_ensemble.py | 20 +- imblearn/ensemble/_forest.py | 22 +- imblearn/metrics/pairwise.py | 7 +- .../over_sampling/_random_over_sampler.py | 5 +- imblearn/over_sampling/_smote/base.py | 13 +- imblearn/tests/test_common.py | 24 +- .../_random_under_sampler.py | 5 +- imblearn/utils/_tags.py | 13 + imblearn/utils/_test_common/__init__.py | 0 .../utils/_test_common/instance_generator.py | 138 ++++++++ imblearn/utils/_validation.py | 11 +- imblearn/utils/estimator_checks.py | 305 +++++++++++++++--- imblearn/utils/fixes.py | 38 ++- imblearn/utils/tests/test_estimator_checks.py | 12 +- 16 files changed, 543 insertions(+), 107 deletions(-) create mode 100644 imblearn/utils/_tags.py create mode 100644 imblearn/utils/_test_common/__init__.py create mode 100644 imblearn/utils/_test_common/instance_generator.py diff --git a/imblearn/base.py b/imblearn/base.py index 6e3954532..18913667d 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -7,15 +7,26 @@ from abc import ABCMeta, abstractmethod import numpy as np +import sklearn from sklearn.base import BaseEstimator, OneToOneFeatureMixin from sklearn.preprocessing import label_binarize +from sklearn.utils.metaestimators import available_if from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.fixes import parse_version from .utils import check_sampling_strategy, check_target_type +from .utils.fixes import validate_data from .utils._param_validation import validate_parameter_constraints +from .utils._tags import InputTags from .utils._validation import ArraysTransformer +def check_version(): + return parse_version( + parse_version(sklearn.__version__).base_version + ) >= parse_version("1.6") + + class _ParamsValidationMixin: """Mixin class to validate parameters.""" @@ -147,7 +158,7 @@ def _check_X_y(self, X, y, accept_sparse=None): if accept_sparse is None: accept_sparse = ["csr", "csc"] y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse) + X, y = validate_data(self, X=X, y=y, reset=True, accept_sparse=accept_sparse) return X, y, binarize_y def fit(self, X, y): @@ -196,9 +207,18 @@ def fit_resample(self, X, y): self._validate_params() return super().fit_resample(X, y) + @available_if(check_version) def _more_tags(self): return {"X_types": ["2darray", "sparse", "dataframe"]} + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags = InputTags() + tags.input_tags.two_d_array = True + tags.input_tags.sparse = True + tags.input_tags.dataframe = True + return tags + def _identity(X, y): return X, y diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index 79559cd2c..0ce2526ae 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -26,10 +26,10 @@ from ..utils import Substitution, check_sampling_strategy, check_target_type from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import HasMethods, Interval, StrOptions -from ..utils.fixes import _fit_context +from ..utils.fixes import _fit_context, validate_data from ._common import _bagging_parameter_constraints, _estimator_has -sklearn_version = parse_version(sklearn.__version__) +sklearn_version = 
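
For context on the `available_if` gating used in `imblearn/base.py` above: scikit-learn calls the predicate with the estimator instance, and a falsy result hides the decorated attribute. A minimal sketch of the pattern (the `Demo` class and `has_legacy_tags` predicate are illustrative, not part of the patch):

    from sklearn.utils.metaestimators import available_if

    def has_legacy_tags(estimator):
        # available_if passes the instance to the predicate; returning False
        # makes the decorated attribute raise AttributeError on access.
        return False  # e.g. the result of a sklearn < 1.6 version check

    class Demo:
        @available_if(has_legacy_tags)
        def _more_tags(self):
            return {"X_types": ["2darray"]}

    print(hasattr(Demo(), "_more_tags"))  # False while the predicate fails
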
parse_version(parse_version(sklearn.__version__).base_version) @Substitution( @@ -382,12 +382,17 @@ def decision_function(self, X): check_is_fitted(self) # Check data - X = self._validate_data( - X, + if sklearn_version < parse_version("1.6"): + kwargs = {"force_all_finite": False} + else: + kwargs = {"ensure_all_finite": False} + X = validate_data( + self, + X=X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, reset=False, + **kwargs ) # Parallel loop diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index aec7f6837..78b1e842a 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -14,7 +14,6 @@ from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier from sklearn.ensemble._bagging import _parallel_decision_function from sklearn.ensemble._base import _partition_estimators -from sklearn.utils._tags import _safe_tags from sklearn.utils.fixes import parse_version from sklearn.utils.metaestimators import available_if from sklearn.utils.parallel import Parallel, delayed @@ -27,11 +26,11 @@ from ..utils import Substitution, check_sampling_strategy, check_target_type from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import Interval, StrOptions -from ..utils.fixes import _fit_context +from ..utils.fixes import _fit_context, get_tags, validate_data from ._common import _bagging_parameter_constraints, _estimator_has MAX_INT = np.iinfo(np.int32).max -sklearn_version = parse_version(sklearn.__version__) +sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) @Substitution( @@ -311,12 +310,17 @@ def decision_function(self, X): check_is_fitted(self) # Check data - X = self._validate_data( - X, + if sklearn_version < parse_version("1.6"): + kwargs = {"force_all_finite": False} + else: + kwargs = {"ensure_all_finite": False} + X = validate_data( + self, + X=X, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, reset=False, + **kwargs, ) # Parallel loop @@ -351,4 +355,6 @@ def _get_estimator(self): # TODO: remove when minimum supported version of scikit-learn is 1.5 def _more_tags(self): - return {"allow_nan": _safe_tags(self._get_estimator(), "allow_nan")} + # This code should not be called for scikit-learn >= 1.6 + # Therefore, get_tags corresponds to _safe_tags that returns a dict + return {"allow_nan": get_tags(self._get_estimator(), "allow_nan")} diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 587db01d8..5f1b700bc 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -35,11 +35,11 @@ from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils._validation import check_sampling_strategy -from ..utils.fixes import _fit_context +from ..utils.fixes import _fit_context, validate_data from ._common import _random_forest_classifier_parameter_constraints MAX_INT = np.iinfo(np.int32).max -sklearn_version = parse_version(sklearn.__version__) +sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) def _local_parallel_build_trees( @@ -597,21 +597,25 @@ def fit(self, X, y, sample_weight=None): # TODO: remove when the minimum supported version of scipy will be 1.4 # Support for missing values if parse_version(sklearn_version.base_version) >= parse_version("1.4"): - force_all_finite = False + if sklearn_version >= parse_version("1.6"): + 
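
The keyword juggling above tracks the scikit-learn 1.6 rename of `check_array`'s `force_all_finite` to `ensure_all_finite`. A minimal sketch of the dispatch, assuming only the public `check_array` API:

    import sklearn
    from sklearn.utils import check_array
    from sklearn.utils.fixes import parse_version

    sklearn_version = parse_version(parse_version(sklearn.__version__).base_version)
    if sklearn_version >= parse_version("1.6"):
        kwargs = {"ensure_all_finite": False}
    else:
        kwargs = {"force_all_finite": False}

    # NaN passes through because finiteness checking is disabled.
    check_array([[0.0, float("nan")]], **kwargs)
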
kwargs = {"ensure_all_finite": False} + else: + kwargs = {"force_all_finite": False} else: - force_all_finite = True + kwargs = {"force_all_finite": False} - X, y = self._validate_data( - X, - y, + X, y = validate_data( + self, + X=X, + y=y, multi_output=True, accept_sparse="csc", dtype=DTYPE, - force_all_finite=force_all_finite, + **kwargs, ) # TODO: remove when the minimum supported version of scikit-learn will be 1.4 - if parse_version(sklearn_version.base_version) >= parse_version("1.4"): + if sklearn_version >= parse_version("1.4"): # _compute_missing_values_in_feature_mask checks if X has missing values and # will raise an error if the underlying tree base estimator can't handle # missing values. Only the criterion is required to determine if the tree diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 766a6d399..802d726d4 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -14,6 +14,7 @@ from ..base import _ParamsValidationMixin from ..utils._param_validation import StrOptions +from ..utils.fixes import validate_data class ValueDifferenceMetric(_ParamsValidationMixin, BaseEstimator): @@ -148,7 +149,7 @@ def fit(self, X, y): """ self._validate_params() check_consistent_length(X, y) - X, y = self._validate_data(X, y, reset=True, dtype=np.int32) + X, y = validate_data(self, X=X, y=y, reset=True, dtype=np.int32) if isinstance(self.n_categories, str) and self.n_categories == "auto": # categories are expected to be encoded from 0 to n_categories - 1 @@ -207,11 +208,11 @@ def pairwise(self, X, Y=None): The VDM pairwise distance. """ check_is_fitted(self) - X = self._validate_data(X, reset=False, dtype=np.int32) + X = validate_data(self, X=X, reset=False, dtype=np.int32) n_samples_X = X.shape[0] if Y is not None: - Y = self._validate_data(Y, reset=False, dtype=np.int32) + Y = validate_data(self, Y=Y, reset=False, dtype=np.int32) n_samples_Y = Y.shape[0] else: n_samples_Y = n_samples_X diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 993788a42..71da059da 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -15,6 +15,7 @@ from ..utils import Substitution, check_target_type from ..utils._docstring import _random_state_docstring from ..utils._param_validation import Interval +from ..utils.fixes import _check_n_features, _check_feature_names from ..utils._validation import _check_X from .base import BaseOverSampler @@ -156,8 +157,8 @@ def __init__( def _check_X_y(self, X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = _check_X(X) - self._check_n_features(X, reset=True) - self._check_feature_names(X, reset=True) + _check_n_features(self, X, reset=True) + _check_feature_names(self, X, reset=True) return X, y, binarize_y def _fit_resample(self, X, y): diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 95e10b246..dc2e565ec 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -32,7 +32,7 @@ from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation import HasMethods, Interval, StrOptions from ...utils._validation import _check_X -from ...utils.fixes import _is_pandas_df, _mode +from ...utils.fixes import _check_n_features, _check_feature_names, _is_pandas_df, _mode, validate_data from ..base import BaseOverSampler sklearn_version = 
parse_version(sklearn.__version__).base_version @@ -601,8 +601,8 @@ def _check_X_y(self, X, y): """ y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = _check_X(X) - self._check_n_features(X, reset=True) - self._check_feature_names(X, reset=True) + _check_n_features(self, X, reset=True) + _check_feature_names(self, X, reset=True) return X, y, binarize_y def _validate_column_types(self, X): @@ -963,9 +963,10 @@ def __init__( def _check_X_y(self, X, y): """Check should accept strings and not sparse matrices.""" y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = self._validate_data( - X, - y, + X, y = validate_data( + self, + X=X, + y=y, reset=True, dtype=None, accept_sparse=["csr", "csc"], diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index f04dd1d4c..4028f439a 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -1,4 +1,5 @@ """Common tests""" + # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT @@ -10,8 +11,7 @@ import pytest from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning -from sklearn.utils._testing import SkipTest, ignore_warnings, set_random_state -from sklearn.utils.estimator_checks import _construct_instance, _get_check_estimator_ids +from sklearn.utils._testing import ignore_warnings from sklearn.utils.estimator_checks import ( parametrize_with_checks as parametrize_with_checks_sklearn, ) @@ -25,6 +25,10 @@ parametrize_with_checks, ) from imblearn.utils.testing import all_estimators +from imblearn.utils._test_common.instance_generator import ( + _get_check_estimator_ids, + _tested_estimators, +) @pytest.mark.parametrize("name, Estimator", all_estimators()) @@ -34,22 +38,6 @@ def test_all_estimator_no_base_class(name, Estimator): assert not name.lower().startswith("base"), msg -def _tested_estimators(): - for name, Estimator in all_estimators(): - try: - estimator = _construct_instance(Estimator) - set_random_state(estimator) - except SkipTest: - continue - - if isinstance(estimator, NearMiss): - # For NearMiss, let's check the three algorithms - for version in (1, 2, 3): - yield clone(estimator).set_params(version=version) - else: - yield estimator - - @parametrize_with_checks_sklearn(list(_tested_estimators())) def test_estimators_compatibility_sklearn(estimator, check, request): _set_checking_parameters(estimator) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 876195a6d..f914b7882 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -8,6 +8,7 @@ from sklearn.utils import _safe_indexing, check_random_state from ...utils import Substitution, check_target_type +from ...utils.fixes import _check_n_features, _check_feature_names from ...utils._docstring import _random_state_docstring from ...utils._validation import _check_X from ..base import BaseUnderSampler @@ -99,8 +100,8 @@ def __init__( def _check_X_y(self, X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = _check_X(X) - self._check_n_features(X, reset=True) - self._check_feature_names(X, reset=True) + _check_n_features(self, X, reset=True) + _check_feature_names(self, X, reset=True) return X, y, binarize_y def _fit_resample(self, X, y): diff --git a/imblearn/utils/_tags.py b/imblearn/utils/_tags.py new file mode 100644 index 000000000..5a43b4d52 --- 
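
The test module now consumes the shared instance generator. A sketch of what the import yields, assuming the `INIT_PARAMS` entry for `NearMiss` defined below (one instance per parameter set):

    from imblearn.utils._test_common.instance_generator import _tested_estimators

    # Instances come pre-parametrized, e.g. three NearMiss objects, one per
    # `version`, as declared in INIT_PARAMS.
    names = [type(est).__name__ for est in _tested_estimators()]
    print(names.count("NearMiss"))  # 3
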
/dev/null +++ b/imblearn/utils/_tags.py @@ -0,0 +1,13 @@ +from dataclasses import dataclass + +import sklearn +from sklearn.utils.fixes import parse_version + +sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) + +if sklearn_version >= parse_version("1.6"): + from sklearn.utils._tags import InputTags + + @dataclass + class InputTags(InputTags): + dataframe: bool = True diff --git a/imblearn/utils/_test_common/__init__.py b/imblearn/utils/_test_common/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/imblearn/utils/_test_common/instance_generator.py b/imblearn/utils/_test_common/instance_generator.py new file mode 100644 index 000000000..455427967 --- /dev/null +++ b/imblearn/utils/_test_common/instance_generator.py @@ -0,0 +1,138 @@ +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + +import re +import warnings +from contextlib import suppress +from functools import partial +from inspect import isfunction + +from sklearn import clone, config_context +from sklearn.linear_model import LogisticRegression +from sklearn.exceptions import SkipTestWarning +from sklearn.utils._testing import SkipTest + +from imblearn.over_sampling import SMOTENC +from imblearn.pipeline import Pipeline +from imblearn.under_sampling import NearMiss, RandomUnderSampler +from imblearn.utils.testing import all_estimators + +# The following dictionary is to indicate constructor arguments suitable for the test +# suite, which uses very small datasets, and is intended to run rather quickly. +INIT_PARAMS = { + NearMiss: [dict(version=1), dict(version=2), dict(version=3)], + Pipeline: dict( + steps=[("sampler", RandomUnderSampler()), ("logistic", LogisticRegression())] + ), + SMOTENC: dict(categorical_features=[0]), +} + +# This dictionary stores parameters for specific checks. It also enables running the +# same check with multiple instances of the same estimator with different parameters. +# The special key "*" allows to apply the parameters to all checks. +# TODO(devtools): allow third-party developers to pass test specific params to checks +PER_ESTIMATOR_CHECK_PARAMS: dict = {} + +SKIPPED_ESTIMATORS = [] + + +def _tested_estimators(type_filter=None): + for _, Estimator in all_estimators(type_filter=type_filter): + with suppress(SkipTest): + for estimator in _construct_instances(Estimator): + yield estimator + + +def _construct_instances(Estimator): + """Construct Estimator instances if possible. + + If parameter sets in INIT_PARAMS are provided, use them. If there are a list + of parameter sets, return one instance for each set. + """ + if Estimator in SKIPPED_ESTIMATORS: + msg = f"Can't instantiate estimator {Estimator.__name__}" + # raise additional warning to be shown by pytest + warnings.warn(msg, SkipTestWarning) + raise SkipTest(msg) + + if Estimator in INIT_PARAMS: + param_sets = INIT_PARAMS[Estimator] + if not isinstance(param_sets, list): + param_sets = [param_sets] + for params in param_sets: + est = Estimator(**params) + yield est + else: + yield Estimator() + + +def _get_check_estimator_ids(obj): + """Create pytest ids for checks. + + When `obj` is an estimator, this returns the pprint version of the + estimator (with `print_changed_only=True`). When `obj` is a function, the + name of the function is returned with its keyword arguments. + + `_get_check_estimator_ids` is designed to be used as the `id` in + `pytest.mark.parametrize` where `check_estimator(..., generate_only=True)` + is yielding estimators and checks. 
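
As a sketch, `_construct_instances` expands an `INIT_PARAMS` entry into one instance per parameter set; for the `NearMiss` entry above:

    from imblearn.under_sampling import NearMiss
    from imblearn.utils._test_common.instance_generator import _construct_instances

    versions = [est.version for est in _construct_instances(NearMiss)]
    print(versions)  # [1, 2, 3]
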
+ + Parameters + ---------- + obj : estimator or function + Items generated by `check_estimator`. + + Returns + ------- + id : str or None + + See Also + -------- + check_estimator + """ + if isfunction(obj): + return obj.__name__ + if isinstance(obj, partial): + if not obj.keywords: + return obj.func.__name__ + kwstring = ",".join(["{}={}".format(k, v) for k, v in obj.keywords.items()]) + return "{}({})".format(obj.func.__name__, kwstring) + if hasattr(obj, "get_params"): + with config_context(print_changed_only=True): + return re.sub(r"\s", "", str(obj)) + + +def _yield_instances_for_check(check, estimator_orig): + """Yield instances for a check. + + For most estimators, this is a no-op. + + For estimators which have an entry in PER_ESTIMATOR_CHECK_PARAMS, this will yield + an estimator for each parameter set in PER_ESTIMATOR_CHECK_PARAMS[estimator]. + """ + # TODO(devtools): enable this behavior for third party estimators as well + if type(estimator_orig) not in PER_ESTIMATOR_CHECK_PARAMS: + yield estimator_orig + return + + check_params = PER_ESTIMATOR_CHECK_PARAMS[type(estimator_orig)] + + try: + check_name = check.__name__ + except AttributeError: + # partial tests + check_name = check.func.__name__ + + if check_name not in check_params: + yield estimator_orig + return + + param_set = check_params[check_name] + if isinstance(param_set, dict): + param_set = [param_set] + + for params in param_set: + estimator = clone(estimator_orig) + estimator.set_params(**params) + yield estimator diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index bb17cf015..66e637763 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -11,9 +11,11 @@ import numpy as np from scipy.sparse import issparse +import sklearn from sklearn.base import clone from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_array, column_or_1d +from sklearn.utils.fixes import parse_version from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _num_samples @@ -28,6 +30,8 @@ ) TARGET_KIND = ("binary", "multiclass", "multilabel-indicator") +sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) + class ArraysTransformer: """A class to convert sampler output arrays to their original types.""" @@ -643,6 +647,11 @@ def _check_X(X): ) if _is_pandas_df(X): return X + if sklearn_version >= parse_version("1.6"): + kwargs = {"ensure_all_finite": False} + else: + kwargs = {"force_all_finite": False} + return check_array( - X, dtype=None, accept_sparse=["csr", "csc"], force_all_finite=False + X, dtype=None, accept_sparse=["csr", "csc"], **kwargs ) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index fc58c321c..83793a443 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -9,7 +9,8 @@ import traceback import warnings from collections import Counter -from functools import partial +from dataclasses import is_dataclass +from functools import partial, wraps import numpy as np import sklearn @@ -24,7 +25,6 @@ ) from sklearn.exceptions import SkipTestWarning from sklearn.preprocessing import StandardScaler, label_binarize -from sklearn.utils._tags import _safe_tags from sklearn.utils._testing import ( SkipTest, assert_allclose, @@ -33,10 +33,7 @@ set_random_state, ) from sklearn.utils.estimator_checks import ( - _enforce_estimator_tags_X, - _enforce_estimator_tags_y, - _get_check_estimator_ids, - _maybe_mark_xfail, + 
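
A sketch of the `_yield_instances_for_check` contract defined above: with no entry in `PER_ESTIMATOR_CHECK_PARAMS`, the original instance passes through untouched (`check_demo` is an illustrative stand-in for a real check):

    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.utils._test_common.instance_generator import (
        _yield_instances_for_check,
    )

    def check_demo(name, estimator):  # stand-in for a real check
        pass

    sampler = RandomUnderSampler()
    assert list(_yield_instances_for_check(check_demo, sampler)) == [sampler]
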
_enforce_estimator_tags_X, _enforce_estimator_tags_y
 )
 from sklearn.utils.fixes import parse_version
 from sklearn.utils.multiclass import type_of_target
@@ -45,6 +42,10 @@
 from imblearn.over_sampling.base import BaseOverSampler
 from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler
 from imblearn.utils._param_validation import generate_invalid_param_val, make_constraint
+from imblearn.utils._test_common.instance_generator import (
+    _get_check_estimator_ids, _yield_instances_for_check
+)
+from imblearn.utils.fixes import get_tags
 
 sklearn_version = parse_version(sklearn.__version__)
 
@@ -80,20 +81,32 @@ def _set_checking_parameters(estimator):
 
 
 def _yield_sampler_checks(sampler):
-    tags = sampler._get_tags()
+    tags = get_tags(sampler)
+    if is_dataclass(tags):
+        # scikit-learn >= 1.6
+        accept_sparse = tags.input_tags.sparse
+        accept_dataframe = tags.input_tags.dataframe
+        accept_string = tags.input_tags.string
+        allow_nan = tags.input_tags.allow_nan
+    else:
+        # scikit-learn < 1.6
+        accept_sparse = "sparse" in tags["X_types"]
+        accept_dataframe = "dataframe" in tags["X_types"]
+        accept_string = "string" in tags["X_types"]
+        allow_nan = tags["allow_nan"]
     yield check_target_type
     yield check_samplers_one_label
     yield check_samplers_fit
     yield check_samplers_fit_resample
     yield check_samplers_sampling_strategy_fit_resample
-    if "sparse" in tags["X_types"]:
+    if accept_sparse:
         yield check_samplers_sparse
-    if "dataframe" in tags["X_types"]:
+    if accept_dataframe:
         yield check_samplers_pandas
         yield check_samplers_pandas_sparse
-    if "string" in tags["X_types"]:
+    if accept_string:
        yield check_samplers_string
-    if tags["allow_nan"]:
+    if allow_nan:
         yield check_samplers_nan
     yield check_samplers_list
     yield check_samplers_multiclass_ova
@@ -112,10 +125,17 @@ def _yield_classifier_checks(classifier):
     yield check_classifiers_with_encoded_labels
 
 
-def _yield_all_checks(estimator):
+def _yield_all_checks(estimator, legacy=True):
     name = estimator.__class__.__name__
-    tags = estimator._get_tags()
-    if tags["_skip_test"]:
+    tags = get_tags(estimator)
+
+    if is_dataclass(tags):
+        # scikit-learn >= 1.6
+        skip_test = tags._skip_test
+    else:
+        # scikit-learn < 1.6
+        skip_test = tags["_skip_test"]
+    if skip_test:
         warnings.warn(
             f"Explicit SKIP via _skip_test tag for estimator {name}.",
             SkipTestWarning,
@@ -130,9 +150,139 @@ def _yield_all_checks(estimator):
         yield check
 
 
-def parametrize_with_checks(estimators):
+def _check_name(check):
+    if hasattr(check, "__wrapped__"):
+        return _check_name(check.__wrapped__)
+    return check.func.__name__ if isinstance(check, partial) else check.__name__
+
+
+def _maybe_mark(estimator, check, expected_failed_checks=None, mark=None, pytest=None):
+    """Mark the test as xfail or skip if needed.
+
+    Parameters
+    ----------
+    estimator : estimator object
+        Estimator instance for which to generate checks.
+    check : partial or callable
+        Check to be marked.
+    expected_failed_checks : dict[str, str], default=None
+        Dictionary of the form {check_name: reason} for checks that are expected to
+        fail.
+    mark : "xfail" or "skip" or None
+        Whether to mark the check as xfail or skip.
+    pytest : pytest module, default=None
+        Pytest module to use to mark the check. This is only needed if ``mark`` is
+        `"xfail"`. Note that one can run `check_estimator` without having `pytest`
+        installed. This is used in combination with `parametrize_with_checks` only.
+ """ + should_be_marked, reason = _should_be_skipped_or_marked( + estimator, check, expected_failed_checks + ) + if not should_be_marked or mark is None: + return estimator, check + + estimator_name = estimator.__class__.__name__ + if mark == "xfail": + return pytest.param(estimator, check, marks=pytest.mark.xfail(reason=reason)) + else: + + @wraps(check) + def wrapped(*args, **kwargs): + raise SkipTest( + f"Skipping {_check_name(check)} for {estimator_name}: {reason}" + ) + + return estimator, wrapped + + +def _should_be_skipped_or_marked( + estimator, check, expected_failed_checks: dict[str, str] | None = None +) -> tuple[bool, str]: + """Check whether a check should be skipped or marked as xfail. + + Parameters + ---------- + estimator : estimator object + Estimator instance for which to generate checks. + check : partial or callable + Check to be marked. + expected_failed_checks : dict[str, str], default=None + Dictionary of the form {check_name: reason} for checks that are expected to + fail. + + Returns + ------- + should_be_marked : bool + Whether the check should be marked as xfail or skipped. + reason : str + Reason for skipping the check. + """ + + expected_failed_checks = expected_failed_checks or {} + + check_name = _check_name(check) + if check_name in expected_failed_checks: + return True, expected_failed_checks[check_name] + + return False, "Check is not expected to fail" + + +def estimator_checks_generator( + estimator, *, legacy=True, expected_failed_checks=None, mark=None +): + """Iteratively yield all check callables for an estimator. + + .. versionadded:: 1.6 + + Parameters + ---------- + estimator : estimator object + Estimator instance for which to generate checks. + legacy : bool, default=True + Whether to include legacy checks. Over time we remove checks from this category + and move them into their specific category. + expected_failed_checks : dict[str, str], default=None + Dictionary of the form {check_name: reason} for checks that are expected to + fail. + mark : {"xfail", "skip"} or None, default=None + Whether to mark the checks that are expected to fail as + xfail(`pytest.mark.xfail`) or skip. Marking a test as "skip" is done via + wrapping the check in a function that raises a + :class:`~sklearn.exceptions.SkipTest` exception. + + Returns + ------- + estimator_checks_generator : generator + Generator that yields (estimator, check) tuples. + """ + if mark == "xfail": + import pytest + else: + pytest = None # type: ignore + + name = type(estimator).__name__ + for check in _yield_all_checks(estimator, legacy=legacy): + check_with_name = partial(check, name) + for check_instance in _yield_instances_for_check(check, estimator): + yield _maybe_mark( + check_instance, + check_with_name, + expected_failed_checks=expected_failed_checks, + mark=mark, + pytest=pytest, + ) + + +def parametrize_with_checks(estimators, *, legacy=True, expected_failed_checks=None): """Pytest specific decorator for parametrizing estimator checks. + Checks are categorised into the following groups: + + - API checks: a set of checks to ensure API compatibility with scikit-learn. + Refer to https://scikit-learn.org/dev/developers/develop.html a requirement of + scikit-learn estimators. + - legacy: a set of checks which gradually will be grouped into other categories. + The `id` of each check is set to be a pprint version of the estimator and the name of the check with its keyword arguments. 
This allows to use `pytest -k` to specify which tests to run:: @@ -144,10 +294,41 @@ def parametrize_with_checks(estimators): estimators : list of estimators instances Estimators to generated checks for. + .. versionchanged:: 0.24 + Passing a class was deprecated in version 0.23, and support for + classes was removed in 0.24. Pass an instance instead. + + .. versionadded:: 0.24 + + + legacy : bool, default=True + Whether to include legacy checks. Over time we remove checks from this category + and move them into their specific category. + + .. versionadded:: 1.6 + + expected_failed_checks : callable, default=None + A callable that takes an estimator as input and returns a dictionary of the + form:: + + { + "check_name": "my reason", + } + + Where `"check_name"` is the name of the check, and `"my reason"` is why + the check fails. These tests will be marked as xfail if the check fails. + + + .. versionadded:: 1.6 + Returns ------- decorator : `pytest.mark.parametrize` + See Also + -------- + check_estimator : Check if estimator adheres to scikit-learn conventions. + Examples -------- >>> from sklearn.utils.estimator_checks import parametrize_with_checks @@ -158,18 +339,29 @@ def parametrize_with_checks(estimators): ... DecisionTreeRegressor()]) ... def test_sklearn_compatible_estimator(estimator, check): ... check(estimator) + """ import pytest - def checks_generator(): + if any(isinstance(est, type) for est in estimators): + msg = ( + "Passing a class was deprecated in version 0.23 " + "and isn't supported anymore from 0.24." + "Please pass an instance instead." + ) + raise TypeError(msg) + + def _checks_generator(estimators, legacy, expected_failed_checks): for estimator in estimators: - name = type(estimator).__name__ - for check in _yield_all_checks(estimator): - check = partial(check, name) - yield _maybe_mark_xfail(estimator, check, pytest) + args = {"estimator": estimator, "legacy": legacy, "mark": "xfail"} + if callable(expected_failed_checks): + args["expected_failed_checks"] = expected_failed_checks(estimator) + yield from estimator_checks_generator(**args) return pytest.mark.parametrize( - "estimator, check", checks_generator(), ids=_get_check_estimator_ids + "estimator, check", + _checks_generator(estimators, legacy, expected_failed_checks), + ids=_get_check_estimator_ids, ) @@ -404,7 +596,12 @@ def check_samplers_sample_indices(name, sampler_orig): sampler = clone(sampler_orig) X, y = sample_dataset_generator() sampler.fit_resample(X, y) - sample_indices = sampler._get_tags().get("sample_indices", None) + tags = get_tags(sampler) + if is_dataclass(tags): + sample_indices = getattr(tags, "sample_indices", None) + else: + # scikit-learn < 1.6 + sample_indices = tags.get("sample_indices", None) if sample_indices: assert hasattr(sampler, "sample_indices_") is sample_indices else: @@ -529,14 +726,7 @@ def check_param_validation(name, estimator_orig): continue with raises(ValueError, match=match, err_msg=err_msg): - if any( - isinstance(X_type, str) and X_type.endswith("labels") - for X_type in _safe_tags(estimator, key="X_types") - ): - # The estimator is a label transformer and take only `y` - getattr(estimator, method)(y) # pragma: no cover - else: - getattr(estimator, method)(X, y) + getattr(estimator, method)(X, y) # Then, for constraints that are more than a type constraint, check that the # error is raised if param does match a valid type but does not match any valid @@ -557,14 +747,7 @@ def check_param_validation(name, estimator_orig): continue with raises(ValueError, 
match=match, err_msg=err_msg): - if any( - X_type.endswith("labels") - for X_type in _safe_tags(estimator, key="X_types") - ): - # The estimator is a label transformer and take only `y` - getattr(estimator, method)(y) # pragma: no cover - else: - getattr(estimator, method)(X, y) + getattr(estimator, method)(X, y) def check_dataframe_column_names_consistency(name, estimator_orig): @@ -575,12 +758,22 @@ def check_dataframe_column_names_consistency(name, estimator_orig): "pandas is not installed: not checking column name consistency for pandas" ) - tags = _safe_tags(estimator_orig) - is_supported_X_types = ( - "2darray" in tags["X_types"] or "categorical" in tags["X_types"] - ) + tags = get_tags(estimator_orig) - if not is_supported_X_types or tags["no_validation"]: + if is_dataclass(tags): + # scikit-learn >= 1.6 + is_supported_X_types = ( + tags.input_tags.two_d_array or tags.input_tags.categorical + ) + no_validation = tags.no_validation + else: + # scikit-learn < 1.6 + is_supported_X_types = ( + "2darray" in tags["X_types"] or "categorical" in tags["X_types"] + ) + no_validation = tags["no_validation"] + + if not is_supported_X_types or no_validation: return rng = np.random.RandomState(0) @@ -711,8 +904,18 @@ def check_dataframe_column_names_consistency(name, estimator_orig): def check_sampler_get_feature_names_out(name, sampler_orig): - tags = sampler_orig._get_tags() - if "2darray" not in tags["X_types"] or tags["no_validation"]: + tags = get_tags(sampler_orig) + + if is_dataclass(tags): + # scikit-learn >= 1.6 + two_d_array = tags.input_tags.two_d_array + no_validation = tags.no_validation + else: + # scikit-learn < 1.6 + two_d_array = "2darray" in tags["X_types"] + no_validation = tags["no_validation"] + + if not two_d_array or no_validation: return X, y = make_blobs( @@ -759,8 +962,16 @@ def check_sampler_get_feature_names_out_pandas(name, sampler_orig): "pandas is not installed: not checking column name consistency for pandas" ) - tags = sampler_orig._get_tags() - if "2darray" not in tags["X_types"] or tags["no_validation"]: + tags = get_tags(sampler_orig) + if is_dataclass(tags): + # scikit-learn >= 1.6 + two_d_array = tags.input_tags.two_d_array + no_validation = tags.no_validation + else: + # scikit-learn < 1.6 + two_d_array = "2darray" in tags["X_types"] + no_validation = tags["no_validation"] + if not two_d_array or no_validation: return X, y = make_blobs( diff --git a/imblearn/utils/fixes.py b/imblearn/utils/fixes.py index 9e4852566..94d1c03f8 100644 --- a/imblearn/utils/fixes.py +++ b/imblearn/utils/fixes.py @@ -15,7 +15,7 @@ from .._config import config_context, get_config sp_version = parse_version(scipy.__version__) -sklearn_version = parse_version(sklearn.__version__) +sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) # TODO: Remove when SciPy 1.9 is the minimum supported version @@ -136,3 +136,39 @@ def _is_pandas_df(X): return False return isinstance(X, pd.DataFrame) return False + + +if sklearn_version < parse_version("1.6"): + from sklearn.utils._tags import _safe_tags as get_tags +else: + from sklearn.utils import get_tags + +if sklearn_version < parse_version("1.6"): + def validate_data( + _estimator, + /, + X="no_validation", + y="no_validation", + reset=True, + validate_separately=False, + skip_check_array=False, + **check_params, + ): + return _estimator._validate_data( + X, y, reset, validate_separately, skip_check_array, **check_params + ) +else: + from sklearn.utils.validation import validate_data # type: ignore[no-redef] + + +if 
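
With the shim above, estimator code calls `validate_data` uniformly across scikit-learn versions. A minimal sketch (the `Demo` estimator is illustrative):

    import numpy as np
    from sklearn.base import BaseEstimator
    from imblearn.utils.fixes import validate_data

    class Demo(BaseEstimator):
        def fit(self, X, y=None):
            # Dispatches to self._validate_data on < 1.6, and to
            # sklearn.utils.validation.validate_data on >= 1.6.
            X = validate_data(self, X=X, reset=True)
            return self

    Demo().fit(np.array([[0.0], [1.0]]))
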
sklearn_version < parse_version("1.6"): + def _check_n_features(estimator, X, *, reset): + return estimator._check_n_features(X, reset=reset) +else: + from sklearn.utils.validation import _check_n_features # type: ignore[no-redef] + +if sklearn_version < parse_version("1.6"): + def _check_feature_names(estimator, X, *, reset): + return estimator._check_feature_names(X, reset=reset) +else: + from sklearn.utils.validation import _check_feature_names # type: ignore[no-redef] diff --git a/imblearn/utils/tests/test_estimator_checks.py b/imblearn/utils/tests/test_estimator_checks.py index 32e9c6723..0ebd495a1 100644 --- a/imblearn/utils/tests/test_estimator_checks.py +++ b/imblearn/utils/tests/test_estimator_checks.py @@ -15,6 +15,7 @@ check_samplers_string, check_target_type, ) +from imblearn.utils.fixes import validate_data class BaseBadSampler(BaseEstimator): @@ -47,7 +48,7 @@ class NotFittedSampler(BaseBadSampler): """Sampler without target checking.""" def fit(self, X, y): - X, y = self._validate_data(X, y) + X, y = validate_data(self, X=X, y=y) return self @@ -55,7 +56,7 @@ class NoAcceptingSparseSampler(BaseBadSampler): """Sampler which does not accept sparse matrix.""" def fit(self, X, y): - X, y = self._validate_data(X, y) + X, y = validate_data(self, X=X, y=y) self.sampling_strategy_ = "sampling_strategy_" return self @@ -72,9 +73,10 @@ def _fit_resample(self, X, y): class IndicesSampler(BaseOverSampler): def _check_X_y(self, X, y): y, binarize_y = target_check(y, indicate_one_vs_all=True) - X, y = self._validate_data( - X, - y, + X, y = validate_data( + self, + X=X, + y=y, reset=True, dtype=None, force_all_finite=False, From 3d25e4738793354140f48be266f9fcd92c5eaf5d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 11 Nov 2024 11:17:13 +0100 Subject: [PATCH 02/20] fix --- imblearn/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/imblearn/base.py b/imblearn/base.py index 18913667d..b47e47788 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -17,7 +17,6 @@ from .utils import check_sampling_strategy, check_target_type from .utils.fixes import validate_data from .utils._param_validation import validate_parameter_constraints -from .utils._tags import InputTags from .utils._validation import ArraysTransformer @@ -213,6 +212,8 @@ def _more_tags(self): def __sklearn_tags__(self): tags = super().__sklearn_tags__() + + from .utils._tags import InputTags tags.input_tags = InputTags() tags.input_tags.two_d_array = True tags.input_tags.sparse = True From eaa6873afcb5114dd40178076493ee19f75fdca0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 11 Nov 2024 11:43:36 +0100 Subject: [PATCH 03/20] iter --- imblearn/utils/fixes.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/imblearn/utils/fixes.py b/imblearn/utils/fixes.py index 94d1c03f8..a38d29847 100644 --- a/imblearn/utils/fixes.py +++ b/imblearn/utils/fixes.py @@ -144,19 +144,8 @@ def _is_pandas_df(X): from sklearn.utils import get_tags if sklearn_version < parse_version("1.6"): - def validate_data( - _estimator, - /, - X="no_validation", - y="no_validation", - reset=True, - validate_separately=False, - skip_check_array=False, - **check_params, - ): - return _estimator._validate_data( - X, y, reset, validate_separately, skip_check_array, **check_params - ) + def validate_data(_estimator, **kwargs): + return _estimator._validate_data(**kwargs) else: from sklearn.utils.validation import validate_data # type: ignore[no-redef] From 
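
The `_check_n_features` shim behaves the same on both branches; a short sketch using a bare `BaseEstimator` for illustration:

    import numpy as np
    from sklearn.base import BaseEstimator
    from imblearn.utils.fixes import _check_n_features

    est = BaseEstimator()
    # Delegates to est._check_n_features on < 1.6, and to the
    # sklearn.utils.validation helper on >= 1.6.
    _check_n_features(est, np.zeros((2, 3)), reset=True)
    print(est.n_features_in_)  # 3
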
92924eb8af02d257dca4d3efa37519cb7c0370cb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 11 Nov 2024 11:53:48 +0100 Subject: [PATCH 04/20] iter --- imblearn/metrics/pairwise.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 802d726d4..6f840ed6b 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -10,7 +10,7 @@ from sklearn.base import BaseEstimator from sklearn.utils import check_consistent_length from sklearn.utils.multiclass import unique_labels -from sklearn.utils.validation import check_is_fitted +from sklearn.utils.validation import check_array, check_is_fitted from ..base import _ParamsValidationMixin from ..utils._param_validation import StrOptions @@ -208,11 +208,11 @@ def pairwise(self, X, Y=None): The VDM pairwise distance. """ check_is_fitted(self) - X = validate_data(self, X=X, reset=False, dtype=np.int32) + X = check_array(X, dtype=np.int32) n_samples_X = X.shape[0] if Y is not None: - Y = validate_data(self, Y=Y, reset=False, dtype=np.int32) + Y = check_array(Y, dtype=np.int32) n_samples_Y = Y.shape[0] else: n_samples_Y = n_samples_X From 176b61435db8bfef07acd799d692989d985731a8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 11 Nov 2024 11:59:07 +0100 Subject: [PATCH 05/20] fix --- imblearn/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/base.py b/imblearn/base.py index b47e47788..cb4172f7d 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -20,7 +20,7 @@ from .utils._validation import ArraysTransformer -def check_version(): +def check_version(estimator): return parse_version( parse_version(sklearn.__version__).base_version ) >= parse_version("1.6") From fa206e4047523c18719308d805a02e868d78b0e8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 11 Nov 2024 12:03:52 +0100 Subject: [PATCH 06/20] real fix --- imblearn/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/base.py b/imblearn/base.py index cb4172f7d..acdce49b7 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -23,7 +23,7 @@ def check_version(estimator): return parse_version( parse_version(sklearn.__version__).base_version - ) >= parse_version("1.6") + ) < parse_version("1.6") class _ParamsValidationMixin: From c1514dc6a995e4173d3649bd1ea267c75d01d9f5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 11 Nov 2024 20:05:40 +0100 Subject: [PATCH 07/20] fix _more_tags --- imblearn/base.py | 13 ++---- imblearn/ensemble/_bagging.py | 3 +- imblearn/ensemble/_easy_ensemble.py | 3 +- imblearn/ensemble/_forest.py | 4 +- imblearn/metrics/pairwise.py | 4 +- imblearn/over_sampling/_adasyn.py | 3 ++ .../over_sampling/_random_over_sampler.py | 4 +- imblearn/over_sampling/_smote/base.py | 11 ++++- .../_cluster_centroids.py | 3 ++ .../_condensed_nearest_neighbour.py | 3 ++ .../_edited_nearest_neighbours.py | 6 ++- .../_instance_hardness_threshold.py | 3 ++ .../_prototype_selection/_nearmiss.py | 3 ++ .../_neighbourhood_cleaning_rule.py | 3 ++ .../_one_sided_selection.py | 3 ++ .../_random_under_sampler.py | 8 +++- .../_prototype_selection/_tomek_links.py | 3 ++ imblearn/utils/fixes.py | 41 +++++++++++++++++++ 18 files changed, 103 insertions(+), 18 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index acdce49b7..5bc784f3f 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -12,20 +12,12 @@ from sklearn.preprocessing import label_binarize from sklearn.utils.metaestimators import 
available_if from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.fixes import parse_version from .utils import check_sampling_strategy, check_target_type -from .utils.fixes import validate_data +from .utils.fixes import check_version_package, validate_data from .utils._param_validation import validate_parameter_constraints from .utils._validation import ArraysTransformer - -def check_version(estimator): - return parse_version( - parse_version(sklearn.__version__).base_version - ) < parse_version("1.6") - - class _ParamsValidationMixin: """Mixin class to validate parameters.""" @@ -206,10 +198,11 @@ def fit_resample(self, X, y): self._validate_params() return super().fit_resample(X, y) - @available_if(check_version) + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"X_types": ["2darray", "sparse", "dataframe"]} + @available_if(check_version_package("sklearn", ">=", "1.6")) def __sklearn_tags__(self): tags = super().__sklearn_tags__() diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index 0ce2526ae..974759fa3 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -26,7 +26,7 @@ from ..utils import Substitution, check_sampling_strategy, check_target_type from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import HasMethods, Interval, StrOptions -from ..utils.fixes import _fit_context, validate_data +from ..utils.fixes import _fit_context, check_version_package, validate_data from ._common import _bagging_parameter_constraints, _estimator_has sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) @@ -420,6 +420,7 @@ def base_estimator_(self): ) raise error + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): tags = super()._more_tags() tags_key = "_xfail_checks" diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index 78b1e842a..cea77690f 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -26,7 +26,7 @@ from ..utils import Substitution, check_sampling_strategy, check_target_type from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import Interval, StrOptions -from ..utils.fixes import _fit_context, get_tags, validate_data +from ..utils.fixes import _fit_context, check_version_package, get_tags, validate_data from ._common import _bagging_parameter_constraints, _estimator_has MAX_INT = np.iinfo(np.int32).max @@ -354,6 +354,7 @@ def _get_estimator(self): return self.estimator # TODO: remove when minimum supported version of scikit-learn is 1.5 + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): # This code should not be called for scikit-learn >= 1.6 # Therefore, get_tags corresponds to _safe_tags that returns a dict diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 5f1b700bc..c2904e7e9 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -24,6 +24,7 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.utils import _safe_indexing, check_random_state from sklearn.utils.fixes import parse_version +from sklearn.utils.metaestimators import available_if from sklearn.utils.multiclass import type_of_target from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import _check_sample_weight @@ -35,7 +36,7 @@ from 
..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils._validation import check_sampling_strategy -from ..utils.fixes import _fit_context, validate_data +from ..utils.fixes import _fit_context, check_version_package, validate_data from ._common import _random_forest_classifier_parameter_constraints MAX_INT = np.iinfo(np.int32).max @@ -884,5 +885,6 @@ def _compute_oob_predictions(self, X, y): return oob_pred + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"multioutput": False, "multilabel": False} diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 6f840ed6b..12e221935 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -9,12 +9,13 @@ from scipy.spatial import distance_matrix from sklearn.base import BaseEstimator from sklearn.utils import check_consistent_length +from sklearn.utils.metaestimators import available_if from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_array, check_is_fitted from ..base import _ParamsValidationMixin from ..utils._param_validation import StrOptions -from ..utils.fixes import validate_data +from ..utils.fixes import check_version_package, validate_data class ValueDifferenceMetric(_ParamsValidationMixin, BaseEstimator): @@ -229,6 +230,7 @@ def pairwise(self, X, Y=None): ) return distance + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return { "requires_positive_X": True, # X should be encoded with OrdinalEncoder diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index 2dffed228..afc4e7417 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -10,8 +10,10 @@ import numpy as np from scipy import sparse from sklearn.utils import _safe_indexing, check_random_state +from sklearn.utils.metaestimators import available_if from ..utils import Substitution, check_neighbors_object +from ..utils.fixes import check_version_package from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import HasMethods, Interval from .base import BaseOverSampler @@ -229,6 +231,7 @@ def _fit_resample(self, X, y): return X_resampled, y_resampled + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return { "X_types": ["2darray"], diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 71da059da..eca331986 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -10,12 +10,13 @@ import numpy as np from scipy import sparse from sklearn.utils import _safe_indexing, check_array, check_random_state +from sklearn.utils.metaestimators import available_if from sklearn.utils.sparsefuncs import mean_variance_axis from ..utils import Substitution, check_target_type from ..utils._docstring import _random_state_docstring from ..utils._param_validation import Interval -from ..utils.fixes import _check_n_features, _check_feature_names +from ..utils.fixes import _check_n_features, _check_feature_names, check_version_package from ..utils._validation import _check_X from .base import BaseOverSampler @@ -250,6 +251,7 @@ def _fit_resample(self, X, y): return X_resampled, y_resampled + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return { 
"X_types": ["2darray", "string", "sparse", "dataframe"], diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index dc2e565ec..c7decb9da 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -21,6 +21,7 @@ check_array, check_random_state, ) +from sklearn.utils.metaestimators import available_if from sklearn.utils.fixes import parse_version from sklearn.utils.sparsefuncs_fast import ( csr_mean_variance_axis0, @@ -32,7 +33,14 @@ from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation import HasMethods, Interval, StrOptions from ...utils._validation import _check_X -from ...utils.fixes import _check_n_features, _check_feature_names, _is_pandas_df, _mode, validate_data +from ...utils.fixes import ( + _check_n_features, + _check_feature_names, + _is_pandas_df, + _mode, + check_version_package, + validate_data, +) from ..base import BaseOverSampler sklearn_version = parse_version(sklearn.__version__).base_version @@ -1062,5 +1070,6 @@ def _fit_resample(self, X, y): else: return X_resampled, y_resampled + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"X_types": ["2darray", "dataframe", "string"]} diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 5e2ca3a82..d8cdde1ce 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -12,10 +12,12 @@ from sklearn.cluster import KMeans from sklearn.neighbors import NearestNeighbors from sklearn.utils import _safe_indexing +from sklearn.utils.metaestimators import available_if from ...utils import Substitution from ...utils._docstring import _random_state_docstring from ...utils._param_validation import HasMethods, StrOptions +from ...utils.fixes import check_version_package from ..base import BaseUnderSampler VOTING_KIND = ("auto", "hard", "soft") @@ -201,5 +203,6 @@ def _fit_resample(self, X, y): return X_resampled, np.array(y_resampled, dtype=y.dtype) + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": False} diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index fe49f1707..803fa6858 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -14,9 +14,11 @@ from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import _safe_indexing, check_random_state +from sklearn.utils.metaestimators import available_if from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring, _random_state_docstring +from ...utils.fixes import check_version_package from ...utils._param_validation import HasMethods, Interval from ..base import BaseCleaningSampler @@ -259,5 +261,6 @@ def estimator_(self): ) return self.estimators_[-1] + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 38abd4bed..6133dae66 100644 --- 
a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -11,11 +11,12 @@ import numpy as np from sklearn.utils import _safe_indexing +from sklearn.utils.metaestimators import available_if from ...utils import Substitution, check_neighbors_object from ...utils._docstring import _n_jobs_docstring from ...utils._param_validation import HasMethods, Interval, StrOptions -from ...utils.fixes import _mode +from ...utils.fixes import _mode, check_version_package from ..base import BaseCleaningSampler SEL_KIND = ("all", "mode") @@ -189,6 +190,7 @@ def _fit_resample(self, X, y): return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} @@ -410,6 +412,7 @@ def _fit_resample(self, X, y): return X_resampled, y_resampled + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} @@ -619,5 +622,6 @@ def _fit_resample(self, X, y): return X_resampled, y_resampled + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index dac3f3c33..94977784b 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -15,9 +15,11 @@ from sklearn.ensemble._base import _set_random_states from sklearn.model_selection import StratifiedKFold, cross_val_predict from sklearn.utils import _safe_indexing, check_random_state +from sklearn.utils.metaestimators import available_if from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring, _random_state_docstring +from ...utils.fixes import check_version_package from ...utils._param_validation import HasMethods from ..base import BaseUnderSampler @@ -200,5 +202,6 @@ def _fit_resample(self, X, y): return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 70f647fa5..0817694da 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -10,8 +10,10 @@ import numpy as np from sklearn.utils import _safe_indexing +from sklearn.utils.metaestimators import available_if from ...utils import Substitution, check_neighbors_object +from ...utils.fixes import check_version_package from ...utils._docstring import _n_jobs_docstring from ...utils._param_validation import HasMethods, Interval from ..base import BaseUnderSampler @@ -303,6 +305,7 @@ def _fit_resample(self, X, y): return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) # fmt: off + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return { "sample_indices": True, diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 7277a3c99..9d0f1831b 100644 --- 
a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -12,10 +12,12 @@ from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors from sklearn.utils import _safe_indexing +from sklearn.utils.metaestimators import available_if from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring from ...utils._param_validation import HasMethods, Hidden, Interval, StrOptions +from ...utils.fixes import check_version_package from ..base import BaseCleaningSampler from ._edited_nearest_neighbours import EditedNearestNeighbours @@ -256,5 +258,6 @@ def _fit_resample(self, X, y): _safe_indexing(y, self.sample_indices_), ) + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index bfd3449bd..f02b25778 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -12,8 +12,10 @@ from sklearn.base import clone from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import _safe_indexing, check_random_state +from sklearn.utils.metaestimators import available_if from ...utils import Substitution +from ...utils.fixes import check_version_package from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation import HasMethods, Interval from ..base import BaseCleaningSampler @@ -225,5 +227,6 @@ def estimator_(self): ) return self.estimators_[-1] + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index f914b7882..7dcfbe4fb 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -6,9 +6,14 @@ import numpy as np from sklearn.utils import _safe_indexing, check_random_state +from sklearn.utils.metaestimators import available_if from ...utils import Substitution, check_target_type -from ...utils.fixes import _check_n_features, _check_feature_names +from ...utils.fixes import ( + _check_n_features, + _check_feature_names, + check_version_package, +) from ...utils._docstring import _random_state_docstring from ...utils._validation import _check_X from ..base import BaseUnderSampler @@ -132,6 +137,7 @@ def _fit_resample(self, X, y): return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return { "X_types": ["2darray", "string", "sparse", "dataframe"], diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index b0f954959..254d482c9 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -10,8 +10,10 @@ import numpy as np from sklearn.neighbors import NearestNeighbors from sklearn.utils import _safe_indexing +from sklearn.utils.metaestimators import available_if from ...utils import Substitution +from 
...utils.fixes import check_version_package from ...utils._docstring import _n_jobs_docstring from ..base import BaseCleaningSampler @@ -156,5 +158,6 @@ def _fit_resample(self, X, y): _safe_indexing(y, self.sample_indices_), ) + @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/utils/fixes.py b/imblearn/utils/fixes.py index a38d29847..d960f5374 100644 --- a/imblearn/utils/fixes.py +++ b/imblearn/utils/fixes.py @@ -161,3 +161,44 @@ def _check_feature_names(estimator, X, *, reset): return estimator._check_feature_names(X, reset=reset) else: from sklearn.utils.validation import _check_feature_names # type: ignore[no-redef] + + +def check_version_package(package, constraint, version, /): + """Create a function to check package version against a constraint. + + Parameters + ---------- + package : str + The package name to check version for. + constraint : {"<", "<=", ">", ">="} + The version constraint. + version : str + The version to compare against. + + Returns + ------- + callable + A function that takes an estimator and returns bool. + """ + operators = { + "<": lambda x, y: x < y, + "<=": lambda x, y: x <= y, + ">": lambda x, y: x > y, + ">=": lambda x, y: x >= y + } + + if constraint not in operators: + raise ValueError(f"Invalid constraint: {constraint}") + + op = operators[constraint] + parsed_version = parse_version(version) + + def check_version(estimator): + try: + pkg = __import__(package) + pkg_version = parse_version(parse_version(pkg.__version__).base_version) + return op(pkg_version, parsed_version) + except (ImportError, AttributeError): + return False + + return check_version From acb8234cd21c517422257db9178a2237446047d3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 11:44:27 +0100 Subject: [PATCH 08/20] iter --- imblearn/base.py | 16 +++- imblearn/ensemble/_bagging.py | 6 +- imblearn/ensemble/_easy_ensemble.py | 6 +- imblearn/ensemble/_forest.py | 9 +- imblearn/metrics/tests/test_classification.py | 12 ++- .../over_sampling/_random_over_sampler.py | 8 ++ imblearn/over_sampling/_smote/base.py | 18 ++++ .../_condensed_nearest_neighbour.py | 6 ++ .../_edited_nearest_neighbours.py | 18 ++++ .../_instance_hardness_threshold.py | 6 ++ .../_prototype_selection/_nearmiss.py | 6 ++ .../_neighbourhood_cleaning_rule.py | 6 ++ .../_one_sided_selection.py | 6 ++ .../_random_under_sampler.py | 8 ++ .../_prototype_selection/_tomek_links.py | 6 ++ imblearn/utils/_tags.py | 94 ++++++++++++++++++- .../utils/_test_common/instance_generator.py | 47 +++++++++- imblearn/utils/_validation.py | 6 +- imblearn/utils/estimator_checks.py | 2 +- imblearn/utils/fixes.py | 11 ++- imblearn/utils/tests/test_estimator_checks.py | 2 +- 21 files changed, 257 insertions(+), 42 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 5bc784f3f..d31a898a0 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -37,7 +37,7 @@ class attribute, which is a dictionary `param_name: list of constraints`. See ) -class SamplerMixin(_ParamsValidationMixin, BaseEstimator, metaclass=ABCMeta): +class SamplerMixin(_ParamsValidationMixin, metaclass=ABCMeta): """Mixin class for samplers with abstract method. Warning: This class should not be used directly. Use the derive classes @@ -135,7 +135,7 @@ def _fit_resample(self, X, y): pass -class BaseSampler(SamplerMixin, OneToOneFeatureMixin): +class BaseSampler(SamplerMixin, OneToOneFeatureMixin, BaseEstimator): """Base class for sampling algorithms. 
Warning: This class should not be used directly. Use the derive classes @@ -204,9 +204,15 @@ def _more_tags(self): @available_if(check_version_package("sklearn", ">=", "1.6")) def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - - from .utils._tags import InputTags + from .utils._tags import Tags, SamplerTags, TargetTags, InputTags + tags = Tags( + estimator_type="sampler", + target_tags=TargetTags(required=True), + transformer_tags=None, + regressor_tags=None, + classifier_tags=None, + sampler_tags=SamplerTags(), + ) tags.input_tags = InputTags() tags.input_tags.two_d_array = True tags.input_tags.sparse = True diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index 974759fa3..59bd7816f 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -382,17 +382,13 @@ def decision_function(self, X): check_is_fitted(self) # Check data - if sklearn_version < parse_version("1.6"): - kwargs = {"force_all_finite": False} - else: - kwargs = {"ensure_all_finite": False} X = validate_data( self, X=X, accept_sparse=["csr", "csc"], dtype=None, reset=False, - **kwargs + ensure_all_finite=False, ) # Parallel loop diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index cea77690f..35a7df2cd 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -310,17 +310,13 @@ def decision_function(self, X): check_is_fitted(self) # Check data - if sklearn_version < parse_version("1.6"): - kwargs = {"force_all_finite": False} - else: - kwargs = {"ensure_all_finite": False} X = validate_data( self, X=X, accept_sparse=["csr", "csc"], dtype=None, reset=False, - **kwargs, + ensure_all_finite=False, ) # Parallel loop diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index c2904e7e9..6b0fb686e 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -598,12 +598,9 @@ def fit(self, X, y, sample_weight=None): # TODO: remove when the minimum supported version of scipy will be 1.4 # Support for missing values if parse_version(sklearn_version.base_version) >= parse_version("1.4"): - if sklearn_version >= parse_version("1.6"): - kwargs = {"ensure_all_finite": False} - else: - kwargs = {"force_all_finite": False} + ensure_all_finite = False else: - kwargs = {"force_all_finite": False} + ensure_all_finite = False X, y = validate_data( self, @@ -612,7 +609,7 @@ def fit(self, X, y, sample_weight=None): multi_output=True, accept_sparse="csc", dtype=DTYPE, - **kwargs, + ensure_all_finite=ensure_all_finite, ) # TODO: remove when the minimum supported version of scikit-learn will be 1.4 diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 8169cee81..ec579e52d 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -4,6 +4,7 @@ # Christos Aridas # License: MIT +import warnings from functools import partial import numpy as np @@ -23,7 +24,6 @@ from sklearn.utils._testing import ( assert_allclose, assert_array_equal, - assert_no_warnings, ) from sklearn.utils.validation import check_random_state @@ -105,11 +105,13 @@ def test_sensitivity_specificity_score_binary(): # binary class case the score is the value of the measure for the positive # class (e.g. label == 1). This is deprecated for average != 'binary'. 
for kwargs in ({}, {"average": "binary"}): - sen = assert_no_warnings(sensitivity_score, y_true, y_pred, **kwargs) - assert sen == pytest.approx(0.68, rel=R_TOL) + with warnings.catch_warnings(): + warnings.simplefilter("error") + sen = sensitivity_score(y_true, y_pred, **kwargs) + assert sen == pytest.approx(0.68, rel=R_TOL) - spe = assert_no_warnings(specificity_score, y_true, y_pred, **kwargs) - assert spe == pytest.approx(0.88, rel=R_TOL) + spe = specificity_score(y_true, y_pred, **kwargs) + assert spe == pytest.approx(0.88, rel=R_TOL) @pytest.mark.filterwarnings("ignore:Specificity is ill-defined") diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index eca331986..5495491d7 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -261,3 +261,11 @@ def _more_tags(self): "check_complex_data": "Robust to this type of data.", }, } + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.string = True + tags.sampler_tags.sample_indices = True + return tags diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index c7decb9da..261a58d87 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -829,6 +829,17 @@ def ohe_(self): ) return self.categorical_encoder_ + @available_if(check_version_package("sklearn", "<", "1.6")) + def _more_tags(self): + return {"X_types": ["2darray", "dataframe", "string"]} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = False + tags.input_tags.string = True + return tags + @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, @@ -1073,3 +1084,10 @@ def _fit_resample(self, X, y): @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"X_types": ["2darray", "dataframe", "string"]} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = False + tags.input_tags.string = True + return tags diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 803fa6858..987fe0541 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -264,3 +264,9 @@ def estimator_(self): @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.sampler_tags.sample_indices = True + return tags diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 6133dae66..b31faf440 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -194,6 +194,12 @@ def _fit_resample(self, X, y): def _more_tags(self): return {"sample_indices": True} + 
@available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.sampler_tags.sample_indices = True + return tags + @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, @@ -416,6 +422,12 @@ def _fit_resample(self, X, y): def _more_tags(self): return {"sample_indices": True} + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.sampler_tags.sample_indices = True + return tags + @Substitution( sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, @@ -625,3 +637,9 @@ def _fit_resample(self, X, y): @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.sampler_tags.sample_indices = True + return tags diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 94977784b..02a2ab38b 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -205,3 +205,9 @@ def _fit_resample(self, X, y): @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.sampler_tags.sample_indices = True + return tags diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 0817694da..5b9734e04 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -315,3 +315,9 @@ def _more_tags(self): } } # fmt: on + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.sampler_tags.sample_indices = True + return tags diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 9d0f1831b..53395a9e8 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -261,3 +261,9 @@ def _fit_resample(self, X, y): @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.sampler_tags.sample_indices = True + return tags diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index f02b25778..72e29dafe 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -230,3 +230,9 @@ def estimator_(self): @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def 
__sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.sampler_tags.sample_indices = True + return tags diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 7dcfbe4fb..1b0130eb4 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -147,3 +147,11 @@ def _more_tags(self): "check_complex_data": "Robust to this type of data.", }, } + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.string = True + tags.sampler_tags.sample_indices = True + return tags diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 254d482c9..b21ab9190 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -161,3 +161,9 @@ def _fit_resample(self, X, y): @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): return {"sample_indices": True} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.sampler_tags.sample_indices = True + return tags diff --git a/imblearn/utils/_tags.py b/imblearn/utils/_tags.py index 5a43b4d52..7d59e73a8 100644 --- a/imblearn/utils/_tags.py +++ b/imblearn/utils/_tags.py @@ -1,13 +1,101 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field import sklearn from sklearn.utils.fixes import parse_version +from .fixes import _dataclass_args sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) if sklearn_version >= parse_version("1.6"): - from sklearn.utils._tags import InputTags + from sklearn.utils._tags import ( + TargetTags, + TransformerTags, + ClassifierTags, + RegressorTags, + InputTags, + ) - @dataclass + @dataclass(**_dataclass_args()) class InputTags(InputTags): dataframe: bool = True + + @dataclass(**_dataclass_args()) + class SamplerTags: + """Tags for the sampler. + + Parameters + ---------- + sample_indices : bool, default=False + Whether the sampler returns the indices of the samples that were + selected. + """ + + sample_indices: bool = False + + +@dataclass(**_dataclass_args()) +class Tags: + """Tags for the estimator. + + See :ref:`estimator_tags` for more information. + + Parameters + ---------- + estimator_type : str or None + The type of the estimator. Can be one of: + - "classifier" + - "regressor" + - "transformer" + - "clusterer" + - "outlier_detector" + - "density_estimator" + + target_tags : :class:`TargetTags` + The target(y) tags. + + transformer_tags : :class:`TransformerTags` or None + The transformer tags. + + classifier_tags : :class:`ClassifierTags` or None + The classifier tags. + + regressor_tags : :class:`RegressorTags` or None + The regressor tags. + + sampler_tags : :class:`SamplerTags` or None + The sampler tags. + + array_api_support : bool, default=False + Whether the estimator supports Array API compatible inputs. + + no_validation : bool, default=False + Whether the estimator skips input-validation. This is only meant for + stateless and dummy transformers! 
+ + non_deterministic : bool, default=False + Whether the estimator is not deterministic given a fixed ``random_state``. + + requires_fit : bool, default=True + Whether the estimator requires to be fitted before calling one of + `transform`, `predict`, `predict_proba`, or `decision_function`. + + _skip_test : bool, default=False + Whether to skip common tests entirely. Don't use this unless + you have a *very good* reason. + + input_tags : :class:`InputTags` + The input data(X) tags. + """ + + estimator_type: str | None + target_tags: TargetTags + transformer_tags: TransformerTags | None + classifier_tags: ClassifierTags | None + regressor_tags: RegressorTags | None + sampler_tags: SamplerTags | None + array_api_support: bool = False + no_validation: bool = False + non_deterministic: bool = False + requires_fit: bool = True + _skip_test: bool = False + input_tags: InputTags = field(default_factory=InputTags) diff --git a/imblearn/utils/_test_common/instance_generator.py b/imblearn/utils/_test_common/instance_generator.py index 455427967..0d7b8fbea 100644 --- a/imblearn/utils/_test_common/instance_generator.py +++ b/imblearn/utils/_test_common/instance_generator.py @@ -13,19 +13,56 @@ from sklearn.exceptions import SkipTestWarning from sklearn.utils._testing import SkipTest -from imblearn.over_sampling import SMOTENC +from imblearn.combine import SMOTEENN, SMOTETomek +from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier +from imblearn.over_sampling import ( + ADASYN, + BorderlineSMOTE, + KMeansSMOTE, + RandomOverSampler, + SMOTE, + SMOTEN, + SMOTENC, + SVMSMOTE, +) from imblearn.pipeline import Pipeline -from imblearn.under_sampling import NearMiss, RandomUnderSampler +from imblearn.under_sampling import ( + ClusterCentroids, + CondensedNearestNeighbour, + InstanceHardnessThreshold, + NearMiss, + OneSidedSelection, + RandomUnderSampler, +) from imblearn.utils.testing import all_estimators # The following dictionary is to indicate constructor arguments suitable for the test # suite, which uses very small datasets, and is intended to run rather quickly. INIT_PARAMS = { - NearMiss: [dict(version=1), dict(version=2), dict(version=3)], + # estimator + BalancedBaggingClassifier: dict(random_state=42), + BalancedRandomForestClassifier: dict(random_state=42), Pipeline: dict( steps=[("sampler", RandomUnderSampler()), ("logistic", LogisticRegression())] ), - SMOTENC: dict(categorical_features=[0]), + # over-sampling + ADASYN: dict(random_state=42), + BorderlineSMOTE: dict(random_state=42), + KMeansSMOTE: dict(random_state=0), + RandomOverSampler: dict(random_state=42), + SMOTE: dict(random_state=42), + SMOTEN: dict(random_state=42), + SVMSMOTE: dict(random_state=42), + # under-sampling + ClusterCentroids: dict(random_state=42), + CondensedNearestNeighbour: dict(random_state=42), + InstanceHardnessThreshold: dict(random_state=42), + NearMiss: [dict(version=1), dict(version=2), dict(version=3)], + OneSidedSelection: dict(random_state=42), + RandomUnderSampler: dict(random_state=42), + # combination + SMOTEENN: dict(random_state=42), + SMOTETomek: dict(random_state=42), } # This dictionary stores parameters for specific checks. 
It also enables running the @@ -34,7 +71,7 @@ # TODO(devtools): allow third-party developers to pass test specific params to checks PER_ESTIMATOR_CHECK_PARAMS: dict = {} -SKIPPED_ESTIMATORS = [] +SKIPPED_ESTIMATORS = [SMOTENC] def _tested_estimators(type_filter=None): diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 66e637763..8a3e761a7 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -647,11 +647,7 @@ def _check_X(X): ) if _is_pandas_df(X): return X - if sklearn_version >= parse_version("1.6"): - kwargs = {"ensure_all_finite": False} - else: - kwargs = {"force_all_finite": False} return check_array( - X, dtype=None, accept_sparse=["csr", "csc"], **kwargs + X, dtype=None, accept_sparse=["csr", "csc"], ensure_all_finite=False ) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 83793a443..ffb5de129 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -598,7 +598,7 @@ def check_samplers_sample_indices(name, sampler_orig): sampler.fit_resample(X, y) tags = get_tags(sampler) if is_dataclass(tags): - sample_indices = getattr(tags, "sample_indices", None) + sample_indices = tags.sampler_tags.sample_indices else: # scikit-learn < 1.6 sample_indices = tags.get("sample_indices", None) diff --git a/imblearn/utils/fixes.py b/imblearn/utils/fixes.py index d960f5374..b5cec84d0 100644 --- a/imblearn/utils/fixes.py +++ b/imblearn/utils/fixes.py @@ -145,7 +145,9 @@ def _is_pandas_df(X): if sklearn_version < parse_version("1.6"): def validate_data(_estimator, **kwargs): - return _estimator._validate_data(**kwargs) + if "ensure_all_finite" in kwargs: + force_all_finite = kwargs.pop("ensure_all_finite") + return _estimator._validate_data(**kwargs, force_all_finite=force_all_finite) else: from sklearn.utils.validation import validate_data # type: ignore[no-redef] @@ -202,3 +204,10 @@ def check_version(estimator): return False return check_version + + +# TODO: Remove when python>=3.10 is the minimum supported version +def _dataclass_args(): + if sys.version_info < (3, 10): + return {} + return {"slots": True} diff --git a/imblearn/utils/tests/test_estimator_checks.py b/imblearn/utils/tests/test_estimator_checks.py index 0ebd495a1..71e44a896 100644 --- a/imblearn/utils/tests/test_estimator_checks.py +++ b/imblearn/utils/tests/test_estimator_checks.py @@ -79,7 +79,7 @@ def _check_X_y(self, X, y): y=y, reset=True, dtype=None, - force_all_finite=False, + ensure_all_finite=False, ) return X, y, binarize_y From 2453ca1f663e20823b27a1284599b2e35b9bfab1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 11:57:38 +0100 Subject: [PATCH 09/20] iter --- imblearn/over_sampling/_random_over_sampler.py | 2 +- imblearn/over_sampling/_smote/base.py | 2 +- .../_prototype_selection/_random_under_sampler.py | 2 +- imblearn/utils/_validation.py | 10 +++++----- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 5495491d7..6c47c6131 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -157,7 +157,7 @@ def __init__( def _check_X_y(self, X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X = _check_X(X) + X = _check_X(self, X) _check_n_features(self, X, reset=True) _check_feature_names(self, X, reset=True) return X, y, binarize_y diff --git 
a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 261a58d87..e18a5f455 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -608,7 +608,7 @@ def _check_X_y(self, X, y): features. """ y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X = _check_X(X) + X = _check_X(self, X) _check_n_features(self, X, reset=True) _check_feature_names(self, X, reset=True) return X, y, binarize_y diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 1b0130eb4..b2493c857 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -104,7 +104,7 @@ def __init__( def _check_X_y(self, X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X = _check_X(X) + X = _check_X(self, X) _check_n_features(self, X, reset=True) _check_feature_names(self, X, reset=True) return X, y, binarize_y diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 8a3e761a7..bf1a896b1 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -14,12 +14,12 @@ import sklearn from sklearn.base import clone from sklearn.neighbors import NearestNeighbors -from sklearn.utils import check_array, column_or_1d +from sklearn.utils import column_or_1d from sklearn.utils.fixes import parse_version from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _num_samples -from .fixes import _is_pandas_df +from .fixes import _is_pandas_df, validate_data SAMPLING_KIND = ( "over-sampling", @@ -638,7 +638,7 @@ def inner_f(*args, **kwargs): return inner_f -def _check_X(X): +def _check_X(estimator, X): """Check X and do not check it if a dataframe.""" n_samples = _num_samples(X) if n_samples < 1: @@ -648,6 +648,6 @@ def _check_X(X): if _is_pandas_df(X): return X - return check_array( - X, dtype=None, accept_sparse=["csr", "csc"], ensure_all_finite=False + return validate_data( + estimator, X, dtype=None, accept_sparse=["csr", "csc"], ensure_all_finite=False ) From ef735f484168e6e1c672528b0f6f2c25e6677498 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 12:02:24 +0100 Subject: [PATCH 10/20] iter --- imblearn/utils/fixes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/imblearn/utils/fixes.py b/imblearn/utils/fixes.py index b5cec84d0..69420fa7d 100644 --- a/imblearn/utils/fixes.py +++ b/imblearn/utils/fixes.py @@ -147,6 +147,8 @@ def _is_pandas_df(X): def validate_data(_estimator, **kwargs): if "ensure_all_finite" in kwargs: force_all_finite = kwargs.pop("ensure_all_finite") + else: + force_all_finite = True return _estimator._validate_data(**kwargs, force_all_finite=force_all_finite) else: from sklearn.utils.validation import validate_data # type: ignore[no-redef] From 1629b06bb32fa1394b85c3e56ece5be40afe80ea Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 17:35:12 +0100 Subject: [PATCH 11/20] iter --- imblearn/ensemble/_easy_ensemble.py | 15 +++++++--- imblearn/ensemble/_forest.py | 28 +++++++++++++------ imblearn/ensemble/_weight_boosting.py | 27 ++++++++++++------ .../utils/_test_common/instance_generator.py | 13 ++++++++- 4 files changed, 61 insertions(+), 22 deletions(-) diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index 35a7df2cd..1bcb2c1dd 100644 --- 
a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -346,12 +346,19 @@ def base_estimator_(self): def _get_estimator(self): if self.estimator is None: - return AdaBoostClassifier(algorithm="SAMME") + if parse_version("1.4") <= sklearn_version < parse_version("1.6"): + return AdaBoostClassifier(algorithm="SAMME") + else: + return AdaBoostClassifier() return self.estimator # TODO: remove when minimum supported version of scikit-learn is 1.5 @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): - # This code should not be called for scikit-learn >= 1.6 - # Therefore, get_tags corresponds to _safe_tags that returns a dict - return {"allow_nan": get_tags(self._get_estimator(), "allow_nan")} + return {"allow_nan": get_tags(self._get_estimator())["allow_nan"]} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = get_tags(self._get_estimator()).input_tags.allow_nan + return tags diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 6b0fb686e..386293185 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -5,6 +5,7 @@ import numbers from copy import deepcopy +from dataclasses import is_dataclass from warnings import warn import numpy as np @@ -36,7 +37,7 @@ from ..utils._docstring import _n_jobs_docstring, _random_state_docstring from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils._validation import check_sampling_strategy -from ..utils.fixes import _fit_context, check_version_package, validate_data +from ..utils.fixes import _fit_context, check_version_package, get_tags, validate_data from ._common import _random_forest_classifier_parameter_constraints MAX_INT = np.iinfo(np.int32).max @@ -78,7 +79,7 @@ def _local_parallel_build_trees( "bootstrap": bootstrap, } - if parse_version(sklearn_version.base_version) >= parse_version("1.4"): + if sklearn_version >= parse_version("1.4"): # TODO: remove when the minimum supported version of scikit-learn will be 1.4 # support for missing values params_parallel_build_trees["missing_values_in_feature_mask"] = ( @@ -475,7 +476,7 @@ def __init__( "max_samples": max_samples, } # TODO: remove when the minimum supported version of scikit-learn will be 1.4 - if parse_version(sklearn_version.base_version) >= parse_version("1.4"): + if sklearn_version >= parse_version("1.4"): # use scikit-learn support for monotonic constraints params_random_forest["monotonic_cst"] = monotonic_cst else: @@ -595,12 +596,12 @@ def fit(self, X, y, sample_weight=None): if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") - # TODO: remove when the minimum supported version of scipy will be 1.4 - # Support for missing values - if parse_version(sklearn_version.base_version) >= parse_version("1.4"): - ensure_all_finite = False + # TODO (1.6): simplify because we will only have dataclass tags + tags = get_tags(self) + if is_dataclass(tags): + ensure_all_finite = not tags.input_tags.allow_nan else: - ensure_all_finite = False + ensure_all_finite = not tags.get("allow_nan", False) X, y = validate_data( self, @@ -884,4 +885,13 @@ def _compute_oob_predictions(self, X, y): @available_if(check_version_package("sklearn", "<", "1.6")) def _more_tags(self): - return {"multioutput": False, "multilabel": False} + allow_nan = sklearn_version >= parse_version("1.4") + return {"multioutput": False, "multilabel": False, "allow_nan": 
allow_nan} + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = False + tags.classifier_tags.multi_label = False + tags.input_tags.allow_nan = sklearn_version >= parse_version("1.4") + return tags diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index 9da02255e..fbb77142e 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -10,6 +10,7 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.utils import _safe_indexing from sklearn.utils.fixes import parse_version +from sklearn.utils.metaestimators import available_if from sklearn.utils.validation import has_fit_parameter from ..base import _ParamsValidationMixin @@ -18,8 +19,8 @@ from ..under_sampling.base import BaseUnderSampler from ..utils import Substitution, check_target_type from ..utils._docstring import _random_state_docstring -from ..utils._param_validation import Interval, StrOptions -from ..utils.fixes import _fit_context +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.fixes import _fit_context, check_version_package from ._common import _adaboost_classifier_parameter_constraints sklearn_version = parse_version(sklearn.__version__) @@ -58,7 +59,7 @@ class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier): ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``. - algorithm : {{'SAMME', 'SAMME.R'}}, default='SAMME.R' + algorithm : {{'SAMME', 'SAMME.R'}}, default='deprecated' If 'SAMME.R' then use the SAMME.R real boosting algorithm. ``base_estimator`` must support calculation of class probabilities. If 'SAMME' then use the SAMME discrete boosting algorithm. @@ -66,8 +67,10 @@ class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier): achieving a lower test error with fewer boosting iterations. .. deprecated:: 0.12 - `"SAMME.R"` is deprecated and will be removed in version 0.14. - '"SAMME"' will become the default. + `algorithm` is deprecated in 0.12 and will be removed in 0.14. + Depending on the `scikit-learn` version, the "SAMME.R" algorithm might not + be available. Refer to the documentation of + :class:`~sklearn.ensemble.AdaBoostClassifier` for more information. {sampling_strategy} @@ -109,7 +112,7 @@ class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier): ensemble. feature_importances_ : ndarray of shape (n_features,) - The feature importances if supported by the ``base_estimator``. + The feature importances if supported by the ``estimator``. n_features_in_ : int Number of features in the input dataset. 
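# A minimal sketch (not part of the patch series) of the version-gated tag
# access pattern the hunks above rely on: with scikit-learn < 1.6 the tags are
# a plain dict coming from `_more_tags`/`_safe_tags`, while from 1.6 on they
# are dataclasses returned by `__sklearn_tags__`. It assumes
# `imblearn.utils.fixes.get_tags` normalizes both code paths, as the forest
# `fit` hunk above does; the helper name is illustrative.
from dataclasses import is_dataclass

from imblearn.utils.fixes import get_tags


def estimator_allows_nan(estimator):
    """Read `allow_nan` regardless of the installed scikit-learn version."""
    tags = get_tags(estimator)
    if is_dataclass(tags):
        # scikit-learn >= 1.6: dataclass-based tags
        return tags.input_tags.allow_nan
    # scikit-learn < 1.6: dict-based tags
    return tags.get("allow_nan", False)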
@@ -167,6 +170,10 @@ class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier): _parameter_constraints.update( { + "algorithm": [ + StrOptions({"SAMME", "SAMME.R"}), + Hidden(StrOptions({"deprecated"})), + ], "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "majority", "not minority", "not majority", "all"}), @@ -186,7 +193,7 @@ def __init__( *, n_estimators=50, learning_rate=1.0, - algorithm="SAMME.R", + algorithm="deprecated", sampling_strategy="auto", replacement=False, random_state=None, @@ -194,9 +201,9 @@ def __init__( super().__init__( n_estimators=n_estimators, learning_rate=learning_rate, - algorithm=algorithm, random_state=random_state, ) + self.algorithm = algorithm self.estimator = estimator self.sampling_strategy = sampling_strategy self.replacement = replacement @@ -394,3 +401,7 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state): sample_weight *= np.exp(estimator_weight * incorrect * (sample_weight > 0)) return sample_weight, estimator_weight, estimator_error + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def _boost(self, iboost, X, y, sample_weight, random_state): + return self._boost_discrete(iboost, X, y, sample_weight, random_state) \ No newline at end of file diff --git a/imblearn/utils/_test_common/instance_generator.py b/imblearn/utils/_test_common/instance_generator.py index 0d7b8fbea..f83111854 100644 --- a/imblearn/utils/_test_common/instance_generator.py +++ b/imblearn/utils/_test_common/instance_generator.py @@ -10,11 +10,16 @@ from sklearn import clone, config_context from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier from sklearn.exceptions import SkipTestWarning from sklearn.utils._testing import SkipTest from imblearn.combine import SMOTEENN, SMOTETomek -from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier +from imblearn.ensemble import ( + BalancedBaggingClassifier, + BalancedRandomForestClassifier, + EasyEnsembleClassifier, +) from imblearn.over_sampling import ( ADASYN, BorderlineSMOTE, @@ -42,6 +47,12 @@ # estimator BalancedBaggingClassifier: dict(random_state=42), BalancedRandomForestClassifier: dict(random_state=42), + EasyEnsembleClassifier: [ + # AdaBoostClassifier does not allow nan values + dict(random_state=42), + # DecisionTreeClassifier allows nan values + dict(estimator=DecisionTreeClassifier(random_state=42), random_state=42), + ], Pipeline: dict( steps=[("sampler", RandomUnderSampler()), ("logistic", LogisticRegression())] ), From 7c91d5dd255b4029ec60f4a182bbb3a14ee04205 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 17:42:05 +0100 Subject: [PATCH 12/20] iter --- .../plot_comparison_ensemble_classifier.py | 2 +- imblearn/ensemble/tests/test_easy_ensemble.py | 17 ++++++----------- imblearn/utils/_validation.py | 6 +++++- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/examples/ensemble/plot_comparison_ensemble_classifier.py b/examples/ensemble/plot_comparison_ensemble_classifier.py index 8c318e5bc..602e477e5 100644 --- a/examples/ensemble/plot_comparison_ensemble_classifier.py +++ b/examples/ensemble/plot_comparison_ensemble_classifier.py @@ -197,7 +197,7 @@ from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier -estimator = AdaBoostClassifier(n_estimators=10, algorithm="SAMME") +estimator = AdaBoostClassifier(n_estimators=10) eec = EasyEnsembleClassifier(n_estimators=10, estimator=estimator) eec.fit(X_train, 
y_train) y_pred_eec = eec.predict(X_test) diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index 6fe50500f..5d3b23fd2 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -37,13 +37,10 @@ Y = np.array([1, 2, 2, 2, 1, 0, 1, 1, 1, 0]) -@pytest.mark.parametrize("n_estimators", [10, 20]) +@pytest.mark.parametrize("n_estimators", [5, 10]) @pytest.mark.parametrize( "estimator", - [ - AdaBoostClassifier(algorithm="SAMME", n_estimators=5), - AdaBoostClassifier(algorithm="SAMME", n_estimators=10), - ], + [AdaBoostClassifier(n_estimators=5), AdaBoostClassifier(n_estimators=10)], ) def test_easy_ensemble_classifier(n_estimators, estimator): # Check classification for various parameter settings. @@ -89,7 +86,7 @@ def test_estimator(): assert isinstance(ensemble.estimator_.steps[-1][1], AdaBoostClassifier) ensemble = EasyEnsembleClassifier( - 2, AdaBoostClassifier(algorithm="SAMME"), n_jobs=-1, random_state=0 + 2, AdaBoostClassifier(), n_jobs=-1, random_state=0 ).fit(X_train, y_train) assert isinstance(ensemble.estimator_.steps[-1][1], AdaBoostClassifier) @@ -104,9 +101,7 @@ def test_bagging_with_pipeline(): ) estimator = EasyEnsembleClassifier( n_estimators=2, - estimator=make_pipeline( - SelectKBest(k=1), AdaBoostClassifier(algorithm="SAMME") - ), + estimator=make_pipeline(SelectKBest(k=1), AdaBoostClassifier()), ) estimator.fit(X, y).predict(X) @@ -198,7 +193,7 @@ def test_easy_ensemble_classifier_single_estimator(): clf1 = EasyEnsembleClassifier(n_estimators=1, random_state=0).fit(X_train, y_train) clf2 = make_pipeline( RandomUnderSampler(random_state=0), - AdaBoostClassifier(algorithm="SAMME", random_state=0), + AdaBoostClassifier(random_state=0), ).fit(X_train, y_train) assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) @@ -217,7 +212,7 @@ def test_easy_ensemble_classifier_grid_search(): "estimator__n_estimators": [3, 4], } grid_search = GridSearchCV( - EasyEnsembleClassifier(estimator=AdaBoostClassifier(algorithm="SAMME")), + EasyEnsembleClassifier(estimator=AdaBoostClassifier()), parameters, cv=5, ) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index bf1a896b1..8a9745c37 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -649,5 +649,9 @@ def _check_X(estimator, X): return X return validate_data( - estimator, X, dtype=None, accept_sparse=["csr", "csc"], ensure_all_finite=False + estimator, + X=X, + dtype=None, + accept_sparse=["csr", "csc"], + ensure_all_finite=False, ) From 5d12d07717dfe41da839562b595f6b8a0355f4a1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 17:50:33 +0100 Subject: [PATCH 13/20] more fix --- imblearn/metrics/pairwise.py | 6 ++++++ imblearn/tests/test_docstring_parameters.py | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index 12e221935..ff6fa39b5 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -235,3 +235,9 @@ def _more_tags(self): return { "requires_positive_X": True, # X should be encoded with OrdinalEncoder } + + @available_if(check_version_package("sklearn", ">=", "1.6")) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True # X should be encoded with OrdinalEncoder + return tags diff --git a/imblearn/tests/test_docstring_parameters.py b/imblearn/tests/test_docstring_parameters.py index 
03b66b3cb..5cf5a6c2e 100644 --- a/imblearn/tests/test_docstring_parameters.py +++ b/imblearn/tests/test_docstring_parameters.py @@ -18,7 +18,6 @@ ) from sklearn.utils.deprecation import _is_deprecated from sklearn.utils.estimator_checks import ( - _construct_instance, _enforce_estimator_tags_X, _enforce_estimator_tags_y, ) @@ -27,6 +26,7 @@ from imblearn.base import is_sampler from imblearn.utils.estimator_checks import _set_checking_parameters from imblearn.utils.testing import all_estimators +from imblearn.utils._test_common.instance_generator import _construct_instances # walk_packages() ignores DeprecationWarnings, now we need to ignore # FutureWarnings @@ -179,7 +179,7 @@ def test_fit_docstring_attributes(name, Estimator): if Estimator.__name__ == "Pipeline": est = _construct_compose_pipeline_instance(Estimator) else: - est = _construct_instance(Estimator) + est = next(_construct_instances(Estimator)) _set_checking_parameters(est) X, y = make_classification( From e74293a91ce92a206abb5615dd4e8ec8f01eb15b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 17:55:46 +0100 Subject: [PATCH 14/20] iter --- imblearn/ensemble/_weight_boosting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index fbb77142e..8c0d41af2 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -402,6 +402,5 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state): return sample_weight, estimator_weight, estimator_error - @available_if(check_version_package("sklearn", ">=", "1.6")) def _boost(self, iboost, X, y, sample_weight, random_state): return self._boost_discrete(iboost, X, y, sample_weight, random_state) \ No newline at end of file From a33b9f863f14ace8361c57c8202249f179b149b8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 17:57:46 +0100 Subject: [PATCH 15/20] iter --- imblearn/ensemble/_weight_boosting.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index 8c0d41af2..82c009d0a 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -403,4 +403,8 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state): return sample_weight, estimator_weight, estimator_error def _boost(self, iboost, X, y, sample_weight, random_state): - return self._boost_discrete(iboost, X, y, sample_weight, random_state) \ No newline at end of file + if self.algorithm == "SAMME.R": + return self._boost_real(iboost, X, y, sample_weight, random_state) + + else: # elif self.algorithm == "SAMME": + return self._boost_discrete(iboost, X, y, sample_weight, random_state) \ No newline at end of file From 7878bb29b7fa0c6a13552d82ed3752ccb8626f48 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 18:04:58 +0100 Subject: [PATCH 16/20] iter --- imblearn/ensemble/tests/test_easy_ensemble.py | 7 ++----- imblearn/pipeline.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index 5d3b23fd2..472182060 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -1,4 +1,5 @@ """Test the module easy ensemble.""" + # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT @@ -211,9 +212,5 @@ def 
test_easy_ensemble_classifier_grid_search(): "n_estimators": [1, 2], "estimator__n_estimators": [3, 4], } - grid_search = GridSearchCV( - EasyEnsembleClassifier(estimator=AdaBoostClassifier()), - parameters, - cv=5, - ) + grid_search = GridSearchCV(EasyEnsembleClassifier(), parameters, cv=5) grid_search.fit(X, y) diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index 7453446ad..654d1a5a2 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -1059,7 +1059,7 @@ def _fit_resample_one(sampler, X, y, message_clsname="", message=None, params=No return X_res, y_res, sampler -def _transform_one(transformer, X, y, weight, params): +def _transform_one(transformer, X, y, weight, params=None): """Call transform and apply weight to output. Parameters From 8903b00d526cea2b756db40bcf825920a9323c56 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 12 Nov 2024 18:12:01 +0100 Subject: [PATCH 17/20] iter --- imblearn/utils/_tags.py | 131 ++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/imblearn/utils/_tags.py b/imblearn/utils/_tags.py index 7d59e73a8..84b5de60c 100644 --- a/imblearn/utils/_tags.py +++ b/imblearn/utils/_tags.py @@ -32,70 +32,69 @@ class SamplerTags: sample_indices: bool = False + @dataclass(**_dataclass_args()) + class Tags: + """Tags for the estimator. + + See :ref:`estimator_tags` for more information. + + Parameters + ---------- + estimator_type : str or None + The type of the estimator. Can be one of: + - "classifier" + - "regressor" + - "transformer" + - "clusterer" + - "outlier_detector" + - "density_estimator" + + target_tags : :class:`TargetTags` + The target(y) tags. + + transformer_tags : :class:`TransformerTags` or None + The transformer tags. + + classifier_tags : :class:`ClassifierTags` or None + The classifier tags. + + regressor_tags : :class:`RegressorTags` or None + The regressor tags. + + sampler_tags : :class:`SamplerTags` or None + The sampler tags. + + array_api_support : bool, default=False + Whether the estimator supports Array API compatible inputs. + + no_validation : bool, default=False + Whether the estimator skips input-validation. This is only meant for + stateless and dummy transformers! + + non_deterministic : bool, default=False + Whether the estimator is not deterministic given a fixed ``random_state``. + + requires_fit : bool, default=True + Whether the estimator requires to be fitted before calling one of + `transform`, `predict`, `predict_proba`, or `decision_function`. + + _skip_test : bool, default=False + Whether to skip common tests entirely. Don't use this unless + you have a *very good* reason. + + input_tags : :class:`InputTags` + The input data(X) tags. + """ -@dataclass(**_dataclass_args()) -class Tags: - """Tags for the estimator. - - See :ref:`estimator_tags` for more information. - - Parameters - ---------- - estimator_type : str or None - The type of the estimator. Can be one of: - - "classifier" - - "regressor" - - "transformer" - - "clusterer" - - "outlier_detector" - - "density_estimator" - - target_tags : :class:`TargetTags` - The target(y) tags. - - transformer_tags : :class:`TransformerTags` or None - The transformer tags. - - classifier_tags : :class:`ClassifierTags` or None - The classifier tags. - - regressor_tags : :class:`RegressorTags` or None - The regressor tags. - - sampler_tags : :class:`SamplerTags` or None - The sampler tags. - - array_api_support : bool, default=False - Whether the estimator supports Array API compatible inputs. 
- - no_validation : bool, default=False - Whether the estimator skips input-validation. This is only meant for - stateless and dummy transformers! - - non_deterministic : bool, default=False - Whether the estimator is not deterministic given a fixed ``random_state``. - - requires_fit : bool, default=True - Whether the estimator requires to be fitted before calling one of - `transform`, `predict`, `predict_proba`, or `decision_function`. - - _skip_test : bool, default=False - Whether to skip common tests entirely. Don't use this unless - you have a *very good* reason. - - input_tags : :class:`InputTags` - The input data(X) tags. - """ - - estimator_type: str | None - target_tags: TargetTags - transformer_tags: TransformerTags | None - classifier_tags: ClassifierTags | None - regressor_tags: RegressorTags | None - sampler_tags: SamplerTags | None - array_api_support: bool = False - no_validation: bool = False - non_deterministic: bool = False - requires_fit: bool = True - _skip_test: bool = False - input_tags: InputTags = field(default_factory=InputTags) + estimator_type: str | None + target_tags: TargetTags + transformer_tags: TransformerTags | None + classifier_tags: ClassifierTags | None + regressor_tags: RegressorTags | None + sampler_tags: SamplerTags | None + array_api_support: bool = False + no_validation: bool = False + non_deterministic: bool = False + requires_fit: bool = True + _skip_test: bool = False + input_tags: InputTags = field(default_factory=InputTags) From 468f92564b47fced5bfc4008096b0955a4e65763 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Nov 2024 22:40:14 +0100 Subject: [PATCH 18/20] iter --- imblearn/ensemble/_weight_boosting.py | 10 +++++++--- imblearn/ensemble/tests/test_weight_boosting.py | 14 ++++++++++++-- imblearn/pipeline.py | 4 ++++ imblearn/utils/_tags.py | 16 +++------------- .../utils/_test_common/instance_generator.py | 5 ++++- 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py index 82c009d0a..0caaae27e 100644 --- a/imblearn/ensemble/_weight_boosting.py +++ b/imblearn/ensemble/_weight_boosting.py @@ -1,5 +1,6 @@ import copy import numbers +import warnings from copy import deepcopy import numpy as np @@ -68,9 +69,6 @@ class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier): .. deprecated:: 0.12 `algorithm` is deprecated in 0.12 and will be removed in 0.14. - Depending on the `scikit-learn` version, the "SAMME.R" algorithm might not - be available. Refer to the documentation of - :class:`~sklearn.ensemble.AdaBoostClassifier` for more information. {sampling_strategy} @@ -403,6 +401,12 @@ def _boost_discrete(self, iboost, X, y, sample_weight, random_state): return sample_weight, estimator_weight, estimator_error def _boost(self, iboost, X, y, sample_weight, random_state): + if self.algorithm != "deprecated": + warnings.warn( + "`algorithm` parameter is deprecated in 0.12 and will be removed in " + "0.14. 
In the future, the SAMME algorithm will always be used.", + FutureWarning, + ) if self.algorithm == "SAMME.R": return self._boost_real(iboost, X, y, sample_weight, random_state) diff --git a/imblearn/ensemble/tests/test_weight_boosting.py b/imblearn/ensemble/tests/test_weight_boosting.py index 8096a2b16..89589d248 100644 --- a/imblearn/ensemble/tests/test_weight_boosting.py +++ b/imblearn/ensemble/tests/test_weight_boosting.py @@ -24,7 +24,7 @@ def imbalanced_dataset(): @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) -@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm (the default) is") +@pytest.mark.filterwarnings("ignore:`algorithm` parameter is deprecated in 0.12") def test_rusboost(imbalanced_dataset, algorithm): X, y = imbalanced_dataset X_train, X_test, y_train, y_test = train_test_split( @@ -70,7 +70,7 @@ def test_rusboost(imbalanced_dataset, algorithm): @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) -@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm (the default) is") +@pytest.mark.filterwarnings("ignore:`algorithm` parameter is deprecated in 0.12") def test_rusboost_sample_weight(imbalanced_dataset, algorithm): X, y = imbalanced_dataset sample_weight = np.ones_like(y) @@ -88,3 +88,13 @@ def test_rusboost_sample_weight(imbalanced_dataset, algorithm): with pytest.raises(AssertionError): assert_array_equal(y_pred_no_sample_weight, y_pred_sample_weight) + + +@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) +def test_rusboost_algorithm_future_warning(imbalanced_dataset, algorithm): + X, y = imbalanced_dataset + rusboost = RUSBoostClassifier(algorithm=algorithm, random_state=0) + + warning_msg = "`algorithm` parameter is deprecated in 0.12" + with pytest.warns(FutureWarning, match=warning_msg): + rusboost.fit(X, y) diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index 654d1a5a2..c97d94995 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -95,6 +95,10 @@ class Pipeline(_ParamsValidationMixin, pipeline.Pipeline): n_features_in_ : int Number of features seen during first step `fit` method. + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + See Also -------- make_pipeline : Helper function to make pipeline. diff --git a/imblearn/utils/_tags.py b/imblearn/utils/_tags.py index 84b5de60c..9b73eeb70 100644 --- a/imblearn/utils/_tags.py +++ b/imblearn/utils/_tags.py @@ -8,6 +8,7 @@ if sklearn_version >= parse_version("1.6"): from sklearn.utils._tags import ( + Tags, TargetTags, TransformerTags, ClassifierTags, @@ -33,7 +34,7 @@ class SamplerTags: sample_indices: bool = False @dataclass(**_dataclass_args()) - class Tags: + class Tags(Tags): """Tags for the estimator. See :ref:`estimator_tags` for more information. @@ -86,15 +87,4 @@ class Tags: The input data(X) tags. 
""" - estimator_type: str | None - target_tags: TargetTags - transformer_tags: TransformerTags | None - classifier_tags: ClassifierTags | None - regressor_tags: RegressorTags | None - sampler_tags: SamplerTags | None - array_api_support: bool = False - no_validation: bool = False - non_deterministic: bool = False - requires_fit: bool = True - _skip_test: bool = False - input_tags: InputTags = field(default_factory=InputTags) + sampler_tags: SamplerTags | None = None diff --git a/imblearn/utils/_test_common/instance_generator.py b/imblearn/utils/_test_common/instance_generator.py index f83111854..64ee971e2 100644 --- a/imblearn/utils/_test_common/instance_generator.py +++ b/imblearn/utils/_test_common/instance_generator.py @@ -54,7 +54,10 @@ dict(estimator=DecisionTreeClassifier(random_state=42), random_state=42), ], Pipeline: dict( - steps=[("sampler", RandomUnderSampler()), ("logistic", LogisticRegression())] + steps=[ + ("sampler", RandomUnderSampler(random_state=0)), + ("logistic", LogisticRegression()), + ] ), # over-sampling ADASYN: dict(random_state=42), From c457b4ab8167c6dac440eb65c8557b390f9f754d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Nov 2024 23:08:15 +0100 Subject: [PATCH 19/20] iter --- imblearn/pipeline.py | 228 ++++++++++++++++++++------------ imblearn/tests/test_pipeline.py | 9 +- 2 files changed, 148 insertions(+), 89 deletions(-) diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index c97d94995..cdb3d0ba5 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -2,6 +2,7 @@ The :mod:`imblearn.pipeline` module implements utilities to build a composite estimator, as a chain of transforms, samples and estimators. """ + # Adapted from scikit-learn # Author: Edouard Duchesnay @@ -12,13 +13,18 @@ # Christos Aridas # Guillaume Lemaitre # License: BSD +import warnings +from contextlib import contextmanager +from copy import deepcopy + import sklearn from sklearn import pipeline from sklearn.base import clone +from sklearn.exceptions import NotFittedError from sklearn.utils import Bunch from sklearn.utils.fixes import parse_version from sklearn.utils.metaestimators import available_if -from sklearn.utils.validation import check_memory +from sklearn.utils.validation import check_memory, check_is_fitted from .base import _ParamsValidationMixin from .utils._metadata_requests import ( @@ -30,7 +36,7 @@ process_routing, ) from .utils._param_validation import HasMethods, validate_params -from .utils.fixes import _fit_context +from .utils.fixes import _fit_context, get_tags METHODS.append("fit_resample") @@ -43,6 +49,31 @@ from sklearn.utils._user_interface import _print_elapsed_time +@contextmanager +def _raise_or_warn_if_not_fitted(estimator): + """A context manager to make sure a NotFittedError is raised, if a sub-estimator + raises the error. + Otherwise, we raise a warning if the pipeline is not fitted, with the deprecation. + TODO(1.8): remove this context manager and replace with check_is_fitted. + """ + try: + yield + except NotFittedError as exc: + raise NotFittedError("Pipeline is not fitted yet.") from exc + + # we only get here if the above didn't raise + try: + check_is_fitted(estimator) + except NotFittedError: + warnings.warn( + "This Pipeline instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using other methods such as transform, " + "predict, etc. 
This will raise an error in 1.8 instead of the current " + "warning.", + FutureWarning, + ) + + class Pipeline(_ParamsValidationMixin, pipeline.Pipeline): """Pipeline of transforms and resamples with a final estimator. @@ -456,18 +487,22 @@ def predict(self, X, **params): y_pred : ndarray Result of calling `predict` on the final estimator. """ - Xt = X + # TODO(1.8): Remove the context manager and use check_is_fitted(self) + with _raise_or_warn_if_not_fitted(self): + Xt = X - if not _routing_enabled(): - for _, name, transform in self._iter(with_final=False): - Xt = transform.transform(Xt) - return self.steps[-1][1].predict(Xt, **params) + if not _routing_enabled(): + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt) + return self.steps[-1][1].predict(Xt, **params) - # metadata routing enabled - routed_params = process_routing(self, "predict", **params) - for _, name, transform in self._iter(with_final=False): - Xt = transform.transform(Xt, **routed_params[name].transform) - return self.steps[-1][1].predict(Xt, **routed_params[self.steps[-1][0]].predict) + # metadata routing enabled + routed_params = process_routing(self, "predict", **params) + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt, **routed_params[name].transform) + return self.steps[-1][1].predict( + Xt, **routed_params[self.steps[-1][0]].predict + ) def _can_fit_resample(self): return self._final_estimator == "passthrough" or hasattr( @@ -646,20 +681,22 @@ def predict_proba(self, X, **params): y_proba : ndarray of shape (n_samples, n_classes) Result of calling `predict_proba` on the final estimator. """ - Xt = X + # TODO(1.8): Remove the context manager and use check_is_fitted(self) + with _raise_or_warn_if_not_fitted(self): + Xt = X + + if not _routing_enabled(): + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt) + return self.steps[-1][1].predict_proba(Xt, **params) - if not _routing_enabled(): + # metadata routing enabled + routed_params = process_routing(self, "predict_proba", **params) for _, name, transform in self._iter(with_final=False): - Xt = transform.transform(Xt) - return self.steps[-1][1].predict_proba(Xt, **params) - - # metadata routing enabled - routed_params = process_routing(self, "predict_proba", **params) - for _, name, transform in self._iter(with_final=False): - Xt = transform.transform(Xt, **routed_params[name].transform) - return self.steps[-1][1].predict_proba( - Xt, **routed_params[self.steps[-1][0]].predict_proba - ) + Xt = transform.transform(Xt, **routed_params[name].transform) + return self.steps[-1][1].predict_proba( + Xt, **routed_params[self.steps[-1][0]].predict_proba + ) @available_if(pipeline._final_estimator_has("decision_function")) def decision_function(self, X, **params): @@ -691,20 +728,23 @@ def decision_function(self, X, **params): y_score : ndarray of shape (n_samples, n_classes) Result of calling `decision_function` on the final estimator. 
""" - _raise_for_params(params, self, "decision_function") + # TODO(1.8): Remove the context manager and use check_is_fitted(self) + with _raise_or_warn_if_not_fitted(self): + _raise_for_params(params, self, "decision_function") - # not branching here since params is only available if - # enable_metadata_routing=True - routed_params = process_routing(self, "decision_function", **params) + # not branching here since params is only available if + # enable_metadata_routing=True + routed_params = process_routing(self, "decision_function", **params) - Xt = X - for _, name, transform in self._iter(with_final=False): - Xt = transform.transform( - Xt, **routed_params.get(name, {}).get("transform", {}) + Xt = X + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform( + Xt, **routed_params.get(name, {}).get("transform", {}) + ) + return self.steps[-1][1].decision_function( + Xt, + **routed_params.get(self.steps[-1][0], {}).get("decision_function", {}), ) - return self.steps[-1][1].decision_function( - Xt, **routed_params.get(self.steps[-1][0], {}).get("decision_function", {}) - ) @available_if(pipeline._final_estimator_has("score_samples")) def score_samples(self, X): @@ -726,10 +766,12 @@ def score_samples(self, X): y_score : ndarray of shape (n_samples,) Result of calling `score_samples` on the final estimator. """ - Xt = X - for _, _, transformer in self._iter(with_final=False): - Xt = transformer.transform(Xt) - return self.steps[-1][1].score_samples(Xt) + # TODO(1.8): Remove the context manager and use check_is_fitted(self) + with _raise_or_warn_if_not_fitted(self): + Xt = X + for _, _, transformer in self._iter(with_final=False): + Xt = transformer.transform(Xt) + return self.steps[-1][1].score_samples(Xt) @available_if(pipeline._final_estimator_has("predict_log_proba")) def predict_log_proba(self, X, **params): @@ -773,20 +815,22 @@ def predict_log_proba(self, X, **params): y_log_proba : ndarray of shape (n_samples, n_classes) Result of calling `predict_log_proba` on the final estimator. """ - Xt = X + # TODO(1.8): Remove the context manager and use check_is_fitted(self) + with _raise_or_warn_if_not_fitted(self): + Xt = X - if not _routing_enabled(): + if not _routing_enabled(): + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt) + return self.steps[-1][1].predict_log_proba(Xt, **params) + + # metadata routing enabled + routed_params = process_routing(self, "predict_log_proba", **params) for _, name, transform in self._iter(with_final=False): - Xt = transform.transform(Xt) - return self.steps[-1][1].predict_log_proba(Xt, **params) - - # metadata routing enabled - routed_params = process_routing(self, "predict_log_proba", **params) - for _, name, transform in self._iter(with_final=False): - Xt = transform.transform(Xt, **routed_params[name].transform) - return self.steps[-1][1].predict_log_proba( - Xt, **routed_params[self.steps[-1][0]].predict_log_proba - ) + Xt = transform.transform(Xt, **routed_params[name].transform) + return self.steps[-1][1].predict_log_proba( + Xt, **routed_params[self.steps[-1][0]].predict_log_proba + ) def _can_transform(self): return self._final_estimator == "passthrough" or hasattr( @@ -826,15 +870,17 @@ def transform(self, X, **params): Xt : ndarray of shape (n_samples, n_transformed_features) Transformed data. 
""" - _raise_for_params(params, self, "transform") - - # not branching here since params is only available if - # enable_metadata_routing=True - routed_params = process_routing(self, "transform", **params) - Xt = X - for _, name, transform in self._iter(): - Xt = transform.transform(Xt, **routed_params[name].transform) - return Xt + # TODO(1.8): Remove the context manager and use check_is_fitted(self) + with _raise_or_warn_if_not_fitted(self): + _raise_for_params(params, self, "transform") + + # not branching here since params is only available if + # enable_metadata_routing=True + routed_params = process_routing(self, "transform", **params) + Xt = X + for _, name, transform in self._iter(): + Xt = transform.transform(Xt, **routed_params[name].transform) + return Xt def _can_inverse_transform(self): return all(hasattr(t, "inverse_transform") for _, _, t in self._iter()) @@ -869,17 +915,19 @@ def inverse_transform(self, Xt, **params): Inverse transformed data, that is, data in the original feature space. """ - _raise_for_params(params, self, "inverse_transform") - - # we don't have to branch here, since params is only non-empty if - # enable_metadata_routing=True. - routed_params = process_routing(self, "inverse_transform", **params) - reverse_iter = reversed(list(self._iter())) - for _, name, transform in reverse_iter: - Xt = transform.inverse_transform( - Xt, **routed_params[name].inverse_transform - ) - return Xt + # TODO(1.8): Remove the context manager and use check_is_fitted(self) + with _raise_or_warn_if_not_fitted(self): + _raise_for_params(params, self, "inverse_transform") + + # we don't have to branch here, since params is only non-empty if + # enable_metadata_routing=True. + routed_params = process_routing(self, "inverse_transform", **params) + reverse_iter = reversed(list(self._iter())) + for _, name, transform in reverse_iter: + Xt = transform.inverse_transform( + Xt, **routed_params[name].inverse_transform + ) + return Xt @available_if(pipeline._final_estimator_has("score")) def score(self, X, y=None, sample_weight=None, **params): @@ -918,24 +966,28 @@ def score(self, X, y=None, sample_weight=None, **params): score : float Result of calling `score` on the final estimator. """ - Xt = X - if not _routing_enabled(): - for _, name, transform in self._iter(with_final=False): - Xt = transform.transform(Xt) - score_params = {} - if sample_weight is not None: - score_params["sample_weight"] = sample_weight - return self.steps[-1][1].score(Xt, y, **score_params) - - # metadata routing is enabled. - routed_params = process_routing( - self, "score", sample_weight=sample_weight, **params - ) + # TODO(1.8): Remove the context manager and use check_is_fitted(self) + with _raise_or_warn_if_not_fitted(self): + Xt = X + if not _routing_enabled(): + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt) + score_params = {} + if sample_weight is not None: + score_params["sample_weight"] = sample_weight + return self.steps[-1][1].score(Xt, y, **score_params) + + # metadata routing is enabled. 
+ routed_params = process_routing( + self, "score", sample_weight=sample_weight, **params + ) - Xt = X - for _, name, transform in self._iter(with_final=False): - Xt = transform.transform(Xt, **routed_params[name].transform) - return self.steps[-1][1].score(Xt, y, **routed_params[self.steps[-1][0]].score) + Xt = X + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt, **routed_params[name].transform) + return self.steps[-1][1].score( + Xt, y, **routed_params[self.steps[-1][0]].score + ) # TODO: once scikit-learn >= 1.4, the following function should be simplified by # calling `super().get_metadata_routing()` diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index ed90b263c..bafd50261 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -49,7 +49,7 @@ R_TOL = 1e-4 -class NoFit: +class NoFit(BaseEstimator): """Small class to test parameter dispatching.""" def __init__(self, a=None, b=None): @@ -109,6 +109,9 @@ def predict(self, X): def score(self, X, y=None): return np.sum(X) + def __sklearn_is_fitted__(self): + return True + class FitParamT(BaseEstimator): """Mock classifier""" @@ -118,6 +121,7 @@ def __init__(self): def fit(self, X, y, should_succeed=False): self.successful = should_succeed + self.fitted_ = True def predict(self, X): return self.successful @@ -146,6 +150,9 @@ def fit(self, X, y): class DummyEstimatorParams(BaseEstimator): """Mock classifier that takes params on predict""" + def __sklearn_is_fitted__(self): + return True + def fit(self, X, y): return self From 762fa4836a6912c1330fa91a126d8dc55eba9a87 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Nov 2024 23:54:28 +0100 Subject: [PATCH 20/20] iter --- imblearn/tests/test_common.py | 19 +++++++++- .../utils/_test_common/instance_generator.py | 37 ++++++++++++++++++- imblearn/utils/estimator_checks.py | 1 - 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 4028f439a..43028a33c 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -9,8 +9,10 @@ import numpy as np import pytest +import sklearn from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning +from sklearn.utils.fixes import parse_version from sklearn.utils._testing import ignore_warnings from sklearn.utils.estimator_checks import ( parametrize_with_checks as parametrize_with_checks_sklearn, @@ -27,9 +29,18 @@ from imblearn.utils.testing import all_estimators from imblearn.utils._test_common.instance_generator import ( _get_check_estimator_ids, + _get_expected_failed_checks, _tested_estimators, ) +sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) +if sklearn_version >= parse_version("1.6"): + kwargs_parametrize_with_checks = { + "expected_failed_checks": _get_expected_failed_checks + } +else: + kwargs_parametrize_with_checks = {} + @pytest.mark.parametrize("name, Estimator", all_estimators()) def test_all_estimator_no_base_class(name, Estimator): @@ -38,13 +49,17 @@ def test_all_estimator_no_base_class(name, Estimator): assert not name.lower().startswith("base"), msg -@parametrize_with_checks_sklearn(list(_tested_estimators())) +@parametrize_with_checks_sklearn( + list(_tested_estimators()), **kwargs_parametrize_with_checks +) def test_estimators_compatibility_sklearn(estimator, check, request): _set_checking_parameters(estimator) check(estimator) 
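
As context for the `expected_failed_checks` plumbing above: from scikit-learn 1.6
onwards, `parametrize_with_checks` accepts an `expected_failed_checks` callable that
maps an estimator instance to a `{check_name: reason}` dict; the listed checks are
then reported as expected failures (xfail) instead of hard errors. A minimal sketch
of the pattern, assuming this patch is applied (the estimator and check name simply
mirror the xfail table added below; the test function name is illustrative, not part
of the patch):

    from imblearn.under_sampling import NearMiss
    from imblearn.utils.estimator_checks import parametrize_with_checks

    def expected_failed_checks(estimator):
        # Checks listed here are collected as xfail rather than failing the run.
        if isinstance(estimator, NearMiss):
            return {"check_samplers_fit_resample": "FIXME"}
        return {}

    @parametrize_with_checks(
        [NearMiss()], expected_failed_checks=expected_failed_checks
    )
    def test_estimator_xfail(estimator, check):
        check(estimator)
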
-@parametrize_with_checks(list(_tested_estimators()))
+@parametrize_with_checks(
+    list(_tested_estimators()), expected_failed_checks=_get_expected_failed_checks
+)
 def test_estimators_imblearn(estimator, check, request):
     # Common tests for estimator instances
     with ignore_warnings(
diff --git a/imblearn/utils/_test_common/instance_generator.py b/imblearn/utils/_test_common/instance_generator.py
index 64ee971e2..82fdebe25 100644
--- a/imblearn/utils/_test_common/instance_generator.py
+++ b/imblearn/utils/_test_common/instance_generator.py
@@ -19,6 +19,7 @@
     BalancedBaggingClassifier,
     BalancedRandomForestClassifier,
     EasyEnsembleClassifier,
+    RUSBoostClassifier,
 )
 from imblearn.over_sampling import (
     ADASYN,
@@ -83,7 +84,13 @@
 # same check with multiple instances of the same estimator with different parameters.
 # The special key "*" allows to apply the parameters to all checks.
 # TODO(devtools): allow third-party developers to pass test specific params to checks
-PER_ESTIMATOR_CHECK_PARAMS: dict = {}
+PER_ESTIMATOR_CHECK_PARAMS: dict = {
+    Pipeline: {
+        "check_classifiers_with_encoded_labels": dict(
+            sampler__sampling_strategy={"setosa": 20, "virginica": 20}
+        )
+    }
+}
 
 SKIPPED_ESTIMATORS = [SMOTENC]
 
@@ -187,3 +194,31 @@
     estimator = clone(estimator_orig)
     estimator.set_params(**params)
     yield estimator
+
+
+PER_ESTIMATOR_XFAIL_CHECKS = {
+    BalancedRandomForestClassifier: {
+        "check_sample_weight_equivalence": "FIXME",
+    },
+    NearMiss: {
+        "check_samplers_fit_resample": "FIXME",
+    },
+    Pipeline: {
+        "check_dont_overwrite_parameters": (
+            "Pipeline changes the `steps` parameter, which it shouldn't. "
+            "Therefore this test is x-fail until we fix this."
+        ),
+        "check_estimators_overwrite_params": (
+            "Pipeline changes the `steps` parameter, which it shouldn't. "
+            "Therefore this test is x-fail until we fix this."
+        ),
+    },
+    RUSBoostClassifier: {
+        "check_sample_weight_equivalence": "FIXME",
+    },
+}
+
+def _get_expected_failed_checks(estimator):
+    """Get the checks expected to fail for the given imbalanced-learn estimator."""
+    failed_checks = PER_ESTIMATOR_XFAIL_CHECKS.get(type(estimator), {})
+    return failed_checks
\ No newline at end of file
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index ffb5de129..dfba7e50d 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -668,7 +668,6 @@ def check_classifiers_with_encoded_labels(name, classifier_orig):
             "virginica": 50,
         },
     )
-    classifier.set_params(sampling_strategy={"setosa": 20, "virginica": 20})
     classifier.fit(df, y)
     assert set(classifier.classes_) == set(y.cat.categories.tolist())
     y_pred = classifier.predict(df)
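
A closing note on the `_raise_or_warn_if_not_fitted` helper added to
`imblearn/pipeline.py` above: when a sub-estimator raises `NotFittedError`, the
pipeline re-raises it with a clearer message; when no step complains but the
pipeline itself was never fitted, only a `FutureWarning` is emitted for now (a hard
error from 1.8 on, per the TODO). A rough usage sketch, assuming the patch is
applied:

    import numpy as np
    from sklearn.exceptions import NotFittedError
    from sklearn.linear_model import LogisticRegression

    from imblearn.pipeline import Pipeline
    from imblearn.under_sampling import RandomUnderSampler

    pipe = Pipeline(
        steps=[("sampler", RandomUnderSampler()), ("clf", LogisticRegression())]
    )
    X = np.array([[0.0], [1.0], [2.0], [3.0]])

    try:
        pipe.predict(X)  # never fitted: LogisticRegression raises NotFittedError
    except NotFittedError as exc:
        print(exc)  # "Pipeline is not fitted yet."

The warning branch is exercised by steps that do not track a fitted state, which is
why the mocks in `test_pipeline.py` gain `__sklearn_is_fitted__` or a `fitted_`
attribute in this patch: `check_is_fitted` must be able to tell the two cases apart.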