iter

glemaitre · glemaitre · commit a5cb58b98a0b · 2024-12-17T23:04:00.000+01:00
diff --git a/imblearn/base.py b/imblearn/base.py
@@ -12,7 +12,7 @@
 from sklearn.utils.multiclass import check_classification_targets
 
 from .utils import check_sampling_strategy, check_target_type
-from .utils._sklearn_compat import _fit_context, validate_data
+from .utils._sklearn_compat import _fit_context, get_tags, validate_data
 from .utils._validation import ArraysTransformer
 
 
@@ -217,7 +217,11 @@ def is_sampler(estimator):
     is_sampler : bool
         True if estimator is a sampler, otherwise False.
     """
-    if estimator._estimator_type == "sampler":
+
+    if hasattr(estimator, "_estimator_type") and estimator._estimator_type == "sampler":
+        return True
+    tags = get_tags(estimator)
+    if hasattr(tags, "sampler_tags") and tags.sampler_tags is not None:
         return True
     return False
 
diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py
@@ -118,6 +118,7 @@ class ValueDifferenceMetric(BaseEstimator):
            [0.04,  0.  ,  1.44],
            [1.96,  1.44,  0.  ]])
     """
+
     _parameter_constraints: dict = {
         "n_categories": [StrOptions({"auto"}), "array-like"],
         "k": [numbers.Integral],
@@ -150,6 +151,7 @@ def fit(self, X, y):
         self._validate_params()
         check_consistent_length(X, y)
         X, y = validate_data(self, X=X, y=y, reset=True, dtype=np.int32)
+        X = check_array(X, ensure_non_negative=True)
 
         if isinstance(self.n_categories, str) and self.n_categories == "auto":
             # categories are expected to be encoded from 0 to n_categories - 1
@@ -208,11 +210,11 @@ def pairwise(self, X, Y=None):
             The VDM pairwise distance.
         """
         check_is_fitted(self)
-        X = check_array(X, dtype=np.int32)
+        X = check_array(X, ensure_non_negative=True, dtype=np.int32)
         n_samples_X = X.shape[0]
 
         if Y is not None:
-            Y = check_array(Y, dtype=np.int32)
+            Y = check_array(Y, ensure_non_negative=True, dtype=np.int32)
             n_samples_Y = Y.shape[0]
         else:
             n_samples_Y = n_samples_X
diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py
@@ -12,13 +12,13 @@
 import sklearn
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.utils._testing import ignore_warnings
-from sklearn.utils.estimator_checks import (
-    parametrize_with_checks as parametrize_with_checks_sklearn,
-)
 from sklearn.utils.fixes import parse_version
 
 from imblearn.over_sampling import RandomOverSampler
 from imblearn.under_sampling import RandomUnderSampler
+from imblearn.utils._sklearn_compat import (
+    parametrize_with_checks as parametrize_with_checks_sklearn,
+)
 from imblearn.utils._test_common.instance_generator import (
     _get_check_estimator_ids,
     _get_expected_failed_checks,
diff --git a/imblearn/tests/test_docstring_parameters.py b/imblearn/tests/test_docstring_parameters.py
@@ -10,7 +10,6 @@
 
 import pytest
 from sklearn.datasets import make_classification
-from sklearn.linear_model import LogisticRegression
 from sklearn.utils._testing import (
     _get_func_name,
     check_docstring_parameters,
@@ -24,9 +23,9 @@
 
 import imblearn
 from imblearn.base import is_sampler
-from imblearn.utils._sklearn_compat import _construct_instances
+from imblearn.under_sampling import NearMiss
+from imblearn.utils._test_common.instance_generator import _tested_estimators
 from imblearn.utils.estimator_checks import _set_checking_parameters
-from imblearn.utils.testing import all_estimators
 
 # walk_packages() ignores DeprecationWarnings, now we need to ignore
 # FutureWarnings
@@ -43,10 +42,10 @@
     )
 
 # functions to ignore args / docstring of
-_DOCSTRING_IGNORES = [
-    "RUSBoostClassifier",  # TODO remove after releasing scikit-learn 1.0.1
-    "ValueDifferenceMetric",
-]
+_DOCSTRING_IGNORES = ["ValueDifferenceMetric"]
+_IGNORE_ATTRIBUTES = {
+    NearMiss: ["nn_ver3_"],
+}
 
 # Methods where y param should be ignored if y=None by default
 _METHODS_IGNORE_NONE_Y = [
@@ -159,28 +158,19 @@ def test_tabs():
         )
 
 
-def _construct_compose_pipeline_instance(Estimator):
-    # Minimal / degenerate instances: only useful to test the docstrings.
-    if Estimator.__name__ == "Pipeline":
-        return Estimator(steps=[("clf", LogisticRegression())])
-
-
-@pytest.mark.parametrize("name, Estimator", all_estimators())
-def test_fit_docstring_attributes(name, Estimator):
+@pytest.mark.parametrize("estimator", list(_tested_estimators()))
+def test_fit_docstring_attributes(estimator):
     pytest.importorskip("numpydoc")
     from numpydoc import docscrape
 
+    Estimator = estimator.__class__
     if Estimator.__name__ in _DOCSTRING_IGNORES:
         return
 
     doc = docscrape.ClassDoc(Estimator)
     attributes = doc["Attributes"]
 
-    if Estimator.__name__ == "Pipeline":
-        est = _construct_compose_pipeline_instance(Estimator)
-    else:
-        est = next(_construct_instances(Estimator))
-    _set_checking_parameters(est)
+    _set_checking_parameters(estimator)
 
     X, y = make_classification(
         n_samples=20,
@@ -190,16 +180,16 @@ def test_fit_docstring_attributes(name, Estimator):
         random_state=2,
     )
 
-    y = _enforce_estimator_tags_y(est, y)
-    X = _enforce_estimator_tags_X(est, X)
+    y = _enforce_estimator_tags_y(estimator, y)
+    X = _enforce_estimator_tags_X(estimator, X)
 
-    if "oob_score" in est.get_params():
-        est.set_params(bootstrap=True, oob_score=True)
+    if "oob_score" in estimator.get_params():
+        estimator.set_params(bootstrap=True, oob_score=True)
 
-    if is_sampler(est):
-        est.fit_resample(X, y)
+    if is_sampler(estimator):
+        estimator.fit_resample(X, y)
     else:
-        est.fit(X, y)
+        estimator.fit(X, y)
 
     skipped_attributes = set(
         [
@@ -218,9 +208,11 @@ def test_fit_docstring_attributes(name, Estimator):
             continue
         # ignore deprecation warnings
         with ignore_warnings(category=FutureWarning):
-            assert hasattr(est, attr.name)
+            if attr.name in _IGNORE_ATTRIBUTES.get(Estimator, []):
+                continue
+            assert hasattr(estimator, attr.name)
 
-    fit_attr = _get_all_fitted_attributes(est)
+    fit_attr = _get_all_fitted_attributes(estimator)
     fit_attr_names = [attr.name for attr in attributes]
     undocumented_attrs = set(fit_attr).difference(fit_attr_names)
     undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes)
diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py
@@ -62,6 +62,9 @@ class NearMiss(BaseUnderSampler):
     nn_ : estimator object
         Validated K-nearest Neighbours object created from `n_neighbors` parameter.
 
+    nn_ver3_ : estimator object
+        Validated K-nearest Neighbours object created from `n_neighbors_ver3` parameter.
+
     sample_indices_ : ndarray of shape (n_new_samples,)
         Indices of the samples selected.
 
diff --git a/imblearn/utils/_test_common/instance_generator.py b/imblearn/utils/_test_common/instance_generator.py
@@ -67,6 +67,7 @@
     RandomOverSampler: dict(random_state=42),
     SMOTE: dict(random_state=42),
     SMOTEN: dict(random_state=42),
+    SMOTENC: dict(categorical_features=[0], random_state=42),
     SVMSMOTE: dict(random_state=42),
     # under-sampling
     ClusterCentroids: dict(random_state=42),
@@ -199,6 +200,8 @@ def _yield_instances_for_check(check, estimator_orig):
 PER_ESTIMATOR_XFAIL_CHECKS = {
     BalancedRandomForestClassifier: {
         "check_sample_weight_equivalence": "FIXME",
+        "check_sample_weight_equivalence_on_sparse_data": "FIXME",
+        "check_sample_weight_equivalence_on_dense_data": "FIXME",
     },
     NearMiss: {
         "check_samplers_fit_resample": "FIXME",
@@ -212,9 +215,14 @@ def _yield_instances_for_check(check, estimator_orig):
             "Pipeline changes the `steps` parameter, which it shouldn't."
             "Therefore this test is x-fail until we fix this."
         ),
+        "check_classifiers_train": "FIXME",
+        "check_supervised_y_2d": "FIXME",
     },
     RUSBoostClassifier: {
         "check_sample_weight_equivalence": "FIXME",
+        "check_sample_weight_equivalence_on_sparse_data": "FIXME",
+        "check_sample_weight_equivalence_on_dense_data": "FIXME",
+        "check_estimator_sparse_matrix": "FIXME",
     },
 }
 
diff --git a/imblearn/utils/tests/test_estimator_checks.py b/imblearn/utils/tests/test_estimator_checks.py
@@ -6,6 +6,7 @@
 from imblearn.base import BaseSampler
 from imblearn.over_sampling.base import BaseOverSampler
 from imblearn.utils import check_target_type as target_check
+from imblearn.utils._sklearn_compat import validate_data
 from imblearn.utils.estimator_checks import (
     check_samplers_fit,
     check_samplers_nan,
@@ -47,15 +48,15 @@ class NotFittedSampler(BaseBadSampler):
     """Sampler without target checking."""
 
     def fit(self, X, y):
-        X, y = self._validate_data(X, y)
+        X, y = validate_data(self, X=X, y=y)
         return self
 
 
 class NoAcceptingSparseSampler(BaseBadSampler):
     """Sampler which does not accept sparse matrix."""
 
     def fit(self, X, y):
-        X, y = self._validate_data(X, y)
+        X, y = validate_data(self, X=X, y=y)
         self.sampling_strategy_ = "sampling_strategy_"
         return self
 
@@ -72,12 +73,13 @@ def _fit_resample(self, X, y):
 class IndicesSampler(BaseOverSampler):
     def _check_X_y(self, X, y):
         y, binarize_y = target_check(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(
-            X,
-            y,
+        X, y = validate_data(
+            self,
+            X=X,
+            y=y,
             reset=True,
             dtype=None,
-            force_all_finite=False,
+            ensure_all_finite=False,
         )
         return X, y, binarize_y