Use new sklearn tagging concept. fixes #448

PaulWestenthanner · PaulWestenthanner · commit 6b394dbfcffb · 2025-01-19T20:27:44.000+01:00
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -17,7 +17,7 @@ jobs:
       - name: Directly build docs
         run: |
           pip install -r docs/requirements.txt
-          sphinx-build -D docs/source ./docs/build/html/
+          sphinx-build docs/source ./docs/build/html/
       - name: Deploy Docs
         uses: peaceiris/actions-gh-pages@v3
         with:
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
+v.2.8.0
+=======
+
+* Fix: Support new concept of sklearn tags, now requiring sklearn >= 1.6.0
+* Fix: Docs deployment
+
 v.2.7.0
-==========
+=======
 
 * Refactor: Use poetry as packaging tool
 * Refactor: Add more typing
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -35,19 +35,25 @@ The preferred workflow to contribute to git-pandas is:
 Guidelines
 ==========
 
-This is still a very young project, but we do have a few guiding principles:
 
  1. Maintain semantics of the scikit-learn API
  2. Write detailed docstrings in numpy format
  3. Support pandas dataframes and numpy arrays as inputs
  4. Write tests
 
+Style guide:
+
+We use ruff for linting. The rules are configured in the `pyproject.toml` file. To run the linter, use:
+
+    $ poetry run ruff check category_encoders --fix
+
+
 Running Tests
 =============
 
 To run the tests, use:
 
-    $ pytest
+    $ poetry run pytest tests/
     
 Easy Issues / Getting Started
 =============================
diff --git a/category_encoders/base_contrast_encoder.py b/category_encoders/base_contrast_encoder.py
@@ -12,7 +12,7 @@
 __author__ = 'paulwestenthanner'
 
 
-class BaseContrastEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
+class BaseContrastEncoder(util.UnsupervisedTransformerMixin, util.BaseEncoder):
     """Base class for various contrast encoders.
 
     Parameters
diff --git a/category_encoders/basen.py b/category_encoders/basen.py
@@ -34,7 +34,7 @@ def _ceillogint(n, base):
     return ret
 
 
-class BaseNEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
+class BaseNEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
     """Base-N encoder encodes the categories into arrays of their base-N representation.
 
     A base of 1 is equivalent to one-hot encoding (not really base-1, but useful),
diff --git a/category_encoders/cat_boost.py b/category_encoders/cat_boost.py
@@ -9,7 +9,7 @@
 __author__ = 'Jan Motl'
 
 
-class CatBoostEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
+class CatBoostEncoder(util.SupervisedTransformerMixin, util.BaseEncoder):
     """CatBoost Encoding for categorical features.
 
     Supported targets: binomial and continuous.
@@ -202,10 +202,10 @@ def _transform(self, X, y=None):
 
         return X
 
-    def _more_tags(self) -> dict[str, bool]:
+    def __sklearn_tags__(self) -> util.EncoderTags:
         """Set scikit transformer tags."""
-        tags = super()._more_tags()
-        tags['predict_depends_on_y'] = True
+        tags = super().__sklearn_tags__()
+        tags.predict_depends_on_y = True
         return tags
 
     def _fit_column_map(self, series, y):
diff --git a/category_encoders/count.py b/category_encoders/count.py
@@ -11,7 +11,7 @@
 __author__ = 'joshua t. dunn'
 
 
-class CountEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
+class CountEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
     """Count encoding for categorical features.
 
     For a given categorical feature, replace the names of the groups with the group counts.
diff --git a/category_encoders/glmm.py b/category_encoders/glmm.py
@@ -15,7 +15,7 @@
 __author__ = 'Jan Motl'
 
 
-class GLMMEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
+class GLMMEncoder( util.SupervisedTransformerMixin ,util.BaseEncoder):
     """Generalized linear mixed model.
 
     Supported targets: binomial and continuous.
@@ -164,10 +164,10 @@ def _transform(self, X, y=None):
         X = self._score(X, y)
         return X
 
-    def _more_tags(self) -> dict[str, bool]:
+    def __sklearn_tags__(self) -> util.EncoderTags:
         """Set scikit transformer tags."""
-        tags = super()._more_tags()
-        tags['predict_depends_on_y'] = True
+        tags = super().__sklearn_tags__()
+        tags.predict_depends_on_y = True
         return tags
 
     def _train(self, X, y):
diff --git a/category_encoders/hashing.py b/category_encoders/hashing.py
@@ -14,7 +14,7 @@
 __author__ = 'willmcginnis', 'LiuShulun'
 
 
-class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
+class HashingEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
     """A multivariate hashing implementation with configurable dimensionality/precision.
 
     The advantage of this encoder is that it does not maintain a dictionary of observed categories.
diff --git a/category_encoders/james_stein.py b/category_encoders/james_stein.py
@@ -11,7 +11,7 @@
 __author__ = 'Jan Motl'
 
 
-class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
+class JamesSteinEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
     """James-Stein estimator.
 
     Supported targets: binomial and continuous.
@@ -228,10 +228,10 @@ def _transform(self, X, y=None):
         X = self._score(X, y)
         return X
 
-    def _more_tags(self) -> dict[str, bool]:
+    def __sklearn_tags__(self) -> util.EncoderTags:
         """Set scikit transformer tags."""
-        tags = super()._more_tags()
-        tags['predict_depends_on_y'] = True
+        tags = super().__sklearn_tags__()
+        tags.predict_depends_on_y = True
         return tags
 
     def _train_pooled(self, X, y):
diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py
@@ -9,7 +9,7 @@
 __author__ = 'hbghhy'
 
 
-class LeaveOneOutEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
+class LeaveOneOutEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
     """Leave one out coding for categorical features.
 
     This is very similar to target encoding but excludes the current row's
@@ -124,10 +124,10 @@ def _transform(self, X, y=None):
         X = self.transform_leave_one_out(X, y, mapping=self.mapping)
         return X
 
-    def _more_tags(self) -> dict[str, bool]:
+    def __sklearn_tags__(self) -> util.EncoderTags:
         """Set scikit transformer tags."""
-        tags = super()._more_tags()
-        tags['predict_depends_on_y'] = True
+        tags = super().__sklearn_tags__()
+        tags.predict_depends_on_y = True
         return tags
 
     def fit_leave_one_out(
diff --git a/category_encoders/m_estimate.py b/category_encoders/m_estimate.py
@@ -9,7 +9,7 @@
 __author__ = 'Jan Motl'
 
 
-class MEstimateEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
+class MEstimateEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
     """M-probability estimate of likelihood.
 
     Supported targets: binomial and continuous.
@@ -150,10 +150,10 @@ def _transform(self, X, y=None):
         X = self._score(X, y)
         return X
 
-    def _more_tags(self) -> dict[str, bool]:
+    def __sklearn_tags__(self) -> util.EncoderTags:
         """Set scikit transformer tags."""
-        tags = super()._more_tags()
-        tags['predict_depends_on_y'] = True
+        tags = super().__sklearn_tags__()
+        tags.predict_depends_on_y = True
         return tags
 
     def _train(self, X, y):
diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py
@@ -11,7 +11,7 @@
 __author__ = 'willmcginnis'
 
 
-class OneHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
+class OneHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
     """Onehot (or dummy) coding for categorical features, produces a binary feature per category.
 
     Parameters
diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py
@@ -12,7 +12,7 @@
 __author__ = 'willmcginnis'
 
 
-class OrdinalEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
+class OrdinalEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
     """Encodes categorical features as ordinal, in one ordered feature.
 
     Ordinal encoding uses a single column of integers to represent the classes.
diff --git a/category_encoders/quantile_encoder.py b/category_encoders/quantile_encoder.py
@@ -18,7 +18,7 @@
 from category_encoders.ordinal import OrdinalEncoder
 
 
-class QuantileEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
+class QuantileEncoder(util.SupervisedTransformerMixin, util.BaseEncoder):
     """Quantile Encoding for categorical features.
 
     This a statistically modified version of target MEstimate encoder where selected features
@@ -204,7 +204,7 @@ def quantile_encode(self, X_in: pd.DataFrame) -> pd.DataFrame:
 
 
 # todo does not fit in schema since it is an ensemble of other encoders
-class SummaryEncoder(BaseEstimator, util.TransformerWithTargetMixin):
+class SummaryEncoder(BaseEstimator):
     """Summary Encoding for categorical features.
 
     It's an encoder designed for creating richer representations by applying quantile
@@ -418,6 +418,22 @@ def transform(
         else:
             return transformed_df.to_numpy()
 
+    def __sklearn_tags__(self) -> util.EncoderTags:
+        """Return the estimator tags with the target marked as required.
+
+        The tags reported by the parent class are converted into
+        ``util.EncoderTags`` and ``target_tags.required`` is set to True
+        before they are returned.
+        """
+        encoder_tags = util.EncoderTags.from_sk_tags(super().__sklearn_tags__())
+        encoder_tags.target_tags.required = True
+        return encoder_tags
+
+    def fit_transform(self, X: util.X_type, y: util.y_type | None = None):
+        """Fit the encoder on ``X`` and ``y``, then transform ``X`` using ``y``.
+
+        This also uses the target for transforming, not only for training.
+
+        Parameters
+        ----------
+        X : util.X_type
+            Data passed to both ``fit`` and ``transform``.
+        y : util.y_type
+            Target passed to both ``fit`` and ``transform``. It defaults to
+            None in the signature, but None is rejected.
+
+        Returns
+        -------
+        The result of ``self.fit(X, y).transform(X, y)``.
+
+        Raises
+        ------
+        TypeError
+            If ``y`` is None.
+        """
+        if y is None:
+            # Name the missing argument in quotes, as Python's own TypeError messages do.
+            raise TypeError("fit_transform() missing argument: 'y'")
+        return self.fit(X, y).transform(X, y)
+
     def get_feature_names(self) -> np.ndarray:
         """Deprecated method to get feature names. Use `get_feature_names_out` instead."""
         msg = (
diff --git a/category_encoders/rankhot.py b/category_encoders/rankhot.py
@@ -9,7 +9,7 @@
 from category_encoders import OrdinalEncoder
 
 
-class RankHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
+class RankHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
     """Rank Hot Encoder.
 
     The rank-hot encoder is similar to a one-hot encoder,
diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py
@@ -14,7 +14,7 @@
 __author__ = 'chappers'
 
 
-class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
+class TargetEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
     """Target encoding for categorical features.
 
     Supported targets: binomial and continuous.
diff --git a/category_encoders/utils.py b/category_encoders/utils.py
@@ -4,6 +4,7 @@
 
 import warnings
 from abc import abstractmethod
+from dataclasses import dataclass, fields
 from enum import Enum, auto
 from typing import Hashable, Sequence
 
@@ -16,6 +17,7 @@
 from sklearn.base import BaseEstimator
 from sklearn.exceptions import NotFittedError
 from sklearn.preprocessing import LabelEncoder
+from sklearn.utils import Tags
 
 __author__ = 'willmcginnis'
 
@@ -345,6 +347,21 @@ def get_docstring_output_shape(in_out_relation: EncodingRelation) -> str:
         return 'M features (M can be anything)'
 
 
+@dataclass
+class EncoderTags(Tags):
+    """sklearn ``Tags`` extended with an encoder-specific field.
+
+    Adds ``predict_depends_on_y`` on top of the fields inherited from
+    ``sklearn.utils.Tags``.
+    """
+
+    # Defaults to False.
+    # NOTE(review): presumably marks encoders whose transform output depends on y — confirm.
+    predict_depends_on_y: bool = False
+
+    @classmethod
+    def from_sk_tags(cls, tags: Tags) -> EncoderTags:
+        """Create an EncoderTags instance from the fields of the given sklearn Tags.
+
+        Each dataclass field of ``tags`` is read with ``getattr`` and passed to
+        the constructor unchanged (the values are not copied). Fields that
+        ``tags`` does not have keep their defaults.
+        """
+        return cls(**{tag.name: getattr(tags, tag.name) for tag in fields(tags)})
+
+
 class BaseEncoder(BaseEstimator):
     """BaseEstimator class for all encoders.
 
@@ -437,7 +454,7 @@ def fit(self, X: X_type, y: y_type | None = None, **kwargs):
         self.feature_names_in_ = X.columns.tolist()
         self.n_features_in_ = len(self.feature_names_in_)
 
-        if self._get_tags().get('supervised_encoder'):
+        if self.__sklearn_tags__().target_tags.required:
             if not is_numeric_dtype(y):
                 self.lab_encoder_ = LabelEncoder()
                 y = self.lab_encoder_.fit_transform(y)
@@ -475,7 +492,7 @@ def fit(self, X: X_type, y: y_type | None = None, **kwargs):
         return self
 
     def _check_fit_inputs(self, X: X_type, y: y_type) -> None:
-        if self._get_tags().get('supervised_encoder'):
+        if self.__sklearn_tags__().target_tags.required:
             if y is None:
                 raise ValueError(
                     'Supervised encoders need a target for the fitting. The target cannot be None'
@@ -573,9 +590,12 @@ def _fit(self, X: pd.DataFrame, y: pd.Series | None, **kwargs): ...
 class SupervisedTransformerMixin(sklearn.base.TransformerMixin):
     """Mixin for supervised transformers (with target)."""
 
-    def _more_tags(self) -> dict[str, bool]:
+    def __sklearn_tags__(self) -> EncoderTags:
         """Set scikit transformer tags."""
-        return {'supervised_encoder': True}
+        sk_tags = super().__sklearn_tags__()
+        tags = EncoderTags.from_sk_tags(sk_tags)
+        tags.target_tags.required = True
+        return tags
 
     def transform(self, X: X_type, y: y_type | None = None, override_return_df: bool = False):
         """Perform the transformation to new categorical data.
@@ -653,20 +673,3 @@ def transform(self, X: X_type, override_return_df: bool = False):
 
     @abstractmethod
     def _transform(self, X: pd.DataFrame) -> pd.DataFrame: ...
-
-
-class TransformerWithTargetMixin:
-    """Mixin for transformers with target information."""
-
-    def _more_tags(self) -> dict[str, bool]:
-        """Set scikit transformer tags."""
-        return {'supervised_encoder': True}
-
-    def fit_transform(self, X: X_type, y: y_type | None = None, **fit_params):
-        """Fit and transform using target.
-
-        This also uses the target for transforming, not only for training.
-        """
-        if y is None:
-            raise TypeError('fit_transform() missing argument: ' 'y' '')
-        return self.fit(X, y, **fit_params).transform(X, y)
diff --git a/category_encoders/woe.py b/category_encoders/woe.py
@@ -12,7 +12,7 @@
 __author__ = 'Jan Motl'
 
 
-class WOEEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
+class WOEEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
     """Weight of Evidence coding for categorical features.
 
     Supported targets: binomial. For polynomial target support, see PolynomialWrapper.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/tests/test_encoders.py b/tests/test_encoders.py