|
1 | 1 | """Target Encoder""" |
| 2 | +import warnings |
2 | 3 | import numpy as np |
3 | 4 | import pandas as pd |
4 | 5 | from sklearn.base import BaseEstimator |
@@ -35,10 +36,12 @@ class TargetEncoder(BaseEstimator, util.TransformerWithTargetMixin): |
35 | 36 | handle_unknown: str |
36 | 37 | options are 'error', 'return_nan' and 'value'; defaults to 'value', which returns the target mean. |
37 | 38 | min_samples_leaf: int |
38 | | - minimum samples to take category average into account. |
| 39 | + For regularization, a weighted average of the category mean and the global mean is taken. The weight is |
| 40 | + an S-shaped curve between 0 and 1 with the number of samples for a category on the x-axis. |
| 41 | + The curve reaches 0.5 at min_samples_leaf (parameter k in the original paper). |
39 | 42 | smoothing: float |
40 | 43 | smoothing effect to balance categorical average vs prior. Higher value means stronger regularization. |
41 | | - The value must be strictly bigger than 0. |
| 44 | + The value must be strictly greater than 0. Higher values mean a flatter S-curve (see min_samples_leaf). |
42 | 45 |
|
43 | 46 | Example |
44 | 47 | ------- |
@@ -88,7 +91,13 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, h |
88 | 91 | self.cols = cols |
89 | 92 | self.ordinal_encoder = None |
90 | 93 | self.min_samples_leaf = min_samples_leaf |
91 | | - self.smoothing = float(smoothing) # Make smoothing a float so that python 2 does not treat as integer division |
| 94 | + if min_samples_leaf == 1: |
| 95 | + warnings.warn("Default parameter min_samples_leaf will change in version 2.6." |
| 96 | + "See https://github.com/scikit-learn-contrib/category_encoders/issues/327") |
| 97 | + self.smoothing = smoothing |
| 98 | + if min_samples_leaf == 1.0: |
| 99 | + warnings.warn("Default parameter smoothing will change in version 2.6." |
| 100 | + "See https://github.com/scikit-learn-contrib/category_encoders/issues/327") |
92 | 101 | self._dim = None |
93 | 102 | self.mapping = None |
94 | 103 | self.handle_unknown = handle_unknown |
|
0 commit comments