Skip to content

Commit b045143

Browse files
prepare release 2.6
1 parent c189587 commit b045143

File tree

6 files changed

+29
-26
lines changed

6 files changed

+29
-26
lines changed

CHANGELOG.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,23 @@
11
unreleased
22
==========
3-
* added gray encoder
43

4+
v2.6.0
5+
======
6+
* added gray encoder
7+
* added thermometer / rank-hot encoder
8+
* introduce compatibility with sklearn 1.2
9+
* compatibility with `feature_names_out_`
10+
* remove boston housing dataset
11+
* drop support for dataframes with non-homogeneous data types in column names (i.e. having both string and integer column names)
12+
* improve performance of hashing encoder
13+
* improve catboost documentation
14+
* fix inverse transform in baseN with special character column names (issue 392)
15+
* fix inverse transform of ordinal encoder with custom mapping (issue 202)
16+
* fix re-fittable polynomial wrapper (issue 313)
17+
* fix numerical stability for target encoding (issue 377)
18+
* change default parameters of target encoding (issue 327)
19+
* drop support for sklearn 0.x
20+
521
v2.5.1.post0
622
============
723
* fix pypi sdist

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ __Unsupervised:__
2626
* Helmert Contrast [2][3]
2727
* Ordinal [2][3]
2828
* One-Hot [2][3]
29+
* Rank Hot [15]
2930
* Polynomial Contrast [2][3]
3031
* Sum Contrast [2][3]
3132

@@ -149,3 +150,4 @@ References
149150
12. Andrew Gelman and Jennifer Hill (2006). Data Analysis Using Regression and Multilevel/Hierarchical Models. From https://faculty.psau.edu.sa/filedownload/doc-12-pdf-a1997d0d31f84d13c1cdc44ac39a8f2c-original.pdf
150151
13. Carlos Mougan, David Masip, Jordi Nin and Oriol Pujol (2021). Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems. https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14
151152
14. Gray Encoding. From https://en.wikipedia.org/wiki/Gray_code
153+
15. Jacob Buckman, Aurko Roy, Colin Raffel and Ian Goodfellow (2018). Thermometer Encoding: One Hot Way To Resist Adversarial Examples. From https://openreview.net/forum?id=S18Su--CW

category_encoders/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from category_encoders.glmm import GLMMEncoder
2828
from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder
2929

30-
__version__ = '2.5.1.post0'
30+
__version__ = '2.6.0'
3131

3232
__author__ = "willmcginnis", "cmougan", "paulwestenthanner"
3333

category_encoders/target_encoder.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from scipy.special import expit
55
from category_encoders.ordinal import OrdinalEncoder
66
import category_encoders.utils as util
7-
import warnings
87

98
__author__ = 'chappers'
109

@@ -44,10 +43,10 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
4443
The value must be strictly bigger than 0. Higher values mean a flatter S-curve (see min_samples_leaf).
4544
hierarchy: dict or dataframe
4645
A dictionary or a dataframe to define the hierarchy for mapping.
47-
46+
4847
If a dictionary, this contains a dict of columns to map into hierarchies. Dictionary key(s) should be the column name from X
4948
which requires mapping. For multiple hierarchical maps, this should be a dictionary of dictionaries.
50-
49+
5150
If dataframe: a dataframe defining columns to be used for the hierarchies. Column names must take the form:
5251
HIER_colA_1, ... HIER_colA_N, HIER_colB_1, ... HIER_colB_M, ...
5352
where [colA, colB, ...] are given columns in cols list.
@@ -111,20 +110,12 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
111110
encoding_relation = util.EncodingRelation.ONE_TO_ONE
112111

113112
def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value',
114-
handle_unknown='value', min_samples_leaf=1, smoothing=1.0, hierarchy=None):
113+
handle_unknown='value', min_samples_leaf=20, smoothing=10, hierarchy=None):
115114
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
116115
handle_unknown=handle_unknown, handle_missing=handle_missing)
117116
self.ordinal_encoder = None
118117
self.min_samples_leaf = min_samples_leaf
119-
if min_samples_leaf == 1:
120-
warnings.warn("Default parameter min_samples_leaf will change in version 2.6."
121-
"See https://github.com/scikit-learn-contrib/category_encoders/issues/327",
122-
category=FutureWarning)
123118
self.smoothing = smoothing
124-
if smoothing == 1.0:
125-
warnings.warn("Default parameter smoothing will change in version 2.6."
126-
"See https://github.com/scikit-learn-contrib/category_encoders/issues/327",
127-
category=FutureWarning)
128119
self.mapping = None
129120
self._mean = None
130121
if isinstance(hierarchy, (dict, pd.DataFrame)) and cols is None:
@@ -203,7 +194,7 @@ def fit_target_encoding(self, X, y):
203194
col = switch.get('col')
204195
if 'HIER_' not in str(col):
205196
values = switch.get('mapping')
206-
197+
207198
scalar = prior
208199
if (isinstance(self.hierarchy, dict) and col in self.hierarchy) or \
209200
(isinstance(self.hierarchy, pd.DataFrame)):
@@ -222,7 +213,6 @@ def fit_target_encoding(self, X, y):
222213
smoove = self._weighting(stats['count'])
223214

224215
smoothing = scalar * (1 - smoove) + stats['mean'] * smoove
225-
smoothing[stats['count'] == 1] = scalar
226216

227217
if self.handle_unknown == 'return_nan':
228218
smoothing.loc[-1] = np.nan

tests/test_encoders.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ def test_preserve_names(self):
381381
def test_unique_column_is_not_predictive(self):
382382
# @ToDo not sure how useful this test is. TargetEncoders set the value to the default if there is only
383383
# one category but they probably should not. See discussion in issue 327
384-
test_encoders = ['LeaveOneOutEncoder', 'TargetEncoder', 'WOEEncoder', 'MEstimateEncoder',
384+
test_encoders = ['LeaveOneOutEncoder', 'WOEEncoder', 'MEstimateEncoder',
385385
'JamesSteinEncoder', 'CatBoostEncoder', 'GLMMEncoder']
386386
for encoder_name in test_encoders:
387387
enc = getattr(encoders, encoder_name)()

tests/test_target_encoder.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -109,11 +109,6 @@ def test_target_encoder_fit_transform_HaveNanValue_ExpectCorrectValueInResult(se
109109
self.assertAlmostEqual(0.4125, values[2], delta=1e-4)
110110
self.assertEqual(0.5, values[3])
111111

112-
def test_target_encoder_noncontiguous_index(self):
113-
data = pd.DataFrame({'x': ['a', 'b', np.nan, 'd', 'e'], 'y': range(5)}).dropna()
114-
result = encoders.TargetEncoder(cols=['x']).fit_transform(data[['x']], data['y'])
115-
self.assertTrue(np.allclose(result, 2.0))
116-
117112
def test_HandleMissingIsValueAndNanInTest_ExpectMean(self):
118113
df = pd.DataFrame({
119114
'color': ["a", "a", "a", "b", "b", "b"],
@@ -175,7 +170,7 @@ def test_hierarchical_smoothing_multi(self):
175170
self.assertAlmostEqual(0.3248, values[5], delta=1e-4)
176171
self.assertAlmostEqual(0.6190, values[11], delta=1e-4)
177172
self.assertAlmostEqual(0.1309, values[13], delta=1e-4)
178-
self.assertAlmostEqual(0.7381, values[15], delta=1e-4)
173+
self.assertAlmostEqual(0.8370, values[15], delta=1e-4)
179174

180175
def test_hierarchical_part_named_cols(self):
181176

@@ -299,10 +294,10 @@ def test_hierarchy_multi_level(self):
299294
values = result['Animal'].values
300295
self.assertAlmostEqual(0.6261, values[0], delta=1e-4)
301296
self.assertAlmostEqual(0.9065, values[2], delta=1e-4)
302-
self.assertAlmostEqual(0.4107, values[5], delta=1e-4)
297+
self.assertAlmostEqual(0.2556, values[5], delta=1e-4)
303298
self.assertAlmostEqual(0.3680, values[8], delta=1e-4)
304299
self.assertAlmostEqual(0.4626, values[11], delta=1e-4)
305-
self.assertAlmostEqual(0.2466, values[13], delta=1e-4)
300+
self.assertAlmostEqual(0.1535, values[13], delta=1e-4)
306301
self.assertAlmostEqual(0.4741, values[14], delta=1e-4)
307302

308303
def test_hierarchy_columnwise_compass(self):
@@ -330,7 +325,7 @@ def test_hierarchy_columnwise_postcodes(self):
330325
result = enc.fit_transform(X[cols], y)
331326

332327
values = result['postcode'].values
333-
self.assertAlmostEqual(0.7506, values[0], delta=1e-4)
328+
self.assertAlmostEqual(0.8448, values[0], delta=1e-4)
334329

335330

336331
def test_hierarchy_columnwise_missing_level(self):

0 commit comments

Comments
 (0)