Skip to content

Commit 570827e

Browse files
Merge pull request #398 from JaimeArboleda/fix-get-feature-names
(WIP) Partial fix for getting feature names out
2 parents 5eb7a2d + 673de07 commit 570827e

File tree

7 files changed

+158
-23
lines changed

7 files changed

+158
-23
lines changed

category_encoders/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
from category_encoders.rankhot import RankHotEncoder
2727
from category_encoders.glmm import GLMMEncoder
2828
from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder
29+
import sklearn
30+
import warnings
31+
from textwrap import dedent
32+
2933

3034
__version__ = '2.6.0'
3135

category_encoders/quantile_encoder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def get_feature_names(self) -> List[str]:
344344
category=FutureWarning)
345345
return self.get_feature_names_out()
346346

347-
def get_feature_names_out(self) -> List[str]:
347+
def get_feature_names_out(self, input_features=None) -> np.ndarray:
348348
"""
349349
Returns the names of all transformed / added columns.
350350
@@ -355,7 +355,7 @@ def get_feature_names_out(self) -> List[str]:
355355
356356
Returns
357357
-------
358-
feature_names: list
358+
feature_names: np.ndarray
359359
A numpy array with all feature names transformed or added.
360360
Note: potentially dropped features (because the feature is constant/invariant) are not included!
361361
@@ -364,7 +364,7 @@ def get_feature_names_out(self) -> List[str]:
364364
if not isinstance(out_feats, list):
365365
raise NotFittedError("Estimator has to be fitted to return feature names.")
366366
else:
367-
return out_feats
367+
return np.array(out_feats, dtype=object)
368368

369369
def get_feature_names_in(self) -> List[str]:
370370
"""

category_encoders/utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ def get_feature_names(self) -> List[str]:
363363
category=FutureWarning)
364364
return self.get_feature_names_out()
365365

366-
def get_feature_names_out(self) -> List[str]:
366+
def get_feature_names_out(self, input_features=None) -> np.ndarray:
367367
"""
368368
Returns the names of all transformed / added columns.
369369
@@ -374,16 +374,16 @@ def get_feature_names_out(self) -> List[str]:
374374
375375
Returns
376376
-------
377-
feature_names: list
378-
A list with all feature names transformed or added.
377+
feature_names: np.ndarray
378+
A numpy array with all feature names transformed or added.
379379
Note: potentially dropped features (because the feature is constant/invariant) are not included!
380380
381381
"""
382382
out_feats = getattr(self, "feature_names_out_", None)
383383
if not isinstance(out_feats, list):
384384
raise NotFittedError("Estimator has to be fitted to return feature names.")
385385
else:
386-
return out_feats
386+
return np.array(out_feats, dtype=object)
387387

388388
def get_feature_names_in(self) -> List[str]:
389389
"""

docs/source/index.rst

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,16 @@ transformers in this library all share a few useful properties:
1414
* Can explicitly configure which columns in the data are encoded by name or index, or infer non-numeric columns regardless of input type
1515
* Can drop any columns with very low variance based on training set optionally
1616
* Portability: train a transformer on data, pickle it, reuse it later and get the same thing out.
17-
* Full compatibility with sklearn pipelines, input an array-like dataset like any other transformer
17+
* Full compatibility with sklearn pipelines, input an array-like dataset like any other transformer (\*)
18+
19+
(\*) For full compatibility with Pipelines and ColumnTransformers, and consistent behaviour of `get_feature_names_out`, it is recommended to upgrade `sklearn` to version 1.2.0 or later and to set the transform output to pandas:
20+
21+
.. code-block:: python
22+
23+
import sklearn
24+
sklearn.set_config(transform_output="pandas")
25+
26+
1827
1928
Usage
2029
-----
@@ -65,7 +74,30 @@ To use:
6574
All of these are fully compatible sklearn transformers, so they can be used in pipelines or in your existing scripts. If
6675
the cols parameter isn't passed, every non-numeric column will be converted. See below for detailed documentation
6776

77+
Known issues:
78+
-------------
79+
80+
`CategoryEncoders` internally works with `pandas DataFrames`, as opposed to `sklearn`, which works with `numpy arrays`. This can cause problems in `sklearn` versions prior to 1.2.0. In order to ensure full compatibility with `sklearn`, set `sklearn` to also output `DataFrames`. This can be done by
81+
82+
.. code-block:: python
83+
84+
sklearn.set_config(transform_output="pandas")
85+
86+
for a whole project or just for a single pipeline using
87+
88+
.. code-block:: python
89+
90+
Pipeline(
91+
steps=[
92+
("preprocessor", SomePreprocessor().set_output(transform="pandas")),
93+
("encoder", SomeEncoder()),
94+
]
95+
)
96+
97+
If you experience another bug, feel free to report it on `github <https://github.com/scikit-learn-contrib/category_encoders/issues>`_
98+
6899
Contents:
100+
---------
69101

70102
.. toctree::
71103
:maxdepth: 3

tests/test_encoders.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from datetime import timedelta
33

44
import numpy as np
5+
from numpy.testing import assert_array_equal
56
import pandas as pd
67
import sklearn
78
import tests.helpers as th
@@ -251,7 +252,7 @@ def test_sklearn_compliance(self):
251252
self.assertTrue(hasattr(encoder, "feature_names_out_"))
252253
self.assertListEqual(encoder.feature_names_in_, ["city"])
253254
self.assertEqual(encoder.n_features_in_, 1)
254-
self.assertIsInstance(encoder.get_feature_names_out(), list)
255+
self.assertIsInstance(encoder.get_feature_names_out(), np.ndarray)
255256
self.assertIsInstance(encoder.get_feature_names_in(), list)
256257

257258
def test_inverse_transform(self):
@@ -456,11 +457,11 @@ def test_get_feature_names_out(self):
456457
# Target encoders also need y
457458
if enc._get_tags().get('supervised_encoder'):
458459
obtained = enc.fit(X, y).get_feature_names_out()
459-
expected = enc.transform(X, y).columns.tolist()
460+
expected = np.array(enc.transform(X, y).columns)
460461
else:
461462
obtained = enc.fit(X).get_feature_names_out()
462-
expected = enc.transform(X).columns.tolist()
463-
self.assertEqual(obtained, expected)
463+
expected = np.array(enc.transform(X).columns)
464+
assert_array_equal(obtained, expected)
464465

465466
def test_get_feature_names_out_drop_invariant(self):
466467
# TODO: What could a DF look like that results in constant
@@ -471,11 +472,11 @@ def test_get_feature_names_out_drop_invariant(self):
471472
# Target encoders also need y
472473
if enc._get_tags().get('supervised_encoder'):
473474
obtained = enc.fit(X, y).get_feature_names_out()
474-
expected = enc.transform(X, y).columns.tolist()
475+
expected = np.array(enc.transform(X, y).columns)
475476
else:
476477
obtained = enc.fit(X).get_feature_names_out()
477-
expected = enc.transform(X).columns.tolist()
478-
self.assertEqual(obtained, expected)
478+
expected = np.array(enc.transform(X).columns)
479+
assert_array_equal(obtained, expected)
479480

480481
def test_get_feature_names_out_not_set(self):
481482
for encoder_name in encoders.__all__:

tests/test_feature_names.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
import numpy as np
2+
import pandas as pd
3+
import tests.helpers as th
4+
from numpy.testing import assert_array_equal
5+
import sklearn
6+
from sklearn.impute import SimpleImputer
7+
from sklearn.preprocessing import StandardScaler
8+
from sklearn.pipeline import Pipeline
9+
from sklearn.compose import ColumnTransformer
10+
import category_encoders as encoders
11+
from unittest import TestCase
12+
13+
__author__ = 'JaimeArboleda'
14+
15+
# data definitions
16+
X = th.create_dataset(n_rows=100)
17+
cat_columns = ['categorical', 'na_categorical']
18+
num_columns = ['float']
19+
X = X[cat_columns + num_columns]
20+
np_y = np.random.randn(X.shape[0]) > 0.5
21+
y = pd.DataFrame(np_y)
22+
23+
class TestEncodersFeaturesOut(TestCase):
24+
25+
def test_feature_names_out(self):
26+
for encoder_name in encoders.__all__:
27+
if sklearn.__version__ < "1.2.0":
28+
continue
29+
else:
30+
sklearn.set_config(transform_output="pandas")
31+
with self.subTest(encoder_name=encoder_name):
32+
encoder = getattr(encoders, encoder_name)()
33+
X_t = encoder.fit_transform(X, y)
34+
35+
categorical_preprocessor_start = Pipeline(
36+
steps=[
37+
("encoder", getattr(encoders, encoder_name)())
38+
]
39+
)
40+
categorical_preprocessor_middle = Pipeline(
41+
steps=[
42+
("imputation_constant", SimpleImputer(fill_value="missing", strategy="constant")),
43+
("encoder", getattr(encoders, encoder_name)())
44+
]
45+
)
46+
numerical_preprocessor = Pipeline(
47+
steps=[
48+
("imputation_constant", SimpleImputer(fill_value=0, strategy="constant"))
49+
]
50+
)
51+
preprocessor = ColumnTransformer(
52+
[
53+
("categorical_prep_start", categorical_preprocessor_start, ["categorical", "na_categorical"]),
54+
("categorical_prep_middle", categorical_preprocessor_middle, ["categorical", "na_categorical"]),
55+
("numerical_prep", numerical_preprocessor, ["float"])
56+
]
57+
)
58+
X_tt = preprocessor.fit_transform(X, y)
59+
60+
assert_array_equal(
61+
np.array(X_t.columns),
62+
encoder.get_feature_names_out()
63+
)
64+
assert_array_equal(
65+
np.array(X_tt.columns),
66+
preprocessor.get_feature_names_out()
67+
)
68+
assert_array_equal(
69+
np.array(
70+
[
71+
c
72+
for c in X_t.columns
73+
if c not in num_columns
74+
]
75+
),
76+
np.array(
77+
[
78+
c[len("categorical_prep_start__"):]
79+
for c in X_tt.columns
80+
if "categorical_prep_start" in c
81+
]
82+
)
83+
)
84+
assert_array_equal(
85+
np.array(
86+
[
87+
c
88+
for c in X_t.columns
89+
if c not in num_columns
90+
]
91+
),
92+
np.array(
93+
[
94+
c[len("categorical_prep_middle__"):]
95+
for c in X_tt.columns
96+
if "categorical_prep_middle" in c
97+
]
98+
)
99+
)
100+
sklearn.set_config(transform_output="default")

tests/test_rankhot.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import numpy as np
55
import category_encoders as encoders
66

7-
87
np_X = th.create_array(n_rows=100)
98
np_X_t = th.create_array(n_rows=50, extras=True)
109
np_y = np.random.randn(np_X.shape[0]) > 0.5
@@ -22,8 +21,8 @@ def test_handleNaNvalue(self):
2221
enc.fit(X)
2322
t_f = enc.transform(X)
2423
inv_tf = enc.inverse_transform(t_f)
25-
self.assertEqual(t_f.shape[1]-(X.shape[1]-1), len(X.none.unique()))
26-
self.assertTupleEqual(inv_tf.shape,X.shape)
24+
self.assertEqual(t_f.shape[1] - (X.shape[1] - 1), len(X.none.unique()))
25+
self.assertTupleEqual(inv_tf.shape, X.shape)
2726

2827
def test_handleCategoricalValue(self):
2928
enc = encoders.RankHotEncoder(cols=['categorical'])
@@ -45,11 +44,11 @@ def test_extraValue(self):
4544
test = pd.DataFrame({'city': ['chicago', 'los angeles']})
4645
enc = encoders.RankHotEncoder(handle_unknown='value')
4746
train_out = enc.fit_transform(train)
48-
expected_mapping = pd.DataFrame([[1, 0],[1, 1],], columns=["city_1", "city_2"], index=[1,2])
49-
expected_out_train = pd.DataFrame([[1, 0],[1, 1],[1, 0],[1, 1],], columns=["city_1", "city_2"])
50-
expected_out_test = pd.DataFrame([[1, 0],[0, 0],], columns=["city_1", "city_2"])
47+
expected_mapping = pd.DataFrame([[1, 0], [1, 1], ], columns=["city_1", "city_2"], index=[1, 2])
48+
expected_out_train = pd.DataFrame([[1, 0], [1, 1], [1, 0], [1, 1], ], columns=["city_1", "city_2"])
49+
expected_out_test = pd.DataFrame([[1, 0], [0, 0], ], columns=["city_1", "city_2"])
5150
pd.testing.assert_frame_equal(train_out, expected_out_train)
52-
pd.testing.assert_frame_equal(enc.mapping[0]["mapping"], expected_mapping)
51+
pd.testing.assert_frame_equal(enc.mapping[0]["mapping"], expected_mapping, check_dtype=False)
5352
t_f = enc.transform(test)
5453
pd.testing.assert_frame_equal(t_f, expected_out_test)
5554
inv_tf = enc.inverse_transform(t_f)
@@ -92,4 +91,3 @@ def test_order(self):
9291
for m1, m2 in zip(mapping_order_1, mapping_order_2):
9392
self.assertEqual(m1["col"], m2["col"])
9493
pd.testing.assert_series_equal(m1["mapping"], m2["mapping"])
95-

0 commit comments

Comments
 (0)