Merge pull request #398 from JaimeArboleda/fix-get-feature-names

PaulWestenthanner · web-flow · commit 570827e6b487 · 2023-03-13T12:48:23.000+01:00
(WIP) Partial fix for getting feature names out
diff --git a/category_encoders/__init__.py b/category_encoders/__init__.py
@@ -26,6 +26,10 @@
 from category_encoders.rankhot import RankHotEncoder
 from category_encoders.glmm import GLMMEncoder
 from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder
+import sklearn
+import warnings
+from textwrap import dedent
+
 
 __version__ = '2.6.0'
 
diff --git a/category_encoders/quantile_encoder.py b/category_encoders/quantile_encoder.py
@@ -344,7 +344,7 @@ def get_feature_names(self) -> List[str]:
                       category=FutureWarning)
         return self.get_feature_names_out()
 
-    def get_feature_names_out(self) -> List[str]:
+    def get_feature_names_out(self, input_features=None) -> np.ndarray:
         """
         Returns the names of all transformed / added columns.
 
@@ -355,7 +355,7 @@ def get_feature_names_out(self) -> List[str]:
 
         Returns
         -------
-        feature_names: list
+        feature_names: np.ndarray
             A list with all feature names transformed or added.
             Note: potentially dropped features (because the feature is constant/invariant) are not included!
 
@@ -364,7 +364,7 @@ def get_feature_names_out(self) -> List[str]:
         if not isinstance(out_feats, list):
             raise NotFittedError("Estimator has to be fitted to return feature names.")
         else:
-            return out_feats
+            return np.array(out_feats, dtype=object)
 
     def get_feature_names_in(self) -> List[str]:
         """
diff --git a/category_encoders/utils.py b/category_encoders/utils.py
@@ -363,7 +363,7 @@ def get_feature_names(self) -> List[str]:
                       category=FutureWarning)
         return self.get_feature_names_out()
 
-    def get_feature_names_out(self) -> List[str]:
+    def get_feature_names_out(self, input_features=None) -> np.ndarray:
         """
         Returns the names of all transformed / added columns.
 
@@ -374,16 +374,16 @@ def get_feature_names_out(self) -> List[str]:
 
         Returns
         -------
-        feature_names: list
-            A list with all feature names transformed or added.
+        feature_names: np.ndarray
+            A numpy array with all feature names transformed or added.
             Note: potentially dropped features (because the feature is constant/invariant) are not included!
 
         """
         out_feats = getattr(self, "feature_names_out_", None)
         if not isinstance(out_feats, list):
             raise NotFittedError("Estimator has to be fitted to return feature names.")
         else:
-            return out_feats
+            return np.array(out_feats, dtype=object)
 
     def get_feature_names_in(self) -> List[str]:
         """
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -14,7 +14,16 @@ transformers in this library all share a few useful properties:
  * Can explicitly configure which columns in the data are encoded by name or index, or infer non-numeric columns regardless of input type
  * Can drop any columns with very low variance based on training set optionally
  * Portability: train a transformer on data, pickle it, reuse it later and get the same thing out.
- * Full compatibility with sklearn pipelines, input an array-like dataset like any other transformer
+ * Full compatibility with sklearn pipelines, input an array-like dataset like any other transformer (\*)
+
+(\*) For full compatibility with Pipelines and ColumnTransformers, and for consistent behaviour of `get_feature_names_out`, it is recommended to upgrade `sklearn` to version 1.2.0 or later and to set the transform output to pandas:
+
+.. code-block:: python
+
+    import sklearn
+    sklearn.set_config(transform_output="pandas")
+
+
 
 Usage
 -----
@@ -65,7 +74,30 @@ To use:
 All of these are fully compatible sklearn transformers, so they can be used in pipelines or in your existing scripts. If
 the cols parameter isn't passed, every non-numeric column will be converted. See below for detailed documentation
 
+Known issues:
+-------------
+
+`CategoryEncoders` internally works with `pandas DataFrames`, as opposed to `sklearn`, which works with `numpy arrays`. This can cause problems in `sklearn` versions prior to 1.2.0. To ensure full compatibility with `sklearn`, set `sklearn` to also output `DataFrames`. This can be done with
+
+.. code-block:: python
+
+   sklearn.set_config(transform_output="pandas")
+
+for a whole project or just for a single pipeline using
+
+.. code-block:: python
+
+   Pipeline(
+       steps=[
+           ("preprocessor", SomePreprocessor().set_output(transform="pandas")),
+           ("encoder", SomeEncoder()),
+       ]
+   )
+
+If you experience another bug, feel free to report it on `github <https://github.com/scikit-learn-contrib/category_encoders/issues>`_.
+
 Contents:
+---------
 
 .. toctree::
    :maxdepth: 3
diff --git a/tests/test_encoders.py b/tests/test_encoders.py
@@ -2,6 +2,7 @@
 from datetime import timedelta
 
 import numpy as np
+from numpy.testing import assert_array_equal
 import pandas as pd
 import sklearn
 import tests.helpers as th
@@ -251,7 +252,7 @@ def test_sklearn_compliance(self):
                 self.assertTrue(hasattr(encoder, "feature_names_out_"))
                 self.assertListEqual(encoder.feature_names_in_, ["city"])
                 self.assertEqual(encoder.n_features_in_, 1)
-                self.assertIsInstance(encoder.get_feature_names_out(), list)
+                self.assertIsInstance(encoder.get_feature_names_out(), np.ndarray)
                 self.assertIsInstance(encoder.get_feature_names_in(), list)
 
     def test_inverse_transform(self):
@@ -456,11 +457,11 @@ def test_get_feature_names_out(self):
                 # Target encoders also need y
                 if enc._get_tags().get('supervised_encoder'):
                     obtained = enc.fit(X, y).get_feature_names_out()
-                    expected = enc.transform(X, y).columns.tolist()
+                    expected = np.array(enc.transform(X, y).columns)
                 else:
                     obtained = enc.fit(X).get_feature_names_out()
-                    expected = enc.transform(X).columns.tolist()
-                self.assertEqual(obtained, expected)
+                    expected = np.array(enc.transform(X).columns)
+                assert_array_equal(obtained, expected)
 
     def test_get_feature_names_out_drop_invariant(self):
         # TODO: What could a DF look like that results in constant
@@ -471,11 +472,11 @@ def test_get_feature_names_out_drop_invariant(self):
                 # Target encoders also need y
                 if enc._get_tags().get('supervised_encoder'):
                     obtained = enc.fit(X, y).get_feature_names_out()
-                    expected = enc.transform(X, y).columns.tolist()
+                    expected = np.array(enc.transform(X, y).columns)
                 else:
                     obtained = enc.fit(X).get_feature_names_out()
-                    expected = enc.transform(X).columns.tolist()
-                self.assertEqual(obtained, expected)
+                    expected = np.array(enc.transform(X).columns)
+                assert_array_equal(obtained, expected)
 
     def test_get_feature_names_out_not_set(self):
         for encoder_name in encoders.__all__:
diff --git a/tests/test_feature_names.py b/tests/test_feature_names.py
@@ -0,0 +1,100 @@
+import numpy as np
+import pandas as pd
+import tests.helpers as th
+from numpy.testing import assert_array_equal
+import sklearn
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
+import category_encoders as encoders
+from unittest import TestCase
+
+__author__ = 'JaimeArboleda'
+
+# data definitions
+X = th.create_dataset(n_rows=100)
+cat_columns = ['categorical', 'na_categorical']
+num_columns = ['float']
+X = X[cat_columns + num_columns]
+np_y = np.random.randn(X.shape[0]) > 0.5
+y = pd.DataFrame(np_y)
+
+class TestEncodersFeaturesOut(TestCase):
+
+    def test_feature_names_out(self):
+        for encoder_name in encoders.__all__:
+            if sklearn.__version__ < "1.2.0":
+                continue
+            else:
+                sklearn.set_config(transform_output="pandas")
+            with self.subTest(encoder_name=encoder_name):
+                encoder = getattr(encoders, encoder_name)()
+                X_t = encoder.fit_transform(X, y)
+
+                categorical_preprocessor_start = Pipeline(
+                    steps=[
+                        ("encoder", getattr(encoders, encoder_name)())
+                    ]
+                )
+                categorical_preprocessor_middle = Pipeline(
+                    steps=[
+                        ("imputation_constant", SimpleImputer(fill_value="missing", strategy="constant")),
+                        ("encoder", getattr(encoders, encoder_name)())
+                    ]
+                )
+                numerical_preprocessor = Pipeline(
+                    steps=[
+                        ("imputation_constant", SimpleImputer(fill_value=0, strategy="constant"))
+                    ]
+                )
+                preprocessor = ColumnTransformer(
+                    [
+                        ("categorical_prep_start", categorical_preprocessor_start, ["categorical", "na_categorical"]),
+                        ("categorical_prep_middle", categorical_preprocessor_middle, ["categorical", "na_categorical"]),
+                        ("numerical_prep", numerical_preprocessor, ["float"])
+                    ]
+                )
+                X_tt = preprocessor.fit_transform(X, y)
+
+                assert_array_equal(
+                    np.array(X_t.columns),
+                    encoder.get_feature_names_out()
+                )
+                assert_array_equal(
+                    np.array(X_tt.columns),
+                    preprocessor.get_feature_names_out()
+                )
+                assert_array_equal(
+                    np.array(
+                        [
+                            c
+                            for c in X_t.columns
+                            if c not in num_columns
+                        ]
+                    ),
+                    np.array(
+                        [
+                            c[len("categorical_prep_start__"):]
+                            for c in X_tt.columns
+                            if "categorical_prep_start" in c
+                        ]
+                    )
+                )
+                assert_array_equal(
+                    np.array(
+                        [
+                            c
+                            for c in X_t.columns
+                            if c not in num_columns
+                        ]
+                    ),
+                    np.array(
+                        [
+                            c[len("categorical_prep_middle__"):]
+                            for c in X_tt.columns
+                            if "categorical_prep_middle" in c
+                        ]
+                    )
+                )
+            sklearn.set_config(transform_output="default")
diff --git a/tests/test_rankhot.py b/tests/test_rankhot.py
@@ -4,7 +4,6 @@
 import numpy as np
 import category_encoders as encoders
 
-
 np_X = th.create_array(n_rows=100)
 np_X_t = th.create_array(n_rows=50, extras=True)
 np_y = np.random.randn(np_X.shape[0]) > 0.5
@@ -22,8 +21,8 @@ def test_handleNaNvalue(self):
         enc.fit(X)
         t_f = enc.transform(X)
         inv_tf = enc.inverse_transform(t_f)
-        self.assertEqual(t_f.shape[1]-(X.shape[1]-1), len(X.none.unique()))
-        self.assertTupleEqual(inv_tf.shape,X.shape)
+        self.assertEqual(t_f.shape[1] - (X.shape[1] - 1), len(X.none.unique()))
+        self.assertTupleEqual(inv_tf.shape, X.shape)
 
     def test_handleCategoricalValue(self):
         enc = encoders.RankHotEncoder(cols=['categorical'])
@@ -45,11 +44,11 @@ def test_extraValue(self):
         test = pd.DataFrame({'city': ['chicago', 'los angeles']})
         enc = encoders.RankHotEncoder(handle_unknown='value')
         train_out = enc.fit_transform(train)
-        expected_mapping = pd.DataFrame([[1, 0],[1, 1],], columns=["city_1", "city_2"], index=[1,2])
-        expected_out_train = pd.DataFrame([[1, 0],[1, 1],[1, 0],[1, 1],], columns=["city_1", "city_2"])
-        expected_out_test = pd.DataFrame([[1, 0],[0, 0],], columns=["city_1", "city_2"])
+        expected_mapping = pd.DataFrame([[1, 0], [1, 1], ], columns=["city_1", "city_2"], index=[1, 2])
+        expected_out_train = pd.DataFrame([[1, 0], [1, 1], [1, 0], [1, 1], ], columns=["city_1", "city_2"])
+        expected_out_test = pd.DataFrame([[1, 0], [0, 0], ], columns=["city_1", "city_2"])
         pd.testing.assert_frame_equal(train_out, expected_out_train)
-        pd.testing.assert_frame_equal(enc.mapping[0]["mapping"], expected_mapping)
+        pd.testing.assert_frame_equal(enc.mapping[0]["mapping"], expected_mapping, check_dtype=False)
         t_f = enc.transform(test)
         pd.testing.assert_frame_equal(t_f, expected_out_test)
         inv_tf = enc.inverse_transform(t_f)
@@ -92,4 +91,3 @@ def test_order(self):
         for m1, m2 in zip(mapping_order_1, mapping_order_2):
             self.assertEqual(m1["col"], m2["col"])
             pd.testing.assert_series_equal(m1["mapping"], m2["mapping"])
-