Skip to content

Commit 481c92c

Browse files
authored
Allow float categories to be passed into CatBoost estimators (#3966)
* add initial, messy, implementation of util to remove float cats * use fixture for tests and test predict * consolidate tests * Init woodwork at new predict method * raise error for true float case and refactor * Clean up * Add releaser note * remove arrow comment * Add predict proba to classifier
1 parent a409e17 commit 481c92c

File tree

8 files changed

+216
-3
lines changed

8 files changed

+216
-3
lines changed

docs/source/release_notes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Release Notes
44
* Enhancements
55
* Fixes
66
* Re-added ``TimeSeriesPipeline.should_skip_featurization`` to fix bug where data would get featurized unnecessarily :pr:`3964`
7+
* Allow float categories to be passed into CatBoost estimators :pr:`3966`
78
* Changes
89
* Update pyproject.toml to correctly specify the data filepaths :pr:`3967`
910
* Documentation Changes

evalml/pipelines/components/estimators/classifiers/catboost_classifier.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from evalml.model_family import ModelFamily
1010
from evalml.pipelines.components.estimators import Estimator
1111
from evalml.pipelines.components.transformers import LabelEncoder
12+
from evalml.pipelines.components.utils import handle_float_categories_for_catboost
1213
from evalml.problem_types import ProblemTypes
1314
from evalml.utils import import_or_raise, infer_feature_types
1415

@@ -119,6 +120,8 @@ def fit(self, X, y=None):
119120
if y.nunique() <= 2:
120121
self._label_encoder = LabelEncoder()
121122
y = self._label_encoder.fit_transform(None, y)[1]
123+
124+
X = handle_float_categories_for_catboost(X)
122125
self._component_obj.fit(X, y, silent=True, cat_features=cat_cols)
123126
return self
124127

@@ -129,9 +132,10 @@ def predict(self, X):
129132
X (pd.DataFrame): Data of shape [n_samples, n_features].
130133
131134
Returns:
132-
pd.DataFrame: Predicted values.
135+
pd.Series: Predicted values.
133136
"""
134137
X = infer_feature_types(X)
138+
X = handle_float_categories_for_catboost(X)
135139
predictions = self._component_obj.predict(X)
136140
if predictions.ndim == 2 and predictions.shape[1] == 1:
137141
predictions = predictions.flatten()
@@ -143,6 +147,20 @@ def predict(self, X):
143147
predictions.index = X.index
144148
return predictions
145149

150+
def predict_proba(self, X):
    """Return class probability estimates from the fitted CatBoost classifier.

    Args:
        X (pd.DataFrame): Data of shape [n_samples, n_features].

    Returns:
        pd.DataFrame: Predicted probability values.
    """
    # Cast whole-number float categories to int64 first so CatBoost
    # does not reject the categorical columns.
    features = handle_float_categories_for_catboost(infer_feature_types(X))
    return super().predict_proba(features)
163+
146164
@property
147165
def feature_importance(self):
148166
"""Feature importance of fitted CatBoost classifier."""

evalml/pipelines/components/estimators/regressors/catboost_regressor.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from evalml.model_family import ModelFamily
99
from evalml.pipelines.components.estimators import Estimator
10+
from evalml.pipelines.components.utils import handle_float_categories_for_catboost
1011
from evalml.problem_types import ProblemTypes
1112
from evalml.utils import (
1213
downcast_int_nullable_to_double,
@@ -113,9 +114,25 @@ def fit(self, X, y=None):
113114
self.input_feature_names = list(X.columns)
114115
X, y = super()._manage_woodwork(X, y)
115116
X = downcast_int_nullable_to_double(X)
117+
118+
X = handle_float_categories_for_catboost(X)
116119
self._component_obj.fit(X, y, silent=True, cat_features=cat_cols)
117120
return self
118121

122+
def predict(self, X):
    """Make predictions using the fitted CatBoost regressor.

    Args:
        X (pd.DataFrame): Data of shape [n_samples, n_features].

    Returns:
        pd.Series: Predicted values.
    """
    X = infer_feature_types(X)
    # Convert whole-number float categories to int64 so CatBoost accepts them.
    X = handle_float_categories_for_catboost(X)
    predictions = super().predict(X)
    return predictions
135+
119136
@property
120137
def feature_importance(self):
121138
"""Feature importance of fitted CatBoost regressor."""

evalml/pipelines/components/utils.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,3 +471,61 @@ def make_balancing_dictionary(y, sampling_ratio):
471471
# this class is already larger than the ratio, don't change
472472
class_dic[index] = value_counts[index]
473473
return class_dic
474+
475+
476+
def handle_float_categories_for_catboost(X):
    """Make categorical columns with float categories safe for CatBoost.

    CatBoost errors on Categorical Woodwork columns whose category values are
    floating point. When every float category is actually a whole number
    (e.g. 1.0, 2.0), the column's categories are recast to int64 without any
    loss of information; columns whose categories are genuinely fractional
    cannot be converted and raise instead.

    Args:
        X (pd.DataFrame): Input data to CatBoost that has Woodwork initialized.

    Returns:
        DataFrame: Data with the exact same Woodwork typing info as the input
            but with any float categories converted to int64 when possible.

    Raises:
        ValueError: If the numeric categories are actual floats that cannot be
            converted to integers without truncating data.
    """
    schema = X.ww.schema
    dtypes = X.dtypes

    # Find categorical columns whose category values are stored as float64;
    # these are the ones CatBoost would reject.
    float_cat_cols = []
    for col in X.ww.select("category", return_schema=True).columns.keys():
        if dtypes[col].categories.dtype == "float64":
            float_cat_cols.append(col)

    if not float_cat_cols:
        return X

    # Split the float-category columns into "really ints" (convertible) and
    # "true floats" (not convertible without truncation).
    replacement_dtypes = {}
    for col in float_cat_cols:
        categories = dtypes[col].categories
        if not (categories % 1 == 0).all():
            # CatBoost explanation as to why they don't support float categories: https://catboost.ai/en/docs/concepts/faq#floating-point-values
            # CatBoost bug keeping us from converting to string: https://github.com/catboost/catboost/issues/1965
            # Pandas bug keeping us from converting `.astype("string").astype("object")`: https://github.com/pandas-dev/pandas/issues/51074
            raise ValueError(
                f"Invalid category found in {col}. CatBoost does not support floats as categories.",
            )
        # Non-nullable int64 is safe here because there are no nans among the
        # category values at this point.
        replacement_dtypes[col] = pd.CategoricalDtype(
            categories=categories.astype("int64"),
            ordered=dtypes[col].ordered,
        )

    X_t = X.astype(replacement_dtypes)
    X_t.ww.init(schema=schema)
    return X_t

evalml/tests/component_tests/test_catboost_classifier.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
import warnings
22

3+
import pandas as pd
4+
import woodwork as ww
5+
36
from evalml.pipelines.components import CatBoostClassifier
47
from evalml.utils import SEED_BOUNDS
58

@@ -35,3 +38,28 @@ def test_catboost_classifier_init_thread_count():
3538
CatBoostClassifier(thread_count=2)
3639
assert len(w) == 1
3740
assert "Parameter 'thread_count' will be ignored. " in str(w[-1].message)
41+
42+
43+
def test_catboost_classifier_double_categories_in_y(categorical_floats_df):
    """Fitting should succeed when y is a Categorical series with float categories."""
    X = categorical_floats_df
    y = pd.Series(
        [1.0, 2.0, 3.0, 4.0, 5.0] * 20,
    )
    # ww.init_series returns a new, Woodwork-initialized series rather than
    # modifying its argument in place, so the result must be reassigned for
    # y to actually carry the Categorical logical type.
    y = ww.init_series(y, logical_type="Categorical")

    clf = CatBoostClassifier()
    fitted = clf.fit(X, y)
    assert isinstance(fitted, CatBoostClassifier)
53+
54+
55+
def test_catboost_classifier_double_categories_in_X(categorical_floats_df):
    # Fit/predict/predict_proba should all work even though X contains a
    # categorical column whose categories are float-typed (1.0, 2.0, ...).
    X = categorical_floats_df
    y = pd.Series([1, 2, 3, 4, 5] * 20)

    clf = CatBoostClassifier()
    fitted = clf.fit(X, y)
    assert isinstance(fitted, CatBoostClassifier)
    predictions = clf.predict(X)
    assert isinstance(predictions, pd.Series)
    predictions = clf.predict_proba(X)
    assert isinstance(predictions, pd.DataFrame)

evalml/tests/component_tests/test_catboost_regressor.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import warnings
22

3+
import pandas as pd
4+
35
from evalml.pipelines.components import CatBoostRegressor
46
from evalml.utils import SEED_BOUNDS
57

@@ -35,3 +37,14 @@ def test_catboost_regressor_init_thread_count():
3537
CatBoostRegressor(thread_count=2)
3638
assert len(w) == 1
3739
assert "Parameter 'thread_count' will be ignored. " in str(w[-1].message)
40+
41+
42+
def test_catboost_regressor_double_categories_in_X(categorical_floats_df):
    # Fit and predict should work even though X contains a categorical column
    # whose categories are float-typed; the component converts them internally.
    X = categorical_floats_df
    y = pd.Series([1, 2, 3, 4, 5] * 20)

    clf = CatBoostRegressor()
    fitted = clf.fit(X, y)
    assert isinstance(fitted, CatBoostRegressor)
    predictions = clf.predict(X)
    assert isinstance(predictions, pd.Series)

evalml/tests/component_tests/test_utils.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
drop_natural_language_columns,
2020
estimator_unable_to_handle_nans,
2121
handle_component_class,
22+
handle_float_categories_for_catboost,
2223
make_balancing_dictionary,
2324
scikit_learn_wrapped_estimator,
2425
set_boolean_columns_to_integer,
@@ -118,7 +119,9 @@ def test_all_components(
118119
):
119120
if is_using_conda:
120121
# No prophet, ARIMA, and vowpalwabbit
121-
expected_components = all_requirements_set.difference(not_supported_in_conda)
122+
expected_components = all_requirements_set.difference(
123+
not_supported_in_conda,
124+
)
122125
else:
123126
expected_components = all_requirements_set
124127
all_component_names = [component.name for component in all_components()]
@@ -149,7 +152,9 @@ class NonComponent:
149152

150153

151154
def test_scikit_learn_wrapper_invalid_problem_type():
152-
evalml_pipeline = MulticlassClassificationPipeline([RandomForestClassifier])
155+
evalml_pipeline = MulticlassClassificationPipeline(
156+
[RandomForestClassifier],
157+
)
153158
evalml_pipeline.problem_type = None
154159
with pytest.raises(
155160
ValueError,
@@ -313,3 +318,52 @@ def test_set_boolean_columns_to_integer():
313318
X_e.ww.select(["IntegerNullable"]),
314319
check_dtype=False,
315320
)
321+
322+
323+
def test_handle_float_categories_for_catboost(categorical_floats_df):
    X = categorical_floats_df
    X_t = handle_float_categories_for_catboost(X)

    # Since only the categories changed, the woodwork schema should be equal
    # before and after, but the dtypes Series should not be.
    assert X.ww.schema == X_t.ww.schema
    assert not X.dtypes.equals(X_t.dtypes)

    # Maps column name -> (category dtype before, category dtype after);
    # None means the column should pass through completely unchanged.
    expected_dtype_before_and_after = {
        "double_int_cats": ("float64", "int64"),
        # These shouldn't change
        "string_cats": None,
        "int_cats": None,
        "int_col": None,
        "double_col": None,
    }

    for col in X.columns:
        if before_and_after := expected_dtype_before_and_after.get(col):
            before_dtype, after_dtype = before_and_after
            assert X.dtypes[col].categories.dtype == before_dtype
            assert X_t.dtypes[col].categories.dtype == after_dtype
            # Confirm that the numeric values are still equal - we didn't truncate anything
            for i in range(len(X)):
                assert X[col].iloc[i] == float(X_t[col].iloc[i])
        else:
            pd.testing.assert_series_equal(X[col], X_t[col])
351+
352+
353+
def test_handle_float_categories_for_catboost_actual_floats():
    # Categories that are genuinely fractional cannot become ints without
    # truncation, so the util should raise rather than convert.
    X = pd.DataFrame({"really_double_cats": pd.Series([1.2, 2.3, 3.9, 4.1, 5.5] * 20)})
    X.ww.init(logical_types={"really_double_cats": "Categorical"})

    error = "CatBoost does not support floats as categories."
    with pytest.raises(ValueError, match=error):
        handle_float_categories_for_catboost(X)
360+
361+
362+
def test_handle_float_categories_for_catboost_noop(
    categorical_floats_df,
):
    # With no float-category columns present, the util should return the
    # input unchanged, with the same data and woodwork schema.
    X = categorical_floats_df.ww[["string_cats", "int_col", "int_cats"]]

    X_t = handle_float_categories_for_catboost(X)
    pd.testing.assert_frame_equal(X, X_t)
    assert X.ww.schema == X_t.ww.schema

evalml/tests/conftest.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2312,3 +2312,27 @@ def _return_proper_func(real_or_synthetic):
23122312
return generate_real_data
23132313

23142314
return _return_proper_func
2315+
2316+
2317+
@pytest.fixture
def categorical_floats_df():
    # A mix of column types for exercising CatBoost's float-category handling:
    # "double_int_cats" holds floats that are really whole numbers (1.0, 2.0, ...)
    # typed as Categorical, while the remaining columns should pass through
    # the conversion util untouched.
    X = pd.DataFrame(
        {
            "double_int_cats": pd.Series([1.0, 2.0, 3.0, 4.0, 5.0] * 20),
            "string_cats": pd.Series(["a", "b", "c", "d", "e"] * 20),
            "int_cats": pd.Series([1, 2, 3, 4, 5] * 20),
            "int_col": pd.Series([1, 2, 3, 4, 5] * 20),
            "double_col": pd.Series([1.2, 2.3, 3.9, 4.1, 5.5] * 20),
        },
    )
    X.ww.init(
        logical_types={
            "double_int_cats": "Categorical",
            "string_cats": "Categorical",
            "int_cats": "Categorical",
            "int_col": "Integer",
            "double_col": "Double",
        },
    )

    return X

0 commit comments

Comments
 (0)