Skip to content

Commit 481c92c

Browse files
authored
Allow float categories to be passed into CatBoost estimators (#3966)
* add initial, messy, implementation of util to remove float cats * use fixture for tests and test predict * consolidate tests * Init woodwork at new predict method * raise error for true float case and refactor * Clean up * Add releaser note * remove arrow comment * Add predict proba to classifier
1 parent a409e17 commit 481c92c

File tree

8 files changed

+216
-3
lines changed

8 files changed

+216
-3
lines changed

docs/source/release_notes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Release Notes
44
* Enhancements
55
* Fixes
66
* Re-added ``TimeSeriesPipeline.should_skip_featurization`` to fix bug where data would get featurized unnecessarily :pr:`3964`
7+
* Allow float categories to be passed into CatBoost estimators :pr:`3966`
78
* Changes
89
* Update pyproject.toml to correctly specify the data filepaths :pr:`3967`
910
* Documentation Changes

evalml/pipelines/components/estimators/classifiers/catboost_classifier.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from evalml.model_family import ModelFamily
1010
from evalml.pipelines.components.estimators import Estimator
1111
from evalml.pipelines.components.transformers import LabelEncoder
12+
from evalml.pipelines.components.utils import handle_float_categories_for_catboost
1213
from evalml.problem_types import ProblemTypes
1314
from evalml.utils import import_or_raise, infer_feature_types
1415

@@ -119,6 +120,8 @@ def fit(self, X, y=None):
119120
if y.nunique() <= 2:
120121
self._label_encoder = LabelEncoder()
121122
y = self._label_encoder.fit_transform(None, y)[1]
123+
124+
X = handle_float_categories_for_catboost(X)
122125
self._component_obj.fit(X, y, silent=True, cat_features=cat_cols)
123126
return self
124127

@@ -129,9 +132,10 @@ def predict(self, X):
129132
X (pd.DataFrame): Data of shape [n_samples, n_features].
130133
131134
Returns:
132-
pd.DataFrame: Predicted values.
135+
pd.Series: Predicted values.
133136
"""
134137
X = infer_feature_types(X)
138+
X = handle_float_categories_for_catboost(X)
135139
predictions = self._component_obj.predict(X)
136140
if predictions.ndim == 2 and predictions.shape[1] == 1:
137141
predictions = predictions.flatten()
@@ -143,6 +147,20 @@ def predict(self, X):
143147
predictions.index = X.index
144148
return predictions
145149

150+
def predict_proba(self, X):
    """Return class probability estimates from the fitted CatBoost classifier.

    Args:
        X (pd.DataFrame): Data of shape [n_samples, n_features].

    Returns:
        pd.DataFrame: Predicted probability values.
    """
    # Cast whole-number float categories to int64 first so CatBoost
    # does not reject the categorical columns.
    features = handle_float_categories_for_catboost(infer_feature_types(X))
    return super().predict_proba(features)
163+
146164
@property
147165
def feature_importance(self):
148166
"""Feature importance of fitted CatBoost classifier."""

evalml/pipelines/components/estimators/regressors/catboost_regressor.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from evalml.model_family import ModelFamily
99
from evalml.pipelines.components.estimators import Estimator
10+
from evalml.pipelines.components.utils import handle_float_categories_for_catboost
1011
from evalml.problem_types import ProblemTypes
1112
from evalml.utils import (
1213
downcast_int_nullable_to_double,
@@ -113,9 +114,25 @@ def fit(self, X, y=None):
113114
self.input_feature_names = list(X.columns)
114115
X, y = super()._manage_woodwork(X, y)
115116
X = downcast_int_nullable_to_double(X)
117+
118+
X = handle_float_categories_for_catboost(X)
116119
self._component_obj.fit(X, y, silent=True, cat_features=cat_cols)
117120
return self
118121

122+
def predict(self, X):
    """Make predictions using the fitted CatBoost regressor.

    Args:
        X (pd.DataFrame): Data of shape [n_samples, n_features].

    Returns:
        pd.Series: Predicted values.
    """
    X = infer_feature_types(X)
    # Convert whole-number float categories to int64 so CatBoost accepts them.
    X = handle_float_categories_for_catboost(X)
    predictions = super().predict(X)
    return predictions
135+
119136
@property
120137
def feature_importance(self):
121138
"""Feature importance of fitted CatBoost regressor."""

evalml/pipelines/components/utils.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,3 +471,61 @@ def make_balancing_dictionary(y, sampling_ratio):
471471
# this class is already larger than the ratio, don't change
472472
class_dic[index] = value_counts[index]
473473
return class_dic
474+
475+
476+
def handle_float_categories_for_catboost(X):
    """Make categorical columns with float categories safe for CatBoost.

    CatBoost errors on Categorical Woodwork columns whose category values are
    floating point. When every float category is actually a whole number
    (e.g. 1.0, 2.0), the column's categories are recast to int64 without any
    loss of information; columns whose categories are genuinely fractional
    cannot be converted and raise instead.

    Args:
        X (pd.DataFrame): Input data to CatBoost that has Woodwork initialized.

    Returns:
        DataFrame: Data with the exact same Woodwork typing info as the input
            but with any float categories converted to int64 when possible.

    Raises:
        ValueError: If the numeric categories are actual floats that cannot be
            converted to integers without truncating data.
    """
    schema = X.ww.schema
    dtypes = X.dtypes

    # Find categorical columns whose category values are stored as float64;
    # these are the ones CatBoost would reject.
    float_cat_cols = []
    for col in X.ww.select("category", return_schema=True).columns.keys():
        if dtypes[col].categories.dtype == "float64":
            float_cat_cols.append(col)

    if not float_cat_cols:
        return X

    # Split the float-category columns into "really ints" (convertible) and
    # "true floats" (not convertible without truncation).
    replacement_dtypes = {}
    for col in float_cat_cols:
        categories = dtypes[col].categories
        if not (categories % 1 == 0).all():
            # CatBoost explanation as to why they don't support float categories: https://catboost.ai/en/docs/concepts/faq#floating-point-values
            # CatBoost bug keeping us from converting to string: https://github.com/catboost/catboost/issues/1965
            # Pandas bug keeping us from converting `.astype("string").astype("object")`: https://github.com/pandas-dev/pandas/issues/51074
            raise ValueError(
                f"Invalid category found in {col}. CatBoost does not support floats as categories.",
            )
        # Non-nullable int64 is safe here because there are no nans among the
        # category values at this point.
        replacement_dtypes[col] = pd.CategoricalDtype(
            categories=categories.astype("int64"),
            ordered=dtypes[col].ordered,
        )

    X_t = X.astype(replacement_dtypes)
    X_t.ww.init(schema=schema)
    return X_t

evalml/tests/component_tests/test_catboost_classifier.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
import warnings
22

3+
import pandas as pd
4+
import woodwork as ww
5+
36
from evalml.pipelines.components import CatBoostClassifier
47
from evalml.utils import SEED_BOUNDS
58

@@ -35,3 +38,28 @@ def test_catboost_classifier_init_thread_count():
3538
CatBoostClassifier(thread_count=2)
3639
assert len(w) == 1
3740
assert "Parameter 'thread_count' will be ignored. " in str(w[-1].message)
41+
42+
43+
def test_catboost_classifier_double_categories_in_y(categorical_floats_df):
    """Fitting should succeed when y is a Categorical series with float categories."""
    X = categorical_floats_df
    y = pd.Series(
        [1.0, 2.0, 3.0, 4.0, 5.0] * 20,
    )
    # ww.init_series returns a new, Woodwork-initialized series rather than
    # modifying its argument in place, so the result must be reassigned for
    # y to actually carry the Categorical logical type.
    y = ww.init_series(y, logical_type="Categorical")

    clf = CatBoostClassifier()
    fitted = clf.fit(X, y)
    assert isinstance(fitted, CatBoostClassifier)
53+
54+
55+
def test_catboost_classifier_double_categories_in_X(categorical_floats_df):
    # Fit/predict/predict_proba should all work even though X contains a
    # categorical column whose categories are float-typed (1.0, 2.0, ...).
    X = categorical_floats_df
    y = pd.Series([1, 2, 3, 4, 5] * 20)

    clf = CatBoostClassifier()
    fitted = clf.fit(X, y)
    assert isinstance(fitted, CatBoostClassifier)
    predictions = clf.predict(X)
    assert isinstance(predictions, pd.Series)
    predictions = clf.predict_proba(X)
    assert isinstance(predictions, pd.DataFrame)

evalml/tests/component_tests/test_catboost_regressor.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import warnings
22

3+
import pandas as pd
4+
35
from evalml.pipelines.components import CatBoostRegressor
46
from evalml.utils import SEED_BOUNDS
57

@@ -35,3 +37,14 @@ def test_catboost_regressor_init_thread_count():
3537
CatBoostRegressor(thread_count=2)
3638
assert len(w) == 1
3739
assert "Parameter 'thread_count' will be ignored. " in str(w[-1].message)
40+
41+
42+
def test_catboost_regressor_double_categories_in_X(categorical_floats_df):
    # Fit and predict should work even though X contains a categorical column
    # whose categories are float-typed; the component converts them internally.
    X = categorical_floats_df
    y = pd.Series([1, 2, 3, 4, 5] * 20)

    clf = CatBoostRegressor()
    fitted = clf.fit(X, y)
    assert isinstance(fitted, CatBoostRegressor)
    predictions = clf.predict(X)
    assert isinstance(predictions, pd.Series)

evalml/tests/component_tests/test_utils.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
drop_natural_language_columns,
2020
estimator_unable_to_handle_nans,
2121
handle_component_class,
22+
handle_float_categories_for_catboost,
2223
make_balancing_dictionary,
2324
scikit_learn_wrapped_estimator,
2425
set_boolean_columns_to_integer,
@@ -118,7 +119,9 @@ def test_all_components(
118119
):
119120
if is_using_conda:
120121
# No prophet, ARIMA, and vowpalwabbit
121-
expected_components = all_requirements_set.difference(not_supported_in_conda)
122+
expected_components = all_requirements_set.difference(
123+
not_supported_in_conda,
124+
)
122125
else:
123126
expected_components = all_requirements_set
124127
all_component_names = [component.name for component in all_components()]
@@ -149,7 +152,9 @@ class NonComponent:
149152

150153

151154
def test_scikit_learn_wrapper_invalid_problem_type():
152-
evalml_pipeline = MulticlassClassificationPipeline([RandomForestClassifier])
155+
evalml_pipeline = MulticlassClassificationPipeline(
156+
[RandomForestClassifier],
157+
)
153158
evalml_pipeline.problem_type = None
154159
with pytest.raises(
155160
ValueError,
@@ -313,3 +318,52 @@ def test_set_boolean_columns_to_integer():
313318
X_e.ww.select(["IntegerNullable"]),
314319
check_dtype=False,
315320
)
321+
322+
323+
def test_handle_float_categories_for_catboost(categorical_floats_df):
    X = categorical_floats_df
    X_t = handle_float_categories_for_catboost(X)

    # Since only the categories changed, the woodwork schema should be equal
    # before and after, but the dtypes Series should not be.
    assert X.ww.schema == X_t.ww.schema
    assert not X.dtypes.equals(X_t.dtypes)

    # Maps column name -> (category dtype before, category dtype after);
    # None means the column should pass through completely unchanged.
    expected_dtype_before_and_after = {
        "double_int_cats": ("float64", "int64"),
        # These shouldn't change
        "string_cats": None,
        "int_cats": None,
        "int_col": None,
        "double_col": None,
    }

    for col in X.columns:
        if before_and_after := expected_dtype_before_and_after.get(col):
            before_dtype, after_dtype = before_and_after
            assert X.dtypes[col].categories.dtype == before_dtype
            assert X_t.dtypes[col].categories.dtype == after_dtype
            # Confirm that the numeric values are still equal - we didn't truncate anything
            for i in range(len(X)):
                assert X[col].iloc[i] == float(X_t[col].iloc[i])
        else:
            pd.testing.assert_series_equal(X[col], X_t[col])
351+
352+
353+
def test_handle_float_categories_for_catboost_actual_floats():
    # Categories that are genuinely fractional cannot become ints without
    # truncation, so the util should raise rather than convert.
    X = pd.DataFrame({"really_double_cats": pd.Series([1.2, 2.3, 3.9, 4.1, 5.5] * 20)})
    X.ww.init(logical_types={"really_double_cats": "Categorical"})

    error = "CatBoost does not support floats as categories."
    with pytest.raises(ValueError, match=error):
        handle_float_categories_for_catboost(X)
360+
361+
362+
def test_handle_float_categories_for_catboost_noop(
    categorical_floats_df,
):
    # With no float-category columns present, the util should return the
    # input unchanged, with the same data and woodwork schema.
    X = categorical_floats_df.ww[["string_cats", "int_col", "int_cats"]]

    X_t = handle_float_categories_for_catboost(X)
    pd.testing.assert_frame_equal(X, X_t)
    assert X.ww.schema == X_t.ww.schema

evalml/tests/conftest.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2312,3 +2312,27 @@ def _return_proper_func(real_or_synthetic):
23122312
return generate_real_data
23132313

23142314
return _return_proper_func
2315+
2316+
2317+
@pytest.fixture
def categorical_floats_df():
    # A mix of column types for exercising CatBoost's float-category handling:
    # "double_int_cats" holds floats that are really whole numbers (1.0, 2.0, ...)
    # typed as Categorical, while the remaining columns should pass through
    # the conversion util untouched.
    X = pd.DataFrame(
        {
            "double_int_cats": pd.Series([1.0, 2.0, 3.0, 4.0, 5.0] * 20),
            "string_cats": pd.Series(["a", "b", "c", "d", "e"] * 20),
            "int_cats": pd.Series([1, 2, 3, 4, 5] * 20),
            "int_col": pd.Series([1, 2, 3, 4, 5] * 20),
            "double_col": pd.Series([1.2, 2.3, 3.9, 4.1, 5.5] * 20),
        },
    )
    X.ww.init(
        logical_types={
            "double_int_cats": "Categorical",
            "string_cats": "Categorical",
            "int_cats": "Categorical",
            "int_col": "Integer",
            "double_col": "Double",
        },
    )

    return X

0 commit comments

Comments
 (0)