Skip to content

Commit 1687ca9

Browse files
Copilot and thinkall authored
Fix eval_set preprocessing for XGBoost estimators with categorical features (#1470)
* Initial plan
* Initial analysis - reproduced eval_set preprocessing bug
  Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
* Fix eval_set preprocessing for XGBoost estimators with categorical features
  Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
* Add eval_set tests to test_xgboost function
  Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
* Fix linting issues with ruff and black
  Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
---------
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
1 parent 7a597ad commit 1687ca9

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed

flaml/default/estimator.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,27 @@ def suggest_hyperparams(self, X, y):
9595
def fit(self, X, y, *args, **params):
9696
hyperparams, estimator_name, X, y_transformed = self.suggest_hyperparams(X, y)
9797
self.set_params(**hyperparams)
98+
99+
# Transform eval_set if present
100+
if "eval_set" in params and params["eval_set"] is not None:
101+
transformed_eval_set = []
102+
for eval_X, eval_y in params["eval_set"]:
103+
# Transform features
104+
eval_X_transformed = self._feature_transformer.transform(eval_X)
105+
# Transform labels if applicable
106+
if self._label_transformer and estimator_name in [
107+
"rf",
108+
"extra_tree",
109+
"xgboost",
110+
"xgb_limitdepth",
111+
"choose_xgb",
112+
]:
113+
eval_y_transformed = self._label_transformer.transform(eval_y)
114+
transformed_eval_set.append((eval_X_transformed, eval_y_transformed))
115+
else:
116+
transformed_eval_set.append((eval_X_transformed, eval_y))
117+
params["eval_set"] = transformed_eval_set
118+
98119
if self._label_transformer and estimator_name in [
99120
"rf",
100121
"extra_tree",

test/default/test_defaults.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ def test_lgbm():
183183

184184

185185
def test_xgboost():
186+
import numpy as np
187+
186188
from flaml.default import XGBClassifier, XGBRegressor
187189

188190
X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)
@@ -200,6 +202,65 @@ def test_xgboost():
200202
regressor.predict(X_train)
201203
print(regressor)
202204

205+
# Test eval_set with categorical features (Issue: eval_set not preprocessed)
206+
np.random.seed(42)
207+
n = 500
208+
df = pd.DataFrame(
209+
{
210+
"num1": np.random.randn(n),
211+
"num2": np.random.rand(n) * 10,
212+
"cat1": np.random.choice(["A", "B", "C"], size=n),
213+
"cat2": np.random.choice(["X", "Y"], size=n),
214+
"target": np.random.choice([0, 1], size=n),
215+
}
216+
)
217+
218+
X = df.drop(columns="target")
219+
y = df["target"]
220+
221+
X_train_cat, X_valid_cat, y_train_cat, y_valid_cat = train_test_split(X, y, test_size=0.2, random_state=0)
222+
223+
# Convert categorical columns to pandas 'category' dtype
224+
for col in X_train_cat.select_dtypes(include="object").columns:
225+
X_train_cat[col] = X_train_cat[col].astype("category")
226+
X_valid_cat[col] = X_valid_cat[col].astype("category")
227+
228+
# Test XGBClassifier with eval_set
229+
classifier_eval = XGBClassifier(
230+
tree_method="hist",
231+
enable_categorical=True,
232+
eval_metric="logloss",
233+
use_label_encoder=False,
234+
early_stopping_rounds=10,
235+
random_state=0,
236+
n_estimators=10,
237+
)
238+
classifier_eval.fit(X_train_cat, y_train_cat, eval_set=[(X_valid_cat, y_valid_cat)], verbose=False)
239+
y_pred = classifier_eval.predict(X_valid_cat)
240+
assert len(y_pred) == len(y_valid_cat)
241+
242+
# Test XGBRegressor with eval_set
243+
y_reg = df["num1"] # Use num1 as target for regression
244+
X_reg = df.drop(columns=["num1", "target"])
245+
246+
X_train_reg, X_valid_reg, y_train_reg, y_valid_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)
247+
248+
for col in X_train_reg.select_dtypes(include="object").columns:
249+
X_train_reg[col] = X_train_reg[col].astype("category")
250+
X_valid_reg[col] = X_valid_reg[col].astype("category")
251+
252+
regressor_eval = XGBRegressor(
253+
tree_method="hist",
254+
enable_categorical=True,
255+
eval_metric="rmse",
256+
early_stopping_rounds=10,
257+
random_state=0,
258+
n_estimators=10,
259+
)
260+
regressor_eval.fit(X_train_reg, y_train_reg, eval_set=[(X_valid_reg, y_valid_reg)], verbose=False)
261+
y_pred = regressor_eval.predict(X_valid_reg)
262+
assert len(y_pred) == len(y_valid_reg)
263+
203264

204265
def test_nobudget():
205266
X_train, y_train = load_breast_cancer(return_X_y=True, as_frame=True)

0 commit comments

Comments
 (0)