Skip to content

Commit d9e7403

Browse files
Copilot and thinkall authored
Expose task-level and estimator-level preprocessors as public API (#1497)
* Initial plan
* Add public preprocess() API methods for AutoML and estimators
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
* Add documentation for preprocess() API methods
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
* Add example script demonstrating preprocess() API usage
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
* Address code review feedback - fix type hints and simplify test logic
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
* Fix formatting issues with pre-commit hooks
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
* Remove example.py, make tests faster
---------
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thinkall <3197038+thinkall@users.noreply.github.com>
Co-authored-by: Li Jiang <bnujli@gmail.com>
Co-authored-by: Li Jiang <lijiang1@microsoft.com>
1 parent 7ec1414 commit d9e7403

File tree

4 files changed

+368
-1
lines changed

4 files changed

+368
-1
lines changed

flaml/automl/automl.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -789,7 +789,7 @@ def score(
789789

790790
def predict(
791791
self,
792-
X: np.array | DataFrame | list[str] | list[list[str]] | psDataFrame,
792+
X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
793793
**pred_kwargs,
794794
):
795795
"""Predict label from features.
@@ -855,6 +855,50 @@ def predict_proba(self, X, **pred_kwargs):
855855
proba = self._trained_estimator.predict_proba(X, **pred_kwargs)
856856
return proba
857857

858+
def preprocess(
859+
self,
860+
X: np.ndarray | DataFrame | list[str] | list[list[str]] | psDataFrame,
861+
):
862+
"""Preprocess data using task-level preprocessing.
863+
864+
This method applies task-level preprocessing transformations to the input data,
865+
including handling of data types, sparse matrices, and feature transformations
866+
that were learned during the fit phase. This should be called before any
867+
estimator-level preprocessing.
868+
869+
Args:
870+
X: A numpy array or pandas dataframe or pyspark.pandas dataframe
871+
of featurized instances, shape n * m,
872+
or for time series forecast tasks:
873+
a pandas dataframe with the first column containing
874+
timestamp values (datetime type) or an integer n for
875+
the predict steps (only valid when the estimator is
876+
arima or sarimax). Other columns in the dataframe
877+
are assumed to be exogenous variables (categorical
878+
or numeric).
879+
880+
Returns:
881+
Preprocessed data in the same format as input (numpy array, DataFrame, etc.).
882+
883+
Raises:
884+
AttributeError: If the model has not been fitted yet.
885+
886+
Example:
887+
```python
888+
automl = AutoML()
889+
automl.fit(X_train, y_train, task="classification")
890+
891+
# Apply task-level preprocessing to new data
892+
X_test_preprocessed = automl.preprocess(X_test)
893+
```
894+
"""
895+
if not hasattr(self, "_state") or self._state is None:
896+
raise AttributeError("AutoML instance has not been fitted yet. Please call fit() first.")
897+
if not hasattr(self, "_transformer"):
898+
raise AttributeError("Transformer not initialized. Please call fit() first.")
899+
900+
return self._state.task.preprocess(X, self._transformer)
901+
858902
def add_learner(self, learner_name, learner_class):
859903
"""Add a customized learner.
860904

flaml/automl/model.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,35 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
295295
train_time = self._fit(X_train, y_train, **kwargs)
296296
return train_time
297297

298+
def preprocess(self, X):
    """Preprocess data using estimator-level preprocessing.

    Public wrapper that delegates to this estimator's ``_preprocess`` hook.
    This is the second level of preprocessing and should be applied after
    task-level preprocessing (``automl.preprocess()``). Different estimator
    types may apply different preprocessing steps (e.g., sparse matrix
    conversion, dataframe handling).

    Args:
        X: A numpy array or a dataframe of featurized instances, shape n*m.

    Returns:
        The result of ``self._preprocess(X)``, i.e. data in the form that can
        be passed to the underlying model (``estimator._model``).

    Example:
        ```python
        automl = AutoML()
        automl.fit(X_train, y_train, task="classification")

        # First apply task-level preprocessing
        X_test_task = automl.preprocess(X_test)

        # Then apply estimator-level preprocessing
        estimator = automl.model
        X_test_estimator = estimator.preprocess(X_test_task)

        # Feed the fully preprocessed data to the underlying model
        predictions = estimator._model.predict(X_test_estimator)
        ```
    """
    return self._preprocess(X)
326+
298327
def predict(self, X, **kwargs):
299328
"""Predict label from features.
300329

test/automl/test_preprocess_api.py

Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
"""Tests for the public preprocessor APIs."""
2+
import unittest
3+
4+
import numpy as np
5+
import pandas as pd
6+
from sklearn.datasets import load_breast_cancer, load_diabetes
7+
8+
from flaml import AutoML
9+
10+
11+
class TestPreprocessAPI(unittest.TestCase):
    """Test cases for the public preprocess() API methods."""

    @staticmethod
    def _fit_lgbm(X_train, y_train, task="classification", metric="accuracy"):
        """Run a short LightGBM-only AutoML search and return the fitted instance."""
        automl = AutoML()
        automl.fit(
            X_train,
            y_train,
            max_iter=5,
            task=task,
            metric=metric,
            estimator_list=["lgbm"],
            verbose=0,
        )
        return automl

    @staticmethod
    def _breast_cancer_split():
        """Return (X_train, y_train, X_test) slices of the breast cancer dataset."""
        X, y = load_breast_cancer(return_X_y=True)
        return X[:400], y[:400], X[400:450]

    def test_automl_preprocess_before_fit(self):
        """Test that calling preprocess before fit raises an error."""
        automl = AutoML()
        X_test = np.array([[1, 2, 3], [4, 5, 6]])

        with self.assertRaises(AttributeError) as context:
            automl.preprocess(X_test)
        # The message must point the user at fit(), whichever guard fired.
        self.assertIn("fit()", str(context.exception))

    def test_automl_preprocess_classification(self):
        """Test task-level preprocessing for classification."""
        X_train, y_train, X_test = self._breast_cancer_split()
        automl = self._fit_lgbm(X_train, y_train)

        X_preprocessed = automl.preprocess(X_test)

        # Verify the output is not None and keeps one row per input row
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_regression(self):
        """Test task-level preprocessing for regression."""
        X, y = load_diabetes(return_X_y=True)
        X_train, y_train = X[:300], y[:300]
        X_test = X[300:350]
        automl = self._fit_lgbm(X_train, y_train, task="regression", metric="r2")

        X_preprocessed = automl.preprocess(X_test)

        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_with_dataframe(self):
        """Test task-level preprocessing with pandas DataFrame."""
        X_train = pd.DataFrame(
            {
                "feature1": [1, 2, 3, 4, 5] * 20,
                "feature2": [5, 4, 3, 2, 1] * 20,
                "category": ["a", "b", "a", "b", "a"] * 20,
            }
        )
        y_train = pd.Series([0, 1, 0, 1, 0] * 20)
        X_test = pd.DataFrame(
            {
                "feature1": [6, 7, 8],
                "feature2": [1, 2, 3],
                "category": ["a", "b", "a"],
            }
        )
        automl = self._fit_lgbm(X_train, y_train)

        X_preprocessed = automl.preprocess(X_test)

        # Check the number of rows matches
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(len(X_preprocessed), len(X_test))

    def test_estimator_preprocess(self):
        """Test estimator-level preprocessing."""
        X_train, y_train, X_test = self._breast_cancer_split()
        automl = self._fit_lgbm(X_train, y_train)

        estimator = automl.model
        self.assertIsNotNone(estimator)

        # Task-level preprocessing first, estimator-level second
        X_task_preprocessed = automl.preprocess(X_test)
        X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed)

        self.assertIsNotNone(X_estimator_preprocessed)
        self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0])

    def test_preprocess_pipeline(self):
        """Test the complete preprocessing pipeline (task-level then estimator-level)."""
        X_train, y_train, X_test = self._breast_cancer_split()
        automl = self._fit_lgbm(X_train, y_train)

        # Apply the complete preprocessing pipeline by hand
        X_task_preprocessed = automl.preprocess(X_test)
        X_final = automl.model.preprocess(X_task_preprocessed)

        # automl.predict() preprocesses internally; feeding the manually
        # preprocessed data to the underlying model must give the same result.
        y_pred_manual = automl.model._model.predict(X_final)
        y_pred_auto = automl.predict(X_test)

        np.testing.assert_array_equal(y_pred_manual, y_pred_auto)

    def test_preprocess_with_mixed_types(self):
        """Test preprocessing with mixed data types."""
        # Seeded generator keeps the fixture identical on every run.
        rng = np.random.default_rng(0)
        X_train = pd.DataFrame(
            {
                "numeric1": rng.random(100),
                "numeric2": rng.integers(0, 100, 100),
                "categorical": rng.choice(["cat", "dog", "bird"], 100),
                "boolean": rng.choice([True, False], 100),
            }
        )
        y_train = pd.Series(rng.integers(0, 2, 100))
        X_test = pd.DataFrame(
            {
                "numeric1": rng.random(10),
                "numeric2": rng.integers(0, 100, 10),
                "categorical": rng.choice(["cat", "dog", "bird"], 10),
                "boolean": rng.choice([True, False], 10),
            }
        )
        automl = self._fit_lgbm(X_train, y_train)

        X_preprocessed = automl.preprocess(X_test)

        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(len(X_preprocessed), len(X_test))

    def test_estimator_preprocess_without_automl(self):
        """Test that estimator.preprocess() can be used independently."""
        from flaml.automl.model import LGBMEstimator

        # Seeded generator keeps the fixture identical on every run.
        rng = np.random.default_rng(0)
        X_train = rng.random((100, 5))
        y_train = rng.integers(0, 2, 100)

        estimator = LGBMEstimator(task="classification")
        estimator.fit(X_train, y_train)

        X_test = rng.random((10, 5))
        X_preprocessed = estimator.preprocess(X_test)

        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape, X_test.shape)
233+
234+
235+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

website/docs/Use-Cases/Task-Oriented-AutoML.md

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,64 @@ plt.barh(
726726

727727
![png](images/feature_importance.png)
728728

729+
### Preprocess data
730+
731+
FLAML provides two levels of preprocessing that can be accessed as public APIs:
732+
733+
1. **Task-level preprocessing** (`automl.preprocess()`): This applies transformations that are specific to the task type, such as handling data types, sparse matrices, and feature transformations learned during training.
734+
735+
1. **Estimator-level preprocessing** (`estimator.preprocess()`): This applies transformations specific to the estimator type (e.g., LightGBM, XGBoost).
736+
737+
The task-level preprocessing should be applied before the estimator-level preprocessing.
738+
739+
#### Task-level preprocessing
740+
741+
```python
742+
from flaml import AutoML
743+
import numpy as np
744+
745+
# Train the model
746+
automl = AutoML()
747+
automl.fit(X_train, y_train, task="classification", time_budget=60)
748+
749+
# Apply task-level preprocessing to new data
750+
X_test_preprocessed = automl.preprocess(X_test)
751+
752+
# Now you can use this with the estimator
753+
predictions = automl.model.predict(X_test_preprocessed)
754+
```
755+
756+
#### Estimator-level preprocessing
757+
758+
```python
759+
# Get the trained estimator
760+
estimator = automl.model
761+
762+
# Apply task-level preprocessing first
763+
X_test_task = automl.preprocess(X_test)
764+
765+
# Then apply estimator-level preprocessing
766+
X_test_estimator = estimator.preprocess(X_test_task)
767+
768+
# Use the fully preprocessed data with the underlying model
769+
predictions = estimator._model.predict(X_test_estimator)
770+
```
771+
772+
#### Complete preprocessing pipeline
773+
774+
For most use cases, the `predict()` method already handles both levels of preprocessing internally. However, if you need to apply preprocessing separately (e.g., for custom inference pipelines or debugging), you can use:
775+
776+
```python
777+
# Complete preprocessing pipeline
778+
X_task_preprocessed = automl.preprocess(X_test)
779+
X_final = automl.model.preprocess(X_task_preprocessed)
780+
781+
# automl.predict() runs these same two preprocessing steps internally, so
# automl.model._model.predict(X_final) gives the same result as:
782+
predictions = automl.predict(X_test)
783+
```
784+
785+
**Note**: The `preprocess()` methods can only be called after `fit()` has been executed, as they rely on the transformations learned during training.
786+
729787
### Get best configuration
730788

731789
We can find the best estimator's name and best configuration by:

0 commit comments

Comments
 (0)