Commit 11787ce

Merge branch 'main' into copilot/fix-training-test-set-overlap
2 parents: 4a1ddda + fa1a32a

4 files changed: +304 −19 lines changed

flaml/automl/automl.py

Lines changed: 4 additions & 0 deletions

@@ -118,6 +118,8 @@ def __init__(self, **settings):
                 e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_weighted',
                 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', 'f1', 'micro_f1', 'macro_f1',
                 'log_loss', 'mae', 'mse', 'r2', 'mape'. Default is 'auto'.
+                For a full list of supported built-in metrics, please refer to
+                https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#optimization-metric
                 If passing a customized metric function, the function needs to
                 have the following input arguments:
@@ -1765,6 +1767,8 @@ def fit(
                 e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_weighted',
                 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', 'f1', 'micro_f1', 'macro_f1',
                 'log_loss', 'mae', 'mse', 'r2', 'mape'. Default is 'auto'.
+                For a full list of supported built-in metrics, please refer to
+                https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML#optimization-metric
                 If passing a customized metric function, the function needs to
                 have the following input arguments:
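For context, these docstring lists name the strings accepted by the `metric` argument. A minimal usage sketch (the `X_train`/`y_train` variables and the settings shown are illustrative placeholders, not part of this commit):

```python
from flaml import AutoML

automl = AutoML()
# any built-in metric name from the docstring above can be passed as a string;
# "roc_auc" and the 60-second budget are purely illustrative
automl.fit(X_train, y_train, task="classification", metric="roc_auc", time_budget=60)
```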

flaml/version.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = "2.4.2"
+__version__ = "2.5.0"

website/docs/Examples/Default-Flamlized.md

Lines changed: 232 additions & 0 deletions

@@ -67,6 +67,82 @@ X_test.shape: (5160, 8), y_test.shape: (5160,)
 
 [Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/zeroshot_lightgbm.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/zeroshot_lightgbm.ipynb)
 
+## Flamlized LGBMClassifier
+
+### Prerequisites
+
+This example requires the [autozero] option.
+
+```bash
+pip install flaml[autozero] lightgbm openml
+```
+
+### Zero-shot AutoML
+
+```python
+from flaml.automl.data import load_openml_dataset
+from flaml.default import LGBMClassifier
+from flaml.automl.ml import sklearn_metric_loss_score
+
+X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./")
+lgbm = LGBMClassifier()
+lgbm.fit(X_train, y_train)
+y_pred = lgbm.predict(X_test)
+print(
+    "flamlized lgbm accuracy",
+    "=",
+    1 - sklearn_metric_loss_score("accuracy", y_pred, y_test),
+)
+print(lgbm)
+```
+
+#### Sample output
+
+```
+load dataset from ./openml_ds1169.pkl
+Dataset name: airlines
+X_train.shape: (404537, 7), y_train.shape: (404537,);
+X_test.shape: (134846, 7), y_test.shape: (134846,)
+flamlized lgbm accuracy = 0.6745
+LGBMClassifier(colsample_bytree=0.85, learning_rate=0.05, max_bin=255,
+               min_child_samples=20, n_estimators=500, num_leaves=31,
+               reg_alpha=0.01, reg_lambda=0.1, verbose=-1)
+```
+
+## Flamlized XGBRegressor
+
+### Prerequisites
+
+This example requires xgboost, sklearn, openml==0.10.2.
+
+### Zero-shot AutoML
+
+```python
+from flaml.automl.data import load_openml_dataset
+from flaml.default import XGBRegressor
+from flaml.automl.ml import sklearn_metric_loss_score
+
+X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir="./")
+xgb = XGBRegressor()
+xgb.fit(X_train, y_train)
+y_pred = xgb.predict(X_test)
+print("flamlized xgb r2", "=", 1 - sklearn_metric_loss_score("r2", y_pred, y_test))
+print(xgb)
+```
+
+#### Sample output
+
+```
+load dataset from ./openml_ds537.pkl
+Dataset name: houses
+X_train.shape: (15480, 8), y_train.shape: (15480,);
+X_test.shape: (5160, 8), y_test.shape: (5160,)
+flamlized xgb r2 = 0.8542
+XGBRegressor(colsample_bylevel=1, colsample_bytree=0.85, learning_rate=0.05,
+             max_depth=6, n_estimators=500, reg_alpha=0.01, reg_lambda=1.0,
+             subsample=0.9)
+```
+
 ## Flamlized XGBClassifier
 
 ### Prerequisites
@@ -112,3 +188,159 @@ XGBClassifier(base_score=0.5, booster='gbtree',
               scale_pos_weight=1, subsample=1.0, tree_method='hist',
               use_label_encoder=False, validate_parameters=1, verbosity=0)
 ```
+
+## Flamlized RandomForestRegressor
+
+### Prerequisites
+
+This example requires the [autozero] option.
+
+```bash
+pip install flaml[autozero] scikit-learn openml
+```
+
+### Zero-shot AutoML
+
+```python
+from flaml.automl.data import load_openml_dataset
+from flaml.default import RandomForestRegressor
+from flaml.automl.ml import sklearn_metric_loss_score
+
+X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir="./")
+rf = RandomForestRegressor()
+rf.fit(X_train, y_train)
+y_pred = rf.predict(X_test)
+print("flamlized rf r2", "=", 1 - sklearn_metric_loss_score("r2", y_pred, y_test))
+print(rf)
+```
+
+#### Sample output
+
+```
+load dataset from ./openml_ds537.pkl
+Dataset name: houses
+X_train.shape: (15480, 8), y_train.shape: (15480,);
+X_test.shape: (5160, 8), y_test.shape: (5160,)
+flamlized rf r2 = 0.8521
+RandomForestRegressor(max_features=0.8, min_samples_leaf=2, min_samples_split=5,
+                      n_estimators=500)
+```
+
+## Flamlized RandomForestClassifier
+
+### Prerequisites
+
+This example requires the [autozero] option.
+
+```bash
+pip install flaml[autozero] scikit-learn openml
+```
+
+### Zero-shot AutoML
+
+```python
+from flaml.automl.data import load_openml_dataset
+from flaml.default import RandomForestClassifier
+from flaml.automl.ml import sklearn_metric_loss_score
+
+X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./")
+rf = RandomForestClassifier()
+rf.fit(X_train, y_train)
+y_pred = rf.predict(X_test)
+print(
+    "flamlized rf accuracy",
+    "=",
+    1 - sklearn_metric_loss_score("accuracy", y_pred, y_test),
+)
+print(rf)
+```
+
+#### Sample output
+
+```
+load dataset from ./openml_ds1169.pkl
+Dataset name: airlines
+X_train.shape: (404537, 7), y_train.shape: (404537,);
+X_test.shape: (134846, 7), y_test.shape: (134846,)
+flamlized rf accuracy = 0.6701
+RandomForestClassifier(max_features=0.7, min_samples_leaf=3, min_samples_split=5,
+                       n_estimators=500)
+```
+
+## Flamlized ExtraTreesRegressor
+
+### Prerequisites
+
+This example requires the [autozero] option.
+
+```bash
+pip install flaml[autozero] scikit-learn openml
+```
+
+### Zero-shot AutoML
+
+```python
+from flaml.automl.data import load_openml_dataset
+from flaml.default import ExtraTreesRegressor
+from flaml.automl.ml import sklearn_metric_loss_score
+
+X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir="./")
+et = ExtraTreesRegressor()
+et.fit(X_train, y_train)
+y_pred = et.predict(X_test)
+print("flamlized et r2", "=", 1 - sklearn_metric_loss_score("r2", y_pred, y_test))
+print(et)
+```
+
+#### Sample output
+
+```
+load dataset from ./openml_ds537.pkl
+Dataset name: houses
+X_train.shape: (15480, 8), y_train.shape: (15480,);
+X_test.shape: (5160, 8), y_test.shape: (5160,)
+flamlized et r2 = 0.8534
+ExtraTreesRegressor(max_features=0.75, min_samples_leaf=2, min_samples_split=5,
+                    n_estimators=500)
+```
+
+## Flamlized ExtraTreesClassifier
+
+### Prerequisites
+
+This example requires the [autozero] option.
+
+```bash
+pip install flaml[autozero] scikit-learn openml
+```
+
+### Zero-shot AutoML
+
+```python
+from flaml.automl.data import load_openml_dataset
+from flaml.default import ExtraTreesClassifier
+from flaml.automl.ml import sklearn_metric_loss_score
+
+X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir="./")
+et = ExtraTreesClassifier()
+et.fit(X_train, y_train)
+y_pred = et.predict(X_test)
+print(
+    "flamlized et accuracy",
+    "=",
+    1 - sklearn_metric_loss_score("accuracy", y_pred, y_test),
+)
+print(et)
+```
+
+#### Sample output
+
+```
+load dataset from ./openml_ds1169.pkl
+Dataset name: airlines
+X_train.shape: (404537, 7), y_train.shape: (404537,);
+X_test.shape: (134846, 7), y_test.shape: (134846,)
+flamlized et accuracy = 0.6698
+ExtraTreesClassifier(max_features=0.7, min_samples_leaf=3, min_samples_split=5,
+                     n_estimators=500)
+```
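All of the new sections follow the same pattern: each flamlized estimator is intended as a drop-in replacement for its plain counterpart, so only the import changes. A minimal sketch (reusing the `X_train`/`y_train` produced by the loaders shown above):

```python
# from lightgbm import LGBMClassifier     # plain LightGBM defaults
from flaml.default import LGBMClassifier  # zero-shot, data-dependent defaults

clf = LGBMClassifier()
clf.fit(X_train, y_train)  # hyperparameters are chosen from the data's characteristics at fit time
print(clf)  # shows the configuration selected by zero-shot AutoML
```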

website/docs/Use-Cases/Task-Oriented-AutoML.md

Lines changed: 67 additions & 18 deletions

@@ -51,6 +51,7 @@ If users provide the minimal inputs only, `AutoML` uses the default settings for
 The optimization metric is specified via the `metric` argument. It can be either a string which refers to a built-in metric, or a user-defined function.
 
 - Built-in metric.
+
   - 'accuracy': 1 - accuracy as the corresponding metric to minimize.
   - 'log_loss': default metric for multiclass classification.
   - 'r2': 1 - r2_score as the corresponding metric to minimize. Default metric for regression.
@@ -70,6 +71,40 @@ The optimization metric is specified via the `metric` argument. It can be either
   - 'ap': minimize 1 - average_precision_score.
   - 'ndcg': minimize 1 - ndcg_score.
   - 'ndcg@k': minimize 1 - ndcg_score@k. k is an integer.
+  - 'pr_auc': minimize 1 - precision-recall AUC score. (Spark-specific)
+  - 'var': minimize variance. (Spark-specific)
+
+- Built-in HuggingFace metrics (for NLP tasks).
+
+  - 'accuracy': minimize 1 - accuracy.
+  - 'bertscore': minimize 1 - BERTScore.
+  - 'bleu': minimize 1 - BLEU score.
+  - 'bleurt': minimize 1 - BLEURT score.
+  - 'cer': minimize character error rate.
+  - 'chrf': minimize ChrF score.
+  - 'code_eval': minimize 1 - code evaluation score.
+  - 'comet': minimize 1 - COMET score.
+  - 'competition_math': minimize 1 - competition math score.
+  - 'coval': minimize 1 - CoVal score.
+  - 'cuad': minimize 1 - CUAD score.
+  - 'f1': minimize 1 - F1 score.
+  - 'gleu': minimize 1 - GLEU score.
+  - 'google_bleu': minimize 1 - Google BLEU score.
+  - 'matthews_correlation': minimize 1 - Matthews correlation coefficient.
+  - 'meteor': minimize 1 - METEOR score.
+  - 'pearsonr': minimize 1 - Pearson correlation coefficient.
+  - 'precision': minimize 1 - precision.
+  - 'recall': minimize 1 - recall.
+  - 'rouge': minimize 1 - ROUGE score.
+  - 'rouge1': minimize 1 - ROUGE-1 score.
+  - 'rouge2': minimize 1 - ROUGE-2 score.
+  - 'sacrebleu': minimize 1 - SacreBLEU score.
+  - 'sari': minimize 1 - SARI score.
+  - 'seqeval': minimize 1 - SeqEval score.
+  - 'spearmanr': minimize 1 - Spearman correlation coefficient.
+  - 'ter': minimize translation error rate.
+  - 'wer': minimize word error rate.
+
 - User-defined function.
   A customized metric function that requires the following (input) signature, and returns the input config's value in terms of the metric you want to minimize, and a dictionary of auxiliary information at your choice:
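The user-defined option refers to the signature documented just below this hunk; for orientation, a sketch of such a metric (the argument names follow the FLAML documentation, while the train/validation blending and logged values are purely illustrative):

```python
def custom_metric(
    X_val, y_val, estimator, labels,
    X_train, y_train, weight_val=None, weight_train=None, *args
):
    from sklearn.metrics import log_loss

    # value to minimize: validation loss, lightly penalized by the
    # train/validation gap (an arbitrary illustrative choice)
    val_loss = log_loss(y_val, estimator.predict_proba(X_val), labels=labels)
    train_loss = log_loss(y_train, estimator.predict_proba(X_train), labels=labels)
    alpha = 0.5
    # first return value is minimized; the dict is logged as auxiliary info
    return val_loss * (1 + alpha) - alpha * train_loss, {
        "val_loss": val_loss,
        "train_loss": train_loss,
    }
```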

@@ -207,6 +242,7 @@ To tune a custom estimator that is not built-in, you need to:
 
 ```python
 from flaml.automl.model import SKLearnEstimator
+
 # SKLearnEstimator is derived from BaseEstimator
 import rgf
 
@@ -215,31 +251,44 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
     def __init__(self, task="binary", **config):
         super().__init__(task, **config)
 
-        if task in CLASSIFICATION:
-            from rgf.sklearn import RGFClassifier
+        if isinstance(task, str):
+            from flaml.automl.task.factory import task_factory
+
+            task = task_factory(task)
 
-            self.estimator_class = RGFClassifier
+        if task.is_classification():
+            from rgf.sklearn import RGFClassifier
+
+            self.estimator_class = RGFClassifier
         else:
-            from rgf.sklearn import RGFRegressor
+            from rgf.sklearn import RGFRegressor
 
-            self.estimator_class = RGFRegressor
+            self.estimator_class = RGFRegressor
 
     @classmethod
     def search_space(cls, data_size, task):
         space = {
-            "max_leaf": {
-                "domain": tune.lograndint(lower=4, upper=data_size),
-                "low_cost_init_value": 4,
-            },
-            "n_iter": {
-                "domain": tune.lograndint(lower=1, upper=data_size),
-                "low_cost_init_value": 1,
-            },
-            "learning_rate": {"domain": tune.loguniform(lower=0.01, upper=20.0)},
-            "min_samples_leaf": {
-                "domain": tune.lograndint(lower=1, upper=20),
-                "init_value": 20,
-            },
+            "max_leaf": {
+                "domain": tune.lograndint(lower=4, upper=data_size[0]),
+                "init_value": 4,
+            },
+            "n_iter": {
+                "domain": tune.lograndint(lower=1, upper=data_size[0]),
+                "init_value": 1,
+            },
+            "n_tree_search": {
+                "domain": tune.lograndint(lower=1, upper=32768),
+                "init_value": 1,
+            },
+            "opt_interval": {
+                "domain": tune.lograndint(lower=1, upper=10000),
+                "init_value": 100,
+            },
+            "learning_rate": {"domain": tune.loguniform(lower=0.01, upper=20.0)},
+            "min_samples_leaf": {
+                "domain": tune.lograndint(lower=1, upper=20),
+                "init_value": 20,
+            },
         }
         return space
 ```
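Two details of the revised example are worth noting: `data_size` is shape-like, so the search-space upper bounds now use `data_size[0]` (the row count), and a string `task` is normalized into a task object via `task_factory` before `is_classification()` is called. Once defined, the estimator is registered and searched in the usual way; a brief sketch (dataset variables and settings are illustrative):

```python
from flaml import AutoML

automl = AutoML()
# register the custom estimator under a name of your choice
automl.add_learner(learner_name="rgf", learner_class=MyRegularizedGreedyForest)
# restrict the search to the newly added learner
automl.fit(X_train, y_train, task="classification", estimator_list=["rgf"], time_budget=60)
```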
