Skip to content

Commit adc92ba

Browse files
authored
Auto-sklearn 2.0 (#893)
* add new meta-selection model * update PR * fix manifest * update the docs
1 parent 9a8ba56 commit adc92ba

18 files changed

+9426
-4
lines changed

MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@ recursive-include autosklearn/metalearning/files *.txt
44
include autosklearn/util/logging.yaml
55
recursive-include autosklearn *.pyx
66
include requirements.txt
7+
recursive-include autosklearn/experimental/askl2_portfolios *.json
8+
include autosklearn/experimental/askl2_training_data.json

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,28 @@ auto-sklearn is an automated machine learning toolkit and a drop-in replacement
44

55
Find the documentation [here](http://automl.github.io/auto-sklearn/)
66

7+
## Automated Machine Learning in four lines of code
8+
9+
```python
10+
import autosklearn.classification
11+
cls = autosklearn.classification.AutoSklearnClassifier()
12+
cls.fit(X_train, y_train)
13+
predictions = cls.predict(X_test)
14+
```
15+
16+
## Relevant publications
17+
18+
Efficient and Robust Automated Machine Learning
19+
Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
20+
Advances in Neural Information Processing Systems 28 (2015)
21+
http://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
22+
23+
Auto-Sklearn 2.0: The Next Generation
24+
Authors: Matthias Feurer, Katharina Eggensperger, Stefan Falkner, Marius Lindauer and Frank Hutter
25+
To appear
26+
27+
## Status
28+
729
Status for master branch
830

931
[![Build Status](https://travis-ci.org/automl/auto-sklearn.svg?branch=master)](https://travis-ci.org/automl/auto-sklearn)

autosklearn/experimental/__init__.py

Whitespace-only changes.

autosklearn/experimental/askl2.py

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
import json
2+
import multiprocessing
3+
import os
4+
import pickle
5+
from typing import Any, Dict, Optional, Union
6+
7+
from ConfigSpace import Configuration
8+
import numpy as np
9+
import pandas as pd
10+
11+
from autosklearn.classification import AutoSklearnClassifier
12+
import autosklearn.experimental.selector
13+
from autosklearn.metrics import Scorer
14+
15+
16+
# Shared integer counter. The SMAC callbacks defined in this module read it to
# decide whether they are the first one invoked, and increment it on every call.
CALLBACK_COUNTER = multiprocessing.Value('i', 0)


# All data files shipped with this module live next to it.
this_directory = os.path.abspath(os.path.dirname(__file__))
selector_file = os.path.join(this_directory, 'selector.pkl')
training_data_file = os.path.join(this_directory, 'askl2_training_data.json')
with open(training_data_file, encoding='utf-8') as fh:
    training_data = json.load(fh)
metafeatures = pd.DataFrame(training_data['metafeatures'])
y_values = np.array(training_data['y_values'])
strategies = training_data['strategies']
minima_for_methods = training_data['minima_for_methods']
maxima_for_methods = training_data['maxima_for_methods']
if not os.path.exists(selector_file):
    # No cached selector yet: fit one on the shipped training data and cache it
    # on disk so that ``AutoSklearn2Classifier.fit`` can load it.
    selector = autosklearn.experimental.selector.OneVSOneSelector(
        configuration=training_data['configuration'],
        default_strategy_idx=strategies.index('RF_SH-eta4-i_holdout_iterative_es_if'),
        rng=1,
    )
    selector.fit(
        X=metafeatures,
        y=y_values,
        methods=strategies,
        minima=minima_for_methods,
        maxima=maxima_for_methods,
    )
    # Write to a process-specific temporary file and rename it into place.
    # Dumping straight into ``selector_file`` would leave a truncated pickle
    # behind if the write is interrupted; the existence check above would then
    # succeed on every later import and the broken file would never be rebuilt.
    _selector_tmp_file = '%s.%d.tmp' % (selector_file, os.getpid())
    try:
        with open(_selector_tmp_file, 'wb') as fh:
            pickle.dump(selector, fh)
        os.replace(_selector_tmp_file, selector_file)
    finally:
        # Only still present if the dump or the rename failed.
        if os.path.exists(_selector_tmp_file):
            os.remove(_selector_tmp_file)
44+
45+
46+
def get_smac_object_callback(portfolio, lock):
    """Build a ``get_smac_object`` callback that seeds SMAC with a portfolio.

    Parameters
    ----------
    portfolio : dict
        Mapping whose values are hyperparameter-value dictionaries; each value
        is turned into a ``Configuration`` of the scenario's configuration space.
    lock : context manager
        Lock guarding the read-and-increment of the module-level
        ``CALLBACK_COUNTER``. It is used via ``with lock:``.

    Returns
    -------
    callable
        Function with the signature ``(scenario_dict, seed, ta, ta_kwargs,
        backend, metalearning_configurations)`` that returns a ``SMAC4AC``
        object.
    """
    def get_smac_object(
        scenario_dict,
        seed,
        ta,
        ta_kwargs,
        backend,
        metalearning_configurations,
    ):
        # ``metalearning_configurations`` is part of the callback signature but
        # is not used here: the portfolio replaces it.
        from smac.facade.smac_ac_facade import SMAC4AC
        from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
        from smac.scenario.scenario import Scenario

        scenario_dict['input_psmac_dirs'] = backend.get_smac_output_glob(
            smac_run_id=seed if not scenario_dict['shared-model'] else '*',
        )
        scenario = Scenario(scenario_dict)

        # Only the very first callback invocation (counter still at zero)
        # starts from the portfolio; every later one starts from a single
        # sampled configuration.
        with lock:
            if CALLBACK_COUNTER.value == 0:
                initial_configurations = [
                    Configuration(configuration_space=scenario.cs, values=member)
                    for member in portfolio.values()]
            else:
                initial_configurations = [scenario.cs.sample_configuration(size=1)]
            CALLBACK_COUNTER.value += 1

        return SMAC4AC(
            scenario=scenario,
            rng=seed,
            runhistory2epm=RunHistory2EPM4LogCost,
            tae_runner=ta,
            tae_runner_kwargs=ta_kwargs,
            initial_configurations=initial_configurations,
            run_id=seed,
        )
    return get_smac_object
89+
90+
91+
def get_sh_or_hb_object_callback(budget_type, bandit_strategy, eta, initial_budget, portfolio,
                                 lock):
    """Build a ``get_smac_object`` callback using successive halving or hyperband.

    Parameters
    ----------
    budget_type : str
        Stored in ``ta_kwargs['budget_type']`` before SMAC is constructed.
    bandit_strategy : str
        ``'sh'`` selects ``SuccessiveHalving``, ``'hb'`` selects ``Hyperband``.
    eta
        Forwarded to the intensifier as ``eta``.
    initial_budget
        Forwarded to the intensifier as ``initial_budget``.
    portfolio : dict
        Mapping whose values are hyperparameter-value dictionaries; each value
        is turned into a ``Configuration`` of the scenario's configuration space.
    lock : context manager
        Lock guarding the read-and-increment of the module-level
        ``CALLBACK_COUNTER``. It is used via ``with lock:``.

    Returns
    -------
    callable
        Function with the signature ``(scenario_dict, seed, ta, ta_kwargs,
        backend, metalearning_configurations)`` that returns a ``SMAC4AC``
        object.

    Raises
    ------
    ValueError
        Raised by the returned callable when ``bandit_strategy`` is neither
        ``'sh'`` nor ``'hb'``.
    """
    def get_smac_object(
        scenario_dict,
        seed,
        ta,
        ta_kwargs,
        backend,
        metalearning_configurations,
    ):
        # Reject an unknown strategy before any side effect. Otherwise the
        # shared counter would already have been incremented, and a failing
        # call would use up the one invocation that receives the portfolio.
        if bandit_strategy not in ('sh', 'hb'):
            raise ValueError(bandit_strategy)

        from smac.facade.smac_ac_facade import SMAC4AC
        from smac.intensification.successive_halving import SuccessiveHalving
        from smac.intensification.hyperband import Hyperband
        from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
        from smac.scenario.scenario import Scenario

        scenario_dict['input_psmac_dirs'] = backend.get_smac_output_glob(
            smac_run_id=seed if not scenario_dict['shared-model'] else '*',
        )
        scenario = Scenario(scenario_dict)

        # Only the very first callback invocation (counter still at zero)
        # starts from the portfolio; every later one starts from a single
        # sampled configuration.
        with lock:
            if CALLBACK_COUNTER.value == 0:
                initial_configurations = [
                    Configuration(configuration_space=scenario.cs, values=member)
                    for member in portfolio.values()]
            else:
                initial_configurations = [scenario.cs.sample_configuration(size=1)]
            CALLBACK_COUNTER.value += 1

        # Note: this writes into the dictionary passed in by the caller.
        ta_kwargs['budget_type'] = budget_type

        bandit = SuccessiveHalving if bandit_strategy == 'sh' else Hyperband

        smac4ac = SMAC4AC(
            scenario=scenario,
            rng=seed,
            runhistory2epm=RunHistory2EPM4LogCost,
            tae_runner=ta,
            tae_runner_kwargs=ta_kwargs,
            initial_configurations=initial_configurations,
            run_id=seed,
            intensifier=bandit,
            intensifier_kwargs={
                'initial_budget': initial_budget,
                'max_budget': 100,
                'eta': eta,
                'min_chall': 1},
        )
        # Set the model's minimum sample count to half the number of
        # hyperparameters (truncated to an int).
        smac4ac.solver.epm_chooser.min_samples_model = int(
            len(scenario.cs.get_hyperparameters()) / 2
        )
        return smac4ac
    return get_smac_object
156+
157+
158+
class AutoSklearn2Classifier(AutoSklearnClassifier):
    """``AutoSklearnClassifier`` whose search strategy is picked per dataset.

    The constructor fixes the estimator and preprocessor choices and disables
    meta-learning initial configurations. ``fit`` then uses the pickled
    selector to choose one of the strategies in the module-level ``strategies``
    list, and configures the resampling strategy, the portfolio of initial
    configurations and the SMAC callback accordingly.
    """

    def __init__(
        self,
        time_left_for_this_task: int = 3600,
        ensemble_size: int = 50,
        ensemble_nbest: Union[float, int] = 50,
        max_models_on_disc: int = 50,
        ensemble_memory_limit: int = 1024,
        seed: int = 1,
        ml_memory_limit: int = 3072,
        tmp_folder: Optional[str] = None,
        output_folder: Optional[str] = None,
        delete_tmp_folder_after_terminate: bool = True,
        delete_output_folder_after_terminate: bool = True,
        shared_mode: bool = False,
        n_jobs: Optional[int] = None,
        disable_evaluator_output: bool = False,
        smac_scenario_args: Optional[Dict[str, Any]] = None,
        logging_config: Optional[Dict[str, Any]] = None,
        metric: Optional[Scorer] = None,
    ):
        """Forward the user-facing options to ``AutoSklearnClassifier``.

        Estimators, preprocessors, resampling strategy, SMAC callback and
        metadata directory are not exposed; they are fixed here or set later
        by ``fit``.
        """

        include_estimators = [
            'extra_trees', 'passive_aggressive', 'random_forest', 'sgd', 'gradient_boosting',
        ]
        include_preprocessors = ["no_preprocessing"]
        super().__init__(
            time_left_for_this_task=time_left_for_this_task,
            initial_configurations_via_metalearning=0,
            ensemble_size=ensemble_size,
            ensemble_nbest=ensemble_nbest,
            max_models_on_disc=max_models_on_disc,
            ensemble_memory_limit=ensemble_memory_limit,
            seed=seed,
            ml_memory_limit=ml_memory_limit,
            include_estimators=include_estimators,
            exclude_estimators=None,
            include_preprocessors=include_preprocessors,
            exclude_preprocessors=None,
            resampling_strategy=None,
            resampling_strategy_arguments=None,
            tmp_folder=tmp_folder,
            output_folder=output_folder,
            delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
            delete_output_folder_after_terminate=delete_output_folder_after_terminate,
            shared_mode=shared_mode,
            n_jobs=n_jobs,
            disable_evaluator_output=disable_evaluator_output,
            get_smac_object_callback=None,
            smac_scenario_args=smac_scenario_args,
            logging_config=logging_config,
            metadata_directory=None,
            metric=metric,
        )

    def fit(self, X, y,
            X_test=None,
            y_test=None,
            metric=None,
            feat_type=None,
            dataset_name=None):
        """Select a strategy for the dataset, configure it, and fit.

        Parameters
        ----------
        X, y
            Training data. ``X.shape`` and the number of unique values in ``y``
            are used as the selector's input.
        X_test, y_test, feat_type, dataset_name
            Forwarded unchanged to the parent ``fit``.
        metric
            Ignored by this method; a ``UserWarning`` is emitted when it is not
            ``None``. Pass the metric to the constructor instead.

        Returns
        -------
        Whatever the parent ``fit`` returns.
        """
        if metric is not None:
            # The argument is accepted but never forwarded; tell the caller
            # instead of silently optimising a different metric.
            import warnings
            warnings.warn(
                "The 'metric' argument of AutoSklearn2Classifier.fit() is ignored; "
                "pass the metric to the constructor instead.",
                UserWarning,
            )

        # The selector file is produced by this module itself at import time.
        with open(selector_file, 'rb') as fh:
            selector = pickle.load(fh)

        # Selector input: number of classes, number of columns, number of rows.
        metafeatures = np.array([len(np.unique(y)), X.shape[1], X.shape[0]])
        selection = np.argmax(selector.predict(metafeatures))
        automl_policy = strategies[selection]

        setting = {
            'RF_None_holdout_iterative_es_if': {
                'resampling_strategy': 'holdout-iterative-fit',
                'fidelity': None,
            },
            'RF_None_3CV_iterative_es_if': {
                'resampling_strategy': 'cv-iterative-fit',
                'folds': 3,
                'fidelity': None,
            },
            'RF_None_5CV_iterative_es_if': {
                'resampling_strategy': 'cv-iterative-fit',
                'folds': 5,
                'fidelity': None,
            },
            'RF_None_10CV_iterative_es_if': {
                'resampling_strategy': 'cv-iterative-fit',
                'folds': 10,
                'fidelity': None,
            },
            'RF_SH-eta4-i_holdout_iterative_es_if': {
                'resampling_strategy': 'holdout-iterative-fit',
                'fidelity': 'SH',
            },
            'RF_SH-eta4-i_3CV_iterative_es_if': {
                'resampling_strategy': 'cv-iterative-fit',
                'folds': 3,
                'fidelity': 'SH',
            },
            'RF_SH-eta4-i_5CV_iterative_es_if': {
                'resampling_strategy': 'cv-iterative-fit',
                'folds': 5,
                'fidelity': 'SH',
            },
            'RF_SH-eta4-i_10CV_iterative_es_if': {
                'resampling_strategy': 'cv-iterative-fit',
                'folds': 10,
                'fidelity': 'SH',
            }
        }[automl_policy]

        resampling_strategy = setting['resampling_strategy']
        if resampling_strategy == 'cv-iterative-fit':
            resampling_strategy_kwargs = {'folds': setting['folds']}
        else:
            resampling_strategy_kwargs = None

        # Each strategy ships its own portfolio of initial configurations.
        portfolio_file = os.path.join(this_directory, 'askl2_portfolios', '%s.json' % automl_policy)
        with open(portfolio_file, encoding='utf-8') as fh:
            portfolio_json = json.load(fh)
        portfolio = portfolio_json['portfolio']

        # NOTE(review): the callbacks consult the module-level CALLBACK_COUNTER,
        # which is not reset anywhere in this module. A second fit() in the
        # same process would therefore presumably not start from the portfolio
        # -- confirm whether that is intended.
        lock = multiprocessing.Lock()
        if setting['fidelity'] == 'SH':
            smac_callback = get_sh_or_hb_object_callback('iterations', 'sh', 4, 5.0, portfolio,
                                                         lock)
        else:
            smac_callback = get_smac_object_callback(portfolio, lock)

        # These are set as attributes here, before delegating to the parent fit.
        self.resampling_strategy = resampling_strategy
        self.resampling_strategy_arguments = resampling_strategy_kwargs
        self.get_smac_object_callback = smac_callback
        return super().fit(
            X=X,
            y=y,
            X_test=X_test,
            y_test=y_test,
            feat_type=feat_type,
            dataset_name=dataset_name,
        )

0 commit comments

Comments
 (0)