Skip to content

Commit 7a3f3a5

Browse files
authored
Merge pull request #964 from automl/development
Development
2 parents 70d1a30 + bf9a741 commit 7a3f3a5

File tree

601 files changed

+63708
-57169
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

601 files changed

+63708
-57169
lines changed

.travis.yml

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -26,17 +26,12 @@ matrix:
2626
env: DISTRIB="conda" RUN_FLAKE8="true" SKIP_TESTS="true"
2727
- os: linux
2828
env: DISTRIB="conda" RUN_MYPY="true" SKIP_TESTS="true"
29-
- os: linux
30-
env: DISTRIB="conda" PYTHON="3.5"
3129
- os: linux
3230
env: DISTRIB="conda" COVERAGE="true" PYTHON="3.6"
3331
- os: linux
3432
env: DISTRIB="conda" TEST_DIST="true" PYTHON="3.7"
3533
- os: linux
3634
env: DISTRIB="conda" PYTHON="3.8"
37-
- os: linux
38-
python: 3.5
39-
env: DISTRIB="ubuntu"
4035
- os: linux
4136
python: 3.6
4237
env: DISTRIB="ubuntu"

Dockerfile

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,6 @@ FROM ubuntu:18.04
22

33
WORKDIR /auto-sklearn
44

5-
# Copy the checkout autosklearn version for installation
6-
ADD . /auto-sklearn/
7-
85
# install linux packages
96
RUN apt-get update
107

@@ -17,19 +14,29 @@ ENV LANG en_US.UTF-8
1714
ENV LANGUAGE en_US:en
1815
ENV LC_ALL en_US.UTF-8
1916

17+
# set environment variables to only use one core (ENV persists into later layers and the container; `RUN export` is lost when its step ends)
18+
ENV OPENBLAS_NUM_THREADS=1
19+
ENV MKL_NUM_THREADS=1
20+
ENV BLAS_NUM_THREADS=1
21+
ENV OMP_NUM_THREADS=1
22+
23+
# install build requirements
2024
RUN apt install -y python3-dev python3-pip
2125
RUN pip3 install --upgrade setuptools
22-
RUN apt-get install -y build-essential curl
26+
RUN apt install -y build-essential
2327

2428
# https://github.com/automl/auto-sklearn/issues/314
25-
RUN apt-get install -y swig3.0
29+
RUN apt install -y swig3.0
2630
RUN ln -s /usr/bin/swig3.0 /usr/bin/swig
2731

32+
# Copy the checked-out auto-sklearn version for installation
33+
ADD . /auto-sklearn/
34+
2835
# Upgrade pip then install dependencies
2936
RUN pip3 install --upgrade pip
3037
RUN pip3 install pytest==4.6.* pep8 codecov pytest-cov flake8 flaky openml
31-
RUN curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip3 install
38+
RUN cat /auto-sklearn/requirements.txt | xargs -n 1 -L 1 pip3 install
3239
RUN pip3 install jupyter
3340

3441
# Install
35-
RUN pip3 install -e /auto-sklearn/
42+
RUN pip3 install /auto-sklearn/

autosklearn/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
"""Version information."""
22

33
# The following line *must* be the last in the module, exactly as formatted:
4-
__version__ = "0.9.0"
4+
__version__ = "0.10.0"

autosklearn/automl.py

Lines changed: 26 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
import warnings
1111

1212
from ConfigSpace.read_and_write import json as cs_json
13+
import dask.distributed
1314
import numpy as np
1415
import numpy.ma as ma
1516
import pandas as pd
@@ -18,7 +19,7 @@
1819
from sklearn.base import BaseEstimator
1920
from sklearn.model_selection._split import _RepeatedSplits, \
2021
BaseShuffleSplit, BaseCrossValidator
21-
from smac.tae.execute_ta_run import StatusType
22+
from smac.tae import StatusType
2223
from smac.stats.stats import Stats
2324
import joblib
2425
import sklearn.utils
@@ -110,7 +111,8 @@ def __init__(self,
110111
exclude_preprocessors=None,
111112
resampling_strategy='holdout-iterative-fit',
112113
resampling_strategy_arguments=None,
113-
shared_mode=False,
114+
n_jobs=None,
115+
dask_client: Optional[dask.distributed.Client] = None,
114116
precision=32,
115117
disable_evaluator_output=False,
116118
get_smac_object_callback=None,
@@ -167,7 +169,8 @@ def __init__(self,
167169
]\
168170
and 'folds' not in self._resampling_strategy_arguments:
169171
self._resampling_strategy_arguments['folds'] = 5
170-
self._shared_mode = shared_mode
172+
self._n_jobs = n_jobs
173+
self._dask_client = dask_client
171174
self.precision = precision
172175
self._disable_evaluator_output = disable_evaluator_output
173176
# Check arguments prior to doing anything!
@@ -287,8 +290,6 @@ def _do_dummy_prediction(self, datamanager, num_run):
287290
raise ValueError("Dummy prediction failed with run state %s and additional output: %s."
288291
% (str(status), str(additional_info)))
289292

290-
return ta.num_run
291-
292293
def fit(
293294
self,
294295
X: np.ndarray,
@@ -325,14 +326,6 @@ def fit(
325326
if not isinstance(self._metric, Scorer):
326327
raise ValueError('Metric must be instance of '
327328
'autosklearn.metrics.Scorer.')
328-
if self._shared_mode:
329-
# If this fails, it's likely that this is the first call to get
330-
# the data manager
331-
try:
332-
D = self._backend.load_datamanager()
333-
dataset_name = D.name
334-
except IOError:
335-
pass
336329

337330
if dataset_name is None:
338331
dataset_name = hash_array_or_matrix(X)
@@ -408,7 +401,7 @@ def fit(
408401
)
409402
self._logger.debug(' ensemble_size: %d', self._ensemble_size)
410403
self._logger.debug(' ensemble_nbest: %f', self._ensemble_nbest)
411-
self._logger.debug(' max_models_on_disc: %d', self._max_models_on_disc)
404+
self._logger.debug(' max_models_on_disc: %s', str(self._max_models_on_disc))
412405
self._logger.debug(' ensemble_memory_limit: %d', self._ensemble_memory_limit)
413406
self._logger.debug(' seed: %d', self._seed)
414407
self._logger.debug(' ml_memory_limit: %d', self._ml_memory_limit)
@@ -421,7 +414,8 @@ def fit(
421414
self._logger.debug(' resampling_strategy: %s', str(self._resampling_strategy))
422415
self._logger.debug(' resampling_strategy_arguments: %s',
423416
str(self._resampling_strategy_arguments))
424-
self._logger.debug(' shared_mode: %s', str(self._shared_mode))
417+
self._logger.debug(' n_jobs: %s', str(self._n_jobs))
418+
self._logger.debug(' dask_client: %s', str(self._dask_client))
425419
self._logger.debug(' precision: %s', str(self.precision))
426420
self._logger.debug(' disable_evaluator_output: %s', str(self._disable_evaluator_output))
427421
self._logger.debug(' get_smac_objective_callback: %s', str(self._get_smac_object_callback))
@@ -454,13 +448,11 @@ def fit(
454448
try:
455449
os.makedirs(self._backend.get_model_dir())
456450
except (OSError, FileExistsError):
457-
if not self._shared_mode:
458-
raise
451+
raise
459452
try:
460453
os.makedirs(self._backend.get_cv_model_dir())
461454
except (OSError, FileExistsError):
462-
if not self._shared_mode:
463-
raise
455+
raise
464456

465457
self._task = datamanager.info['task']
466458
self._label_num = datamanager.info['label_num']
@@ -479,8 +471,7 @@ def fit(
479471

480472
# == Perform dummy predictions
481473
num_run = 1
482-
# if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
483-
num_run = self._do_dummy_prediction(datamanager, num_run)
474+
self._do_dummy_prediction(datamanager, num_run)
484475

485476
# = Create a searchspace
486477
# Do this before One Hot Encoding to make sure that it creates a
@@ -592,6 +583,8 @@ def fit(
592583
memory_limit=self._ml_memory_limit,
593584
data_memory_limit=self._data_memory_limit,
594585
watcher=self._stopwatch,
586+
n_jobs=self._n_jobs,
587+
dask_client=self._dask_client,
595588
start_num_run=num_run,
596589
num_metalearning_cfgs=self._initial_configurations_via_metalearning,
597590
config_file=configspace_path,
@@ -600,7 +593,6 @@ def fit(
600593
metric=self._metric,
601594
resampling_strategy=self._resampling_strategy,
602595
resampling_strategy_args=self._resampling_strategy_arguments,
603-
shared_mode=self._shared_mode,
604596
include_estimators=self._include_estimators,
605597
exclude_estimators=self._exclude_estimators,
606598
include_preprocessors=self._include_preprocessors,
@@ -832,7 +824,6 @@ def _get_ensemble_process(self, time_left_for_ensembles,
832824
ensemble_nbest=ensemble_nbest,
833825
max_models_on_disc=self._max_models_on_disc,
834826
seed=self._seed,
835-
shared_mode=self._shared_mode,
836827
precision=precision,
837828
max_iterations=max_iterations,
838829
read_at_most=np.inf,
@@ -842,12 +833,7 @@ def _get_ensemble_process(self, time_left_for_ensembles,
842833
)
843834

844835
def _load_models(self):
845-
if self._shared_mode:
846-
seed = -1
847-
else:
848-
seed = self._seed
849-
850-
self.ensemble_ = self._backend.load_ensemble(seed)
836+
self.ensemble_ = self._backend.load_ensemble(self._seed)
851837

852838
# If no ensemble is loaded, try to get the best performing model
853839
if not self.ensemble_:
@@ -874,7 +860,7 @@ def _load_models(self):
874860
elif self._disable_evaluator_output is False or \
875861
(isinstance(self._disable_evaluator_output, list) and
876862
'model' not in self._disable_evaluator_output):
877-
model_names = self._backend.list_all_models(seed)
863+
model_names = self._backend.list_all_models(self._seed)
878864

879865
if len(model_names) == 0 and self._resampling_strategy not in \
880866
['partial-cv', 'partial-cv-iterative-fit']:
@@ -985,12 +971,6 @@ def cv_results_(self):
985971
config_id = run_key.config_id
986972
config = self.runhistory_.ids_config[config_id]
987973

988-
param_dict = config.get_dictionary()
989-
params.append(param_dict)
990-
mean_test_score.append(self._metric._optimum - (self._metric._sign * run_value.cost))
991-
mean_fit_time.append(run_value.time)
992-
budgets.append(run_key.budget)
993-
994974
s = run_value.status
995975
if s == StatusType.SUCCESS:
996976
status.append('Success')
@@ -1004,9 +984,19 @@ def cv_results_(self):
1004984
status.append('Abort')
1005985
elif s == StatusType.MEMOUT:
1006986
status.append('Memout')
987+
elif s == StatusType.RUNNING:
988+
continue
989+
elif s == StatusType.BUDGETEXHAUSTED:
990+
continue
1007991
else:
1008992
raise NotImplementedError(s)
1009993

994+
param_dict = config.get_dictionary()
995+
params.append(param_dict)
996+
mean_test_score.append(self._metric._optimum - (self._metric._sign * run_value.cost))
997+
mean_fit_time.append(run_value.time)
998+
budgets.append(run_key.budget)
999+
10101000
for hp_name in hp_names:
10111001
if hp_name in param_dict:
10121002
hp_value = param_dict[hp_name]

autosklearn/ensemble_builder.py

Lines changed: 30 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,8 @@
2727
Y_VALID = 1
2828
Y_TEST = 2
2929

30+
MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]{1,3}\.[0-9]*)\.npy'
31+
3032

3133
class EnsembleBuilder(multiprocessing.Process):
3234
def __init__(
@@ -41,7 +43,6 @@ def __init__(
4143
max_models_on_disc: int = 100,
4244
performance_range_threshold: float = 0,
4345
seed: int = 1,
44-
shared_mode: bool = False,
4546
max_iterations: int = None,
4647
precision: int = 32,
4748
sleep_duration: int = 2,
@@ -90,9 +91,6 @@ def __init__(
9091
and max_models_on_disc. Might return less
9192
seed: int
9293
random seed
93-
if set to -1, read files with any seed (e.g., for shared model mode)
94-
shared_model: bool
95-
auto-sklearn used shared model mode (aka pSMAC)
9694
max_iterations: int
9795
maximal number of iterations to run this script
9896
(default None --> deactivated)
@@ -113,6 +111,9 @@ def __init__(
113111
self.task_type = task_type
114112
self.metric = metric
115113
self.time_limit = limit # time limit
114+
# define time_left here so that it is defined in case the ensemble builder is called
115+
# without starting a separate process
116+
self.time_left = limit
116117
self.ensemble_size = ensemble_size
117118
self.performance_range_threshold = performance_range_threshold
118119

@@ -139,7 +140,6 @@ def __init__(
139140
self.max_resident_models = None
140141

141142
self.seed = seed
142-
self.shared_mode = shared_mode # pSMAC?
143143
self.max_iterations = max_iterations
144144
self.precision = precision
145145
self.sleep_duration = sleep_duration
@@ -178,7 +178,7 @@ def __init__(
178178
(ensemble_nbest, type(ensemble_nbest)))
179179

180180
self.start_time = 0
181-
self.model_fn_re = re.compile(r'_([0-9]*)_([0-9]*)_([0-9]{1,3}\.[0-9]*)\.npy')
181+
self.model_fn_re = re.compile(MODEL_FN_RE)
182182

183183
# already read prediction files
184184
# {"file name": {
@@ -230,8 +230,11 @@ def __init__(
230230

231231
def run(self):
232232
buffer_time = 5 # TODO: Buffer time should also be used in main!?
233+
process_start_time = time.time()
233234
while True:
234-
time_left = self.time_limit - buffer_time
235+
time_elapsed = time.time() - process_start_time
236+
time_left = self.time_limit - buffer_time - time_elapsed
237+
self.time_left = time_left
235238
safe_ensemble_script = pynisher.enforce_limits(
236239
wall_time_in_s=int(time_left),
237240
mem_in_mb=self.memory_limit,
@@ -286,7 +289,7 @@ def main(self, return_pred=False):
286289
self.logger.debug(
287290
'Starting iteration %d, time left: %f',
288291
iteration,
289-
self.time_limit - used_time,
292+
self.time_left - used_time,
290293
)
291294

292295
# populates self.read_preds
@@ -395,7 +398,9 @@ def get_disk_consumption(self, pred_path):
395398
pred_test_name = 'predictions_test' + _full_name
396399
pred_test_path = os.path.join(self.dir_test, pred_test_name)
397400

398-
paths = [model_path, pred_path]
401+
paths = [pred_path]
402+
if os.path.exists(model_path):
403+
paths.append(model_path)
399404
if os.path.exists(pred_valid_path):
400405
paths.append(pred_valid_path)
401406
if os.path.exists(pred_test_path):
@@ -428,17 +433,10 @@ def score_ensemble_preds(self):
428433
self.logger.debug("No ensemble dataset prediction directory found")
429434
return False
430435

431-
if self.shared_mode is False:
432-
pred_path = os.path.join(
433-
glob.escape(self.dir_ensemble),
434-
'predictions_ensemble_%s_*_*.npy*' % self.seed,
435-
)
436-
# pSMAC
437-
else:
438-
pred_path = os.path.join(
439-
glob.escape(self.dir_ensemble),
440-
'predictions_ensemble_*_*_*.npy*',
441-
)
436+
pred_path = os.path.join(
437+
glob.escape(self.dir_ensemble),
438+
'predictions_ensemble_%s_*_*.npy*' % self.seed,
439+
)
442440

443441
y_ens_files = glob.glob(pred_path)
444442
y_ens_files = [y_ens_file for y_ens_file in y_ens_files
@@ -450,14 +448,22 @@ def score_ensemble_preds(self):
450448
" %s" % pred_path)
451449
return False
452450

451+
done_path = os.path.join(
452+
glob.escape(self.backend.get_done_directory()), '%s_*' % self.seed
453+
)
454+
done = glob.glob(done_path)
455+
done = [os.path.split(d)[1] for d in done]
456+
453457
# First sort files chronologically
454458
to_read = []
455459
for y_ens_fn in self.y_ens_files:
456460
match = self.model_fn_re.search(y_ens_fn)
457461
_seed = int(match.group(1))
458462
_num_run = int(match.group(2))
459463
_budget = float(match.group(3))
460-
to_read.append([y_ens_fn, match, _seed, _num_run, _budget])
464+
465+
if '%s_%s' % (_seed, _num_run) in done:
466+
to_read.append([y_ens_fn, match, _seed, _num_run, _budget])
461467

462468
n_read_files = 0
463469
# Now read files with respect to num_run
@@ -1074,7 +1080,9 @@ def _delete_excess_models(self):
10741080
pred_test_name = 'predictions_test' + _full_name
10751081
pred_test_path = os.path.join(self.dir_test, pred_test_name)
10761082

1077-
paths = [model_path, pred_path]
1083+
paths = [pred_path]
1084+
if os.path.exists(model_path):
1085+
paths.append(model_path)
10781086
if os.path.exists(pred_valid_path):
10791087
paths.append(pred_valid_path)
10801088
if os.path.exists(pred_test_path):

0 commit comments

Comments
 (0)