automl
diff --git a/.travis.yml b/.travis.yml
Lines changed: 0 additions & 5 deletions
diff --git a/Dockerfile b/Dockerfile
Lines changed: 14 additions & 7 deletions
diff --git a/autosklearn/__version__.py b/autosklearn/__version__.py
Lines changed: 1 addition & 1 deletion
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
Lines changed: 26 additions & 36 deletions
diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py
Lines changed: 30 additions & 22 deletions
@@ -26,17 +26,12 @@ matrix:
     env: DISTRIB="conda" RUN_FLAKE8="true" SKIP_TESTS="true"
   - os: linux
     env: DISTRIB="conda" RUN_MYPY="true" SKIP_TESTS="true"
-  - os: linux
-    env: DISTRIB="conda" PYTHON="3.5"
   - os: linux
     env: DISTRIB="conda" COVERAGE="true" PYTHON="3.6"
   - os: linux
     env: DISTRIB="conda" TEST_DIST="true" PYTHON="3.7"
   - os: linux
     env: DISTRIB="conda" PYTHON="3.8"
-  - os: linux
-    python: 3.5
-    env: DISTRIB="ubuntu"
   - os: linux
     python: 3.6
     env: DISTRIB="ubuntu"
 
@@ -2,9 +2,6 @@ FROM ubuntu:18.04
 
 WORKDIR /auto-sklearn
 
-# Copy the checkout autosklearn version for installation
-ADD . /auto-sklearn/
-
 # install linux packages
 RUN apt-get update
 
@@ -17,19 +14,29 @@ ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US:en
 ENV LC_ALL en_US.UTF-8
 
+# Use only one core: ENV persists into later layers and the container (`RUN export` is lost)
+ENV OPENBLAS_NUM_THREADS=1
+ENV MKL_NUM_THREADS=1
+ENV BLAS_NUM_THREADS=1
+ENV OMP_NUM_THREADS=1
+
+# install build requirements
 RUN apt install -y python3-dev python3-pip
 RUN pip3 install --upgrade setuptools
-RUN apt-get install -y build-essential curl 
+RUN apt install -y build-essential
 
 # https://github.com/automl/auto-sklearn/issues/314
-RUN apt-get install -y swig3.0
+RUN apt install -y swig3.0
 RUN ln -s /usr/bin/swig3.0 /usr/bin/swig
 
+# Copy the checked-out autosklearn version for installation
+ADD . /auto-sklearn/
+
 # Upgrade pip then install dependencies
 RUN pip3 install --upgrade pip
 RUN pip3 install pytest==4.6.* pep8 codecov pytest-cov flake8 flaky openml
-RUN curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip3 install
+RUN cat /auto-sklearn/requirements.txt | xargs -n 1 -L 1 pip3 install
 RUN pip3 install jupyter
 
 # Install
-RUN pip3 install -e /auto-sklearn/
+RUN pip3 install /auto-sklearn/
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.9.0"
+__version__ = "0.10.0"
@@ -10,6 +10,7 @@
 import warnings
 
 from ConfigSpace.read_and_write import json as cs_json
+import dask.distributed
 import numpy as np
 import numpy.ma as ma
 import pandas as pd
@@ -18,7 +19,7 @@
 from sklearn.base import BaseEstimator
 from sklearn.model_selection._split import _RepeatedSplits, \
     BaseShuffleSplit, BaseCrossValidator
-from smac.tae.execute_ta_run import StatusType
+from smac.tae import StatusType
 from smac.stats.stats import Stats
 import joblib
 import sklearn.utils
@@ -110,7 +111,8 @@ def __init__(self,
                  exclude_preprocessors=None,
                  resampling_strategy='holdout-iterative-fit',
                  resampling_strategy_arguments=None,
-                 shared_mode=False,
+                 n_jobs=None,
+                 dask_client: Optional[dask.distributed.Client] = None,
                  precision=32,
                  disable_evaluator_output=False,
                  get_smac_object_callback=None,
@@ -167,7 +169,8 @@ def __init__(self,
                                          ]\
            and 'folds' not in self._resampling_strategy_arguments:
             self._resampling_strategy_arguments['folds'] = 5
-        self._shared_mode = shared_mode
+        self._n_jobs = n_jobs
+        self._dask_client = dask_client
         self.precision = precision
         self._disable_evaluator_output = disable_evaluator_output
         # Check arguments prior to doing anything!
@@ -287,8 +290,6 @@ def _do_dummy_prediction(self, datamanager, num_run):
             raise ValueError("Dummy prediction failed with run state %s and additional output: %s."
                              % (str(status), str(additional_info)))
 
-        return ta.num_run
-
     def fit(
         self,
         X: np.ndarray,
@@ -325,14 +326,6 @@ def fit(
         if not isinstance(self._metric, Scorer):
             raise ValueError('Metric must be instance of '
                              'autosklearn.metrics.Scorer.')
-        if self._shared_mode:
-            # If this fails, it's likely that this is the first call to get
-            # the data manager
-            try:
-                D = self._backend.load_datamanager()
-                dataset_name = D.name
-            except IOError:
-                pass
 
         if dataset_name is None:
             dataset_name = hash_array_or_matrix(X)
@@ -408,7 +401,7 @@ def fit(
         )
         self._logger.debug('  ensemble_size: %d', self._ensemble_size)
         self._logger.debug('  ensemble_nbest: %f', self._ensemble_nbest)
-        self._logger.debug('  max_models_on_disc: %d', self._max_models_on_disc)
+        self._logger.debug('  max_models_on_disc: %s', str(self._max_models_on_disc))
         self._logger.debug('  ensemble_memory_limit: %d', self._ensemble_memory_limit)
         self._logger.debug('  seed: %d', self._seed)
         self._logger.debug('  ml_memory_limit: %d', self._ml_memory_limit)
@@ -421,7 +414,8 @@ def fit(
         self._logger.debug('  resampling_strategy: %s', str(self._resampling_strategy))
         self._logger.debug('  resampling_strategy_arguments: %s',
                            str(self._resampling_strategy_arguments))
-        self._logger.debug('  shared_mode: %s', str(self._shared_mode))
+        self._logger.debug('  n_jobs: %s', str(self._n_jobs))
+        self._logger.debug('  dask_client: %s', str(self._dask_client))
         self._logger.debug('  precision: %s', str(self.precision))
         self._logger.debug('  disable_evaluator_output: %s', str(self._disable_evaluator_output))
         self._logger.debug('  get_smac_objective_callback: %s', str(self._get_smac_object_callback))
@@ -454,13 +448,11 @@ def fit(
         try:
             os.makedirs(self._backend.get_model_dir())
         except (OSError, FileExistsError):
-            if not self._shared_mode:
-                raise
+            raise
         try:
             os.makedirs(self._backend.get_cv_model_dir())
         except (OSError, FileExistsError):
-            if not self._shared_mode:
-                raise
+            raise
 
         self._task = datamanager.info['task']
         self._label_num = datamanager.info['label_num']
@@ -479,8 +471,7 @@ def fit(
 
         # == Perform dummy predictions
         num_run = 1
-        # if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
-        num_run = self._do_dummy_prediction(datamanager, num_run)
+        self._do_dummy_prediction(datamanager, num_run)
 
         # = Create a searchspace
         # Do this before One Hot Encoding to make sure that it creates a
@@ -592,6 +583,8 @@ def fit(
                 memory_limit=self._ml_memory_limit,
                 data_memory_limit=self._data_memory_limit,
                 watcher=self._stopwatch,
+                n_jobs=self._n_jobs,
+                dask_client=self._dask_client,
                 start_num_run=num_run,
                 num_metalearning_cfgs=self._initial_configurations_via_metalearning,
                 config_file=configspace_path,
@@ -600,7 +593,6 @@ def fit(
                 metric=self._metric,
                 resampling_strategy=self._resampling_strategy,
                 resampling_strategy_args=self._resampling_strategy_arguments,
-                shared_mode=self._shared_mode,
                 include_estimators=self._include_estimators,
                 exclude_estimators=self._exclude_estimators,
                 include_preprocessors=self._include_preprocessors,
@@ -832,7 +824,6 @@ def _get_ensemble_process(self, time_left_for_ensembles,
             ensemble_nbest=ensemble_nbest,
             max_models_on_disc=self._max_models_on_disc,
             seed=self._seed,
-            shared_mode=self._shared_mode,
             precision=precision,
             max_iterations=max_iterations,
             read_at_most=np.inf,
@@ -842,12 +833,7 @@ def _get_ensemble_process(self, time_left_for_ensembles,
         )
 
     def _load_models(self):
-        if self._shared_mode:
-            seed = -1
-        else:
-            seed = self._seed
-
-        self.ensemble_ = self._backend.load_ensemble(seed)
+        self.ensemble_ = self._backend.load_ensemble(self._seed)
 
         # If no ensemble is loaded, try to get the best performing model
         if not self.ensemble_:
@@ -874,7 +860,7 @@ def _load_models(self):
         elif self._disable_evaluator_output is False or \
                 (isinstance(self._disable_evaluator_output, list) and
                  'model' not in self._disable_evaluator_output):
-            model_names = self._backend.list_all_models(seed)
+            model_names = self._backend.list_all_models(self._seed)
 
             if len(model_names) == 0 and self._resampling_strategy not in \
                     ['partial-cv', 'partial-cv-iterative-fit']:
@@ -985,12 +971,6 @@ def cv_results_(self):
             config_id = run_key.config_id
             config = self.runhistory_.ids_config[config_id]
 
-            param_dict = config.get_dictionary()
-            params.append(param_dict)
-            mean_test_score.append(self._metric._optimum - (self._metric._sign * run_value.cost))
-            mean_fit_time.append(run_value.time)
-            budgets.append(run_key.budget)
-
             s = run_value.status
             if s == StatusType.SUCCESS:
                 status.append('Success')
@@ -1004,9 +984,19 @@ def cv_results_(self):
                 status.append('Abort')
             elif s == StatusType.MEMOUT:
                 status.append('Memout')
+            elif s == StatusType.RUNNING:
+                continue
+            elif s == StatusType.BUDGETEXHAUSTED:
+                continue
             else:
                 raise NotImplementedError(s)
 
+            param_dict = config.get_dictionary()
+            params.append(param_dict)
+            mean_test_score.append(self._metric._optimum - (self._metric._sign * run_value.cost))
+            mean_fit_time.append(run_value.time)
+            budgets.append(run_key.budget)
+
             for hp_name in hp_names:
                 if hp_name in param_dict:
                     hp_value = param_dict[hp_name]
 
@@ -27,6 +27,8 @@
 Y_VALID = 1
 Y_TEST = 2
 
+MODEL_FN_RE = r'_([0-9]*)_([0-9]*)_([0-9]{1,3}\.[0-9]*)\.npy'
+
 
 class EnsembleBuilder(multiprocessing.Process):
     def __init__(
@@ -41,7 +43,6 @@ def __init__(
             max_models_on_disc: int = 100,
             performance_range_threshold: float = 0,
             seed: int = 1,
-            shared_mode: bool = False,
             max_iterations: int = None,
             precision: int = 32,
             sleep_duration: int = 2,
@@ -90,9 +91,6 @@ def __init__(
                 and max_models_on_disc. Might return less
             seed: int
                 random seed
-                if set to -1, read files with any seed (e.g., for shared model mode)
-            shared_model: bool
-                auto-sklearn used shared model mode (aka pSMAC)
             max_iterations: int
                 maximal number of iterations to run this script
                 (default None --> deactivated)
@@ -113,6 +111,9 @@ def __init__(
         self.task_type = task_type
         self.metric = metric
         self.time_limit = limit  # time limit
+        # define time_left here so that it is defined in case the ensemble builder is called
+        # without starting a separate process
+        self.time_left = limit
         self.ensemble_size = ensemble_size
         self.performance_range_threshold = performance_range_threshold
 
@@ -139,7 +140,6 @@ def __init__(
         self.max_resident_models = None
 
         self.seed = seed
-        self.shared_mode = shared_mode  # pSMAC?
         self.max_iterations = max_iterations
         self.precision = precision
         self.sleep_duration = sleep_duration
@@ -178,7 +178,7 @@ def __init__(
                               (ensemble_nbest, type(ensemble_nbest)))
 
         self.start_time = 0
-        self.model_fn_re = re.compile(r'_([0-9]*)_([0-9]*)_([0-9]{1,3}\.[0-9]*)\.npy')
+        self.model_fn_re = re.compile(MODEL_FN_RE)
 
         # already read prediction files
         # {"file name": {
@@ -230,8 +230,11 @@ def __init__(
 
     def run(self):
         buffer_time = 5  # TODO: Buffer time should also be used in main!?
+        process_start_time = time.time()
         while True:
-            time_left = self.time_limit - buffer_time
+            time_elapsed = time.time() - process_start_time
+            time_left = self.time_limit - buffer_time - time_elapsed
+            self.time_left = time_left
             safe_ensemble_script = pynisher.enforce_limits(
                 wall_time_in_s=int(time_left),
                 mem_in_mb=self.memory_limit,
@@ -286,7 +289,7 @@ def main(self, return_pred=False):
             self.logger.debug(
                 'Starting iteration %d, time left: %f',
                 iteration,
-                self.time_limit - used_time,
+                self.time_left - used_time,
             )
 
             # populates self.read_preds
@@ -395,7 +398,9 @@ def get_disk_consumption(self, pred_path):
         pred_test_name = 'predictions_test' + _full_name
         pred_test_path = os.path.join(self.dir_test, pred_test_name)
 
-        paths = [model_path, pred_path]
+        paths = [pred_path]
+        if os.path.exists(model_path):
+            paths.append(model_path)
         if os.path.exists(pred_valid_path):
             paths.append(pred_valid_path)
         if os.path.exists(pred_test_path):
@@ -428,17 +433,10 @@ def score_ensemble_preds(self):
             self.logger.debug("No ensemble dataset prediction directory found")
             return False
 
-        if self.shared_mode is False:
-            pred_path = os.path.join(
-                glob.escape(self.dir_ensemble),
-                'predictions_ensemble_%s_*_*.npy*' % self.seed,
-            )
-        # pSMAC
-        else:
-            pred_path = os.path.join(
-                glob.escape(self.dir_ensemble),
-                'predictions_ensemble_*_*_*.npy*',
-            )
+        pred_path = os.path.join(
+            glob.escape(self.dir_ensemble),
+            'predictions_ensemble_%s_*_*.npy*' % self.seed,
+        )
 
         y_ens_files = glob.glob(pred_path)
         y_ens_files = [y_ens_file for y_ens_file in y_ens_files
@@ -450,14 +448,22 @@ def score_ensemble_preds(self):
                               " %s" % pred_path)
             return False
 
+        done_path = os.path.join(
+            glob.escape(self.backend.get_done_directory()), '%s_*' % self.seed
+        )
+        done = glob.glob(done_path)
+        done = [os.path.split(d)[1] for d in done]
+
         # First sort files chronologically
         to_read = []
         for y_ens_fn in self.y_ens_files:
             match = self.model_fn_re.search(y_ens_fn)
             _seed = int(match.group(1))
             _num_run = int(match.group(2))
             _budget = float(match.group(3))
-            to_read.append([y_ens_fn, match, _seed, _num_run, _budget])
+
+            if '%s_%s' % (_seed, _num_run) in done:
+                to_read.append([y_ens_fn, match, _seed, _num_run, _budget])
 
         n_read_files = 0
         # Now read file wrt to num_run
@@ -1074,7 +1080,9 @@ def _delete_excess_models(self):
             pred_test_name = 'predictions_test' + _full_name
             pred_test_path = os.path.join(self.dir_test, pred_test_name)
 
-            paths = [model_path, pred_path]
+            paths = [pred_path]
+            if os.path.exists(model_path):
+                paths.append(model_path)
             if os.path.exists(pred_valid_path):
                 paths.append(pred_valid_path)
             if os.path.exists(pred_test_path):