4343from autosklearn .util .backend import Backend
4444from autosklearn .util .stopwatch import StopWatch
4545from autosklearn .util .logging_ import (
46- get_logger ,
4746 setup_logger ,
4847 start_log_server ,
48+ get_named_client_logger ,
4949)
5050from autosklearn .util import pipeline , RE_PATTERN
5151from autosklearn .ensemble_builder import EnsembleBuilderManager
5454from autosklearn .util .hash import hash_array_or_matrix
5555from autosklearn .metrics import f1_macro , accuracy , r2
5656from autosklearn .constants import MULTILABEL_CLASSIFICATION , MULTICLASS_CLASSIFICATION , \
57- REGRESSION_TASKS , REGRESSION , BINARY_CLASSIFICATION , MULTIOUTPUT_REGRESSION
57+ REGRESSION_TASKS , REGRESSION , BINARY_CLASSIFICATION , MULTIOUTPUT_REGRESSION , \
58+ CLASSIFICATION_TASKS
5859from autosklearn .pipeline .components .classification import ClassifierChoice
5960from autosklearn .pipeline .components .regression import RegressorChoice
6061from autosklearn .pipeline .components .feature_preprocessing import FeaturePreprocessorChoice
@@ -228,6 +229,9 @@ def __init__(self,
228229 raise ValueError ("per_run_time_limit not of type integer, but %s" %
229230 str (type (self ._per_run_time_limit )))
230231
232+ # By default try to use the TCP logging port or get a new port
233+ self ._logger_port = logging .handlers .DEFAULT_TCP_LOGGING_PORT
234+
231235 # After assigning and checking variables...
232236 # self._backend = Backend(self._output_dir, self._tmp_dir)
233237
@@ -313,7 +317,11 @@ def _get_logger(self, name):
313317
314318 self ._logger_port = int (port .value )
315319
316- return get_logger (logger_name )
320+ return get_named_client_logger (
321+ name = logger_name ,
322+ host = 'localhost' ,
323+ port = self ._logger_port ,
324+ )
317325
318326 def _clean_logger (self ):
319327 if not hasattr (self , 'stop_logging_server' ) or self .stop_logging_server is None :
@@ -380,6 +388,7 @@ def _do_dummy_prediction(self, datamanager, num_run):
380388 disable_file_output = self ._disable_evaluator_output ,
381389 abort_on_first_run_crash = False ,
382390 cost_for_crash = get_cost_of_crash (self ._metric ),
391+ port = self ._logger_port ,
383392 ** self ._resampling_strategy_arguments )
384393
385394 status , cost , runtime , additional_info = ta .run (num_run , cutoff = self ._time_for_task )
@@ -428,6 +437,12 @@ def fit(
428437 only_return_configuration_space : Optional [bool ] = False ,
429438 load_models : bool = True ,
430439 ):
440+ if dataset_name is None :
441+ dataset_name = hash_array_or_matrix (X )
442+ # The first thing we have to do is create the logger to update the backend
443+ self ._logger = self ._get_logger (dataset_name )
444+ self ._backend .setup_logger (self ._logger_port )
445+
431446 self ._backend .save_start_time (self ._seed )
432447 self ._stopwatch = StopWatch ()
433448
@@ -445,6 +460,15 @@ def fit(
445460 raise ValueError ('Target value shapes do not match: %s vs %s'
446461 % (y .shape , y_test .shape ))
447462
463+ X , y = self .subsample_if_too_large (
464+ X = X ,
465+ y = y ,
466+ logger = self ._logger ,
467+ seed = self ._seed ,
468+ memory_limit = self ._memory_limit ,
469+ task = self ._task ,
470+ )
471+
448472 # Reset learnt stuff
449473 self .models_ = None
450474 self .cv_models_ = None
@@ -459,12 +483,6 @@ def fit(
459483 raise ValueError ('Metric must be instance of '
460484 'autosklearn.metrics.Scorer.' )
461485
462- if dataset_name is None :
463- dataset_name = hash_array_or_matrix (X )
464- # By default try to use the TCP logging port or get a new port
465- self ._logger_port = logging .handlers .DEFAULT_TCP_LOGGING_PORT
466- self ._logger = self ._get_logger (dataset_name )
467-
468486 # If no dask client was provided, we create one, so that we can
469487 # start a ensemble process in parallel to smbo optimize
470488 if (
@@ -718,6 +736,7 @@ def fit(
718736 get_smac_object_callback = self ._get_smac_object_callback ,
719737 smac_scenario_args = self ._smac_scenario_args ,
720738 scoring_functions = self ._scoring_functions ,
739+ port = self ._logger_port ,
721740 ensemble_callback = proc_ensemble ,
722741 )
723742
@@ -770,6 +789,59 @@ def fit(
770789
771790 return self
772791
@staticmethod
def subsample_if_too_large(X, y, logger, seed, memory_limit, task):
    """Subsample ``X``/``y`` if the data is estimated not to fit in memory.

    The raw feature matrix is required to fit into a tenth of
    ``memory_limit`` (a heuristic safety margin for the rest of the
    fitting process); otherwise the number of rows is reduced so that it
    does, using stratified sampling for classification tasks where
    possible.

    Parameters
    ----------
    X : np.ndarray or other
        Feature matrix; anything that is not a ``np.ndarray`` is
        returned unchanged (its memory use cannot be estimated here).
    y : array-like
        Targets, subsampled in lockstep with ``X``.
    logger : logging.Logger
        Used to warn about unknown dtypes and about subsampling.
    seed : int
        Random state forwarded to ``train_test_split`` for reproducibility.
    memory_limit : int
        Memory limit in megabytes.
    task : int
        Task constant; membership in ``CLASSIFICATION_TASKS`` selects
        stratified sampling.

    Returns
    -------
    X, y
        The (possibly subsampled) data.
    """
    if isinstance(X, np.ndarray):
        if X.dtype.kind == 'f':
            # Bytes per value: 4 for float32, 8 for float64, 16 for
            # float128. Using ``itemsize`` avoids referencing
            # ``np.float`` (alias removed in NumPy 1.24) and
            # ``np.float128`` (absent on platforms such as Windows),
            # both of which would raise AttributeError here.
            multiplier = X.dtype.itemsize
        else:
            # Just assuming some value - very unlikely
            multiplier = 8
            logger.warning('Unknown dtype for X: %s, assuming it takes 8 bytes/number',
                           str(X.dtype))
        megabytes = X.shape[0] * X.shape[1] * multiplier / 1024 / 1024
        if memory_limit <= megabytes * 10:
            # Largest row count whose raw data fits in memory_limit / 10.
            new_num_samples = int(
                memory_limit / (10 * X.shape[1] * multiplier / 1024 / 1024)
            )
            logger.warning(
                'Dataset too large for memory limit %dMB, reducing number of samples from '
                '%d to %d.',
                memory_limit,
                X.shape[0],
                new_num_samples,
            )
            if task in CLASSIFICATION_TASKS:
                try:
                    X, _, y, _ = sklearn.model_selection.train_test_split(
                        X, y,
                        train_size=new_num_samples,
                        random_state=seed,
                        stratify=y,
                    )
                except Exception:
                    # Stratification fails e.g. when some class has fewer
                    # members than the number of splits; fall back to a
                    # plain random subsample instead of aborting the fit.
                    logger.warning(
                        'Could not sample dataset in stratified manner, resorting to random '
                        'sampling',
                        exc_info=True
                    )
                    X, _, y, _ = sklearn.model_selection.train_test_split(
                        X, y,
                        train_size=new_num_samples,
                        random_state=seed,
                    )
            else:
                X, _, y, _ = sklearn.model_selection.train_test_split(
                    X, y,
                    train_size=new_num_samples,
                    random_state=seed,
                )
    return X, y
844+
773845 def refit (self , X , y ):
774846
775847 # Make sure input data is valid
@@ -1118,9 +1190,9 @@ def cv_results_(self):
11181190 status .append ('Abort' )
11191191 elif s == StatusType .MEMOUT :
11201192 status .append ('Memout' )
1121- elif s == StatusType .RUNNING :
1122- continue
1123- elif s == StatusType .BUDGETEXHAUSTED :
1193+ # TODO remove StatusType.RUNNING at some point in the future when the new SMAC 0.13.2
1194+ # is the new minimum required version!
1195+ elif s in ( StatusType .STOP , StatusType . RUNNING ) :
11241196 continue
11251197 else :
11261198 raise NotImplementedError (s )
0 commit comments