Skip to content

Commit 08f32a8

Browse files
authored
Merge pull request #1043 from automl/development
Development
2 parents 40304b4 + da7d765 commit 08f32a8

File tree

611 files changed

+87522
-51967
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

611 files changed

+87522
-51967
lines changed

.github/workflows/pytest.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ on: [push, pull_request]
55
jobs:
66
ubuntu:
77

8-
runs-on: ubuntu-latest
8+
runs-on: ubuntu-18.04
99
strategy:
1010
matrix:
1111
python-version: [3.6, 3.7, 3.8]
@@ -54,17 +54,15 @@ jobs:
5454
sudo apt-get install swig3.0
5555
sudo ln -s /usr/bin/swig3.0 /usr/bin/swig
5656
# We need to install for the dependencies, like pytest
57-
pip install -e .[test]
58-
# Then we remove autosklearn and install from DIST
59-
pip uninstall --yes auto-sklearn
6057
python setup.py sdist
6158
last_dist=$(ls -t dist/auto-sklearn-*.tar.gz | head -n 1)
62-
pip install $last_dist
59+
pip install $last_dist[test]
6360
- name: Store repository status
6461
id: status-before
6562
run: |
6663
echo "::set-output name=BEFORE::$(git status --porcelain -b)"
6764
- name: Conda Run tests
65+
timeout-minutes: 45
6866
if: matrix.use-conda == true
6967
run: |
7068
export OPENBLAS_NUM_THREADS=1
@@ -76,6 +74,7 @@ jobs:
7674
if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autosklearn --cov-report=xml'; fi
7775
$CONDA/envs/testenv/bin/python3 -m pytest --durations=20 --timeout=300 --timeout-method=thread -v $codecov test
7876
- name: Run tests
77+
timeout-minutes: 45
7978
if: matrix.use-conda == false
8079
run: |
8180
export OPENBLAS_NUM_THREADS=1

autosklearn/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""Version information."""
22

33
# The following line *must* be the last in the module, exactly as formatted:
4-
__version__ = "0.12.0"
4+
__version__ = "0.12.0rc1"

autosklearn/automl.py

Lines changed: 84 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@
4343
from autosklearn.util.backend import Backend
4444
from autosklearn.util.stopwatch import StopWatch
4545
from autosklearn.util.logging_ import (
46-
get_logger,
4746
setup_logger,
4847
start_log_server,
48+
get_named_client_logger,
4949
)
5050
from autosklearn.util import pipeline, RE_PATTERN
5151
from autosklearn.ensemble_builder import EnsembleBuilderManager
@@ -54,7 +54,8 @@
5454
from autosklearn.util.hash import hash_array_or_matrix
5555
from autosklearn.metrics import f1_macro, accuracy, r2
5656
from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \
57-
REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION
57+
REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION, \
58+
CLASSIFICATION_TASKS
5859
from autosklearn.pipeline.components.classification import ClassifierChoice
5960
from autosklearn.pipeline.components.regression import RegressorChoice
6061
from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice
@@ -228,6 +229,9 @@ def __init__(self,
228229
raise ValueError("per_run_time_limit not of type integer, but %s" %
229230
str(type(self._per_run_time_limit)))
230231

232+
# By default try to use the TCP logging port or get a new port
233+
self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
234+
231235
# After assigning and checking variables...
232236
# self._backend = Backend(self._output_dir, self._tmp_dir)
233237

@@ -313,7 +317,11 @@ def _get_logger(self, name):
313317

314318
self._logger_port = int(port.value)
315319

316-
return get_logger(logger_name)
320+
return get_named_client_logger(
321+
name=logger_name,
322+
host='localhost',
323+
port=self._logger_port,
324+
)
317325

318326
def _clean_logger(self):
319327
if not hasattr(self, 'stop_logging_server') or self.stop_logging_server is None:
@@ -380,6 +388,7 @@ def _do_dummy_prediction(self, datamanager, num_run):
380388
disable_file_output=self._disable_evaluator_output,
381389
abort_on_first_run_crash=False,
382390
cost_for_crash=get_cost_of_crash(self._metric),
391+
port=self._logger_port,
383392
**self._resampling_strategy_arguments)
384393

385394
status, cost, runtime, additional_info = ta.run(num_run, cutoff=self._time_for_task)
@@ -428,6 +437,12 @@ def fit(
428437
only_return_configuration_space: Optional[bool] = False,
429438
load_models: bool = True,
430439
):
440+
if dataset_name is None:
441+
dataset_name = hash_array_or_matrix(X)
442+
# The first thing we have to do is create the logger to update the backend
443+
self._logger = self._get_logger(dataset_name)
444+
self._backend.setup_logger(self._logger_port)
445+
431446
self._backend.save_start_time(self._seed)
432447
self._stopwatch = StopWatch()
433448

@@ -445,6 +460,15 @@ def fit(
445460
raise ValueError('Target value shapes do not match: %s vs %s'
446461
% (y.shape, y_test.shape))
447462

463+
X, y = self.subsample_if_too_large(
464+
X=X,
465+
y=y,
466+
logger=self._logger,
467+
seed=self._seed,
468+
memory_limit=self._memory_limit,
469+
task=self._task,
470+
)
471+
448472
# Reset learnt stuff
449473
self.models_ = None
450474
self.cv_models_ = None
@@ -459,12 +483,6 @@ def fit(
459483
raise ValueError('Metric must be instance of '
460484
'autosklearn.metrics.Scorer.')
461485

462-
if dataset_name is None:
463-
dataset_name = hash_array_or_matrix(X)
464-
# By default try to use the TCP logging port or get a new port
465-
self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
466-
self._logger = self._get_logger(dataset_name)
467-
468486
# If no dask client was provided, we create one, so that we can
469487
# start a ensemble process in parallel to smbo optimize
470488
if (
@@ -718,6 +736,7 @@ def fit(
718736
get_smac_object_callback=self._get_smac_object_callback,
719737
smac_scenario_args=self._smac_scenario_args,
720738
scoring_functions=self._scoring_functions,
739+
port=self._logger_port,
721740
ensemble_callback=proc_ensemble,
722741
)
723742

@@ -770,6 +789,59 @@ def fit(
770789

771790
return self
772791

792+
@staticmethod
def subsample_if_too_large(X, y, logger, seed, memory_limit, task):
    """Subsample ``(X, y)`` when a dense ``X`` is too large for ``memory_limit``.

    The in-memory size of ``X`` is estimated from its shape and dtype. If
    ten times that estimate does not fit into ``memory_limit``, the number
    of rows is reduced so that it does. Classification tasks are sampled in
    a stratified manner first, with a fallback to plain random sampling.

    Parameters
    ----------
    X : array-like
        Feature data. Only ``np.ndarray`` inputs are ever subsampled; any
        other type is returned untouched.
    y : array-like
        Targets, split together with ``X``.
    logger : logging.Logger-like
        Receives warnings about unknown dtypes and about the reduction.
    seed : int
        Passed as ``random_state`` to ``train_test_split``.
    memory_limit : int or float or None
        Memory budget, compared against the size estimate in megabytes.
        ``None`` disables subsampling.
    task : int
        Task identifier; members of ``CLASSIFICATION_TASKS`` are stratified.

    Returns
    -------
    tuple
        ``(X, y)``, either the original objects or the subsampled ones.
    """
    # Without a budget there is nothing to compare against; comparing
    # ``None <= float`` would raise a TypeError.
    if memory_limit is None or not isinstance(X, np.ndarray):
        return X, y

    # Bytes per element for float32/float64/float128. Reading the item size
    # from the dtype avoids referencing ``np.float`` (removed from NumPy)
    # and ``np.float128`` (not available on every platform).
    if X.dtype.kind == 'f' and X.dtype.itemsize in (4, 8, 16):
        multiplier = X.dtype.itemsize
    else:
        # Just assuming some value - very unlikely
        multiplier = 8
        logger.warning('Unknown dtype for X: %s, assuming it takes 8 bytes/number',
                       str(X.dtype))

    megabytes = X.shape[0] * X.shape[1] * multiplier / 1024 / 1024
    if memory_limit > megabytes * 10:
        return X, y

    new_num_samples = int(
        memory_limit / (10 * X.shape[1] * multiplier / 1024 / 1024)
    )
    # At the exact boundary the computed size equals the current size;
    # train_test_split rejects train_size == n_samples, so keep the data.
    if new_num_samples >= X.shape[0]:
        return X, y

    logger.warning(
        'Dataset too large for memory limit %dMB, reducing number of samples from '
        '%d to %d.',
        memory_limit,
        X.shape[0],
        new_num_samples,
    )

    split_kwargs = dict(train_size=new_num_samples, random_state=seed)
    if task in CLASSIFICATION_TASKS:
        try:
            X_sub, _, y_sub, _ = sklearn.model_selection.train_test_split(
                X, y,
                stratify=y,
                **split_kwargs
            )
            return X_sub, y_sub
        except Exception:
            # Stratification fails e.g. for classes with a single member;
            # this is deliberately best-effort, so fall through to the
            # random split below.
            logger.warning(
                'Could not sample dataset in stratified manner, resorting to random '
                'sampling',
                exc_info=True
            )

    X, _, y, _ = sklearn.model_selection.train_test_split(
        X, y,
        **split_kwargs
    )
    return X, y
844+
773845
def refit(self, X, y):
774846

775847
# Make sure input data is valid
@@ -1118,9 +1190,9 @@ def cv_results_(self):
11181190
status.append('Abort')
11191191
elif s == StatusType.MEMOUT:
11201192
status.append('Memout')
1121-
elif s == StatusType.RUNNING:
1122-
continue
1123-
elif s == StatusType.BUDGETEXHAUSTED:
1193+
# TODO remove StatusType.RUNNING at some point in the future when the new SMAC 0.13.2
1194+
# is the new minimum required version!
1195+
elif s in (StatusType.STOP, StatusType.RUNNING):
11241196
continue
11251197
else:
11261198
raise NotImplementedError(s)

autosklearn/data/validation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ def _check_and_get_columns_to_encode(
305305
"Cast it to a valid dtype before using it in Auto-Sklearn. "
306306
"Valid types are numerical, categorical or boolean. "
307307
"You can cast it to a valid dtype using "
308-
"pandas.Series.astype ."
308+
"pandas.Series.astype. "
309309
"If working with string objects, the following "
310310
"tutorial illustrates how to work with text data: "
311311
"https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( # noqa: E501

autosklearn/ensemble_builder.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from autosklearn.metrics import calculate_score, Scorer
3131
from autosklearn.ensembles.ensemble_selection import EnsembleSelection
3232
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
33-
from autosklearn.util.logging_ import get_named_client_logger, get_logger
33+
from autosklearn.util.logging_ import get_named_client_logger
3434

3535
Y_ENSEMBLE = 0
3636
Y_VALID = 1
@@ -162,7 +162,10 @@ def build_ensemble(
162162
# The second criteria is elapsed time
163163
elapsed_time = time.time() - self.start_time
164164

165-
logger = get_logger('EnsembleBuilder')
165+
logger = get_named_client_logger(
166+
name='EnsembleBuilder',
167+
port=self.logger_port,
168+
)
166169

167170
# First test for termination conditions
168171
if self.time_left_for_ensembles < elapsed_time:
@@ -476,7 +479,6 @@ def __init__(
476479
self.logger = get_named_client_logger(
477480
name='EnsembleBuilder',
478481
port=self.logger_port,
479-
output_dir=self.backend.temporary_directory,
480482
)
481483

482484
if ensemble_nbest == 1:
@@ -582,7 +584,6 @@ def run(
582584
self.logger = get_named_client_logger(
583585
name='EnsembleBuilder',
584586
port=self.logger_port,
585-
output_dir=self.backend.temporary_directory,
586587
)
587588

588589
process_start_time = time.time()
@@ -659,7 +660,6 @@ def main(self, time_left, iteration, return_predictions):
659660
self.logger = get_named_client_logger(
660661
name='EnsembleBuilder',
661662
port=self.logger_port,
662-
output_dir=self.backend.temporary_directory,
663663
)
664664

665665
self.start_time = time.time()

autosklearn/ensembles/ensemble_selection.py

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -105,40 +105,27 @@ def _fast(
105105
dtype=np.float64,
106106
)
107107
s = len(ensemble)
108-
if s == 0:
109-
weighted_ensemble_prediction.fill(0.0)
110-
else:
111-
weighted_ensemble_prediction.fill(0.0)
112-
for pred in ensemble:
113-
np.add(
114-
weighted_ensemble_prediction,
115-
pred,
116-
out=weighted_ensemble_prediction,
117-
)
118-
np.multiply(
119-
weighted_ensemble_prediction,
120-
1/s,
121-
out=weighted_ensemble_prediction,
122-
)
123-
np.multiply(
108+
if s > 0:
109+
np.add(
124110
weighted_ensemble_prediction,
125-
(s / float(s + 1)),
111+
ensemble[-1],
126112
out=weighted_ensemble_prediction,
127113
)
128114

129115
# Memory-efficient averaging!
130116
for j, pred in enumerate(predictions):
131-
# TODO: this could potentially be vectorized! - let's profile
132-
# the script first!
133-
fant_ensemble_prediction.fill(0.0)
117+
# fant_ensemble_prediction is the prediction of the current ensemble
118+
# and should be ([predictions[selected_prev_iterations] + predictions[j])/(s+1)
119+
# We overwrite the contents of fant_ensemble_prediction
120+
# directly with weighted_ensemble_prediction + new_prediction and then scale for avg
134121
np.add(
135-
fant_ensemble_prediction,
136122
weighted_ensemble_prediction,
123+
pred,
137124
out=fant_ensemble_prediction
138125
)
139-
np.add(
126+
np.multiply(
140127
fant_ensemble_prediction,
141-
(1. / float(s + 1)) * pred,
128+
(1. / float(s + 1)),
142129
out=fant_ensemble_prediction
143130
)
144131

autosklearn/evaluation/__init__.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# -*- encoding: utf-8 -*-
22
import functools
3+
import logging
34
import json
45
import math
56
import multiprocessing
@@ -22,7 +23,7 @@
2223
import autosklearn.evaluation.train_evaluator
2324
import autosklearn.evaluation.test_evaluator
2425
import autosklearn.evaluation.util
25-
import autosklearn.util.logging_
26+
from autosklearn.util.logging_ import get_named_client_logger
2627

2728

2829
def fit_predict_try_except_decorator(ta, queue, cost_for_crash, **kwargs):
@@ -96,7 +97,7 @@ def _encode_exit_status(exit_status):
9697
class ExecuteTaFuncWithQueue(AbstractTAFunc):
9798

9899
def __init__(self, backend, autosklearn_seed, resampling_strategy, metric,
99-
cost_for_crash, abort_on_first_run_crash,
100+
cost_for_crash, abort_on_first_run_crash, port,
100101
initial_num_run=1, stats=None,
101102
run_obj='quality', par_factor=1, scoring_functions=None,
102103
output_y_hat_optimization=True, include=None, exclude=None,
@@ -175,8 +176,15 @@ def __init__(self, backend, autosklearn_seed, resampling_strategy, metric,
175176
else:
176177
self._get_test_loss = False
177178

179+
self.port = port
178180
self.pynisher_context = pynisher_context
179-
self.logger = autosklearn.util.logging_.get_logger("TAE")
181+
if self.port is None:
182+
self.logger = logging.getLogger("TAE")
183+
else:
184+
self.logger = get_named_client_logger(
185+
name="TAE",
186+
port=self.port,
187+
)
180188

181189
def run_wrapper(
182190
self,
@@ -261,8 +269,15 @@ def run(
261269
if self.init_params is not None:
262270
init_params.update(self.init_params)
263271

272+
if self.port is None:
273+
logger = logging.getLogger("pynisher")
274+
else:
275+
logger = get_named_client_logger(
276+
name="pynisher",
277+
port=self.port,
278+
)
264279
arguments = dict(
265-
logger=autosklearn.util.logging_.get_logger("pynisher"),
280+
logger=logger,
266281
wall_time_in_s=cutoff,
267282
mem_in_mb=self.memory_limit,
268283
capture_output=True,
@@ -278,6 +293,7 @@ def run(
278293
queue=queue,
279294
config=config,
280295
backend=self.backend,
296+
port=self.port,
281297
metric=self.metric,
282298
seed=self.autosklearn_seed,
283299
num_run=num_run,

0 commit comments

Comments
 (0)