Skip to content

Commit 202918e

Browse files
authored
Merge pull request #495 from automl/development
Development
2 parents 43eb3d8 + 0c30343 commit 202918e

File tree

127 files changed

+5026
-2222
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

127 files changed

+5026
-2222
lines changed

autosklearn/__init__.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
# -*- encoding: utf-8 -*-
2+
import os
3+
import sys
4+
25
from autosklearn.util import dependencies
36
from autosklearn.__version__ import __version__
47

@@ -13,3 +16,17 @@
1316
'''
1417

1518
dependencies.verify_packages(__MANDATORY_PACKAGES__)
19+
20+
if os.name != 'posix':
21+
raise ValueError(
22+
'Detected unsupported operating system: %s. Please check '
23+
'the compability information of auto-sklearn: http://automl.github.io'
24+
'/auto-sklearn/stable/installation.html#windows-osx-compability' %
25+
sys.platform
26+
)
27+
28+
if sys.version_info < (3, 5):
29+
raise ValueError(
30+
'Unsupported python version %s found. Auto-sklearn requires Python '
31+
'3.5 or higher.' % sys.version_info
32+
)

autosklearn/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""Version information."""
22

33
# The following line *must* be the last in the module, exactly as formatted:
4-
__version__ = "0.3.0"
4+
__version__ = "0.4.0"

autosklearn/automl.py

Lines changed: 126 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import numpy.ma as ma
1111
import scipy.stats
1212
from sklearn.base import BaseEstimator
13+
from sklearn.model_selection._split import _RepeatedSplits, \
14+
BaseShuffleSplit, BaseCrossValidator
1315
from smac.tae.execute_ta_run import StatusType
1416
from smac.stats.stats import Stats
1517
from sklearn.externals import joblib
@@ -133,14 +135,17 @@ def __init__(self,
133135
# After assignging and checking variables...
134136
#self._backend = Backend(self._output_dir, self._tmp_dir)
135137

136-
def fit(self, X, y,
137-
task=MULTICLASS_CLASSIFICATION,
138-
metric=None,
139-
feat_type=None,
140-
dataset_name=None):
141-
if not self._shared_mode:
142-
self._backend.context.delete_directories()
143-
else:
138+
def fit(
139+
self, X, y,
140+
task,
141+
metric,
142+
X_test=None,
143+
y_test=None,
144+
feat_type=None,
145+
dataset_name=None,
146+
only_return_configuration_space=False,
147+
):
148+
if self._shared_mode:
144149
# If this fails, it's likely that this is the first call to get
145150
# the data manager
146151
try:
@@ -149,8 +154,6 @@ def fit(self, X, y,
149154
except IOError:
150155
pass
151156

152-
self._backend.context.create_directories()
153-
154157
if dataset_name is None:
155158
dataset_name = hash_array_or_matrix(X)
156159

@@ -181,13 +184,22 @@ def fit(self, X, y,
181184
'valid feature types, you passed `%s`' % ft)
182185

183186
self._data_memory_limit = None
184-
loaded_data_manager = XYDataManager(X, y,
185-
task=task,
186-
feat_type=feat_type,
187-
dataset_name=dataset_name)
187+
loaded_data_manager = XYDataManager(
188+
X, y,
189+
X_test=X_test,
190+
y_test=y_test,
191+
task=task,
192+
feat_type=feat_type,
193+
dataset_name=dataset_name,
194+
)
188195

189-
return self._fit(loaded_data_manager, metric)
196+
return self._fit(
197+
loaded_data_manager,
198+
metric,
199+
only_return_configuration_space,
200+
)
190201

202+
# TODO this is very old code which can be dropped!
191203
def fit_automl_dataset(self, dataset, metric):
192204
self._stopwatch = StopWatch()
193205
self._backend.save_start_time(self._seed)
@@ -280,7 +292,7 @@ def _do_dummy_prediction(self, datamanager, num_run):
280292

281293
return ta.num_run
282294

283-
def _fit(self, datamanager, metric):
295+
def _fit(self, datamanager, metric, only_return_configuration_space=False):
284296
# Reset learnt stuff
285297
self.models_ = None
286298
self.ensemble_ = None
@@ -296,9 +308,13 @@ def _fit(self, datamanager, metric):
296308
raise ValueError("List member '%s' for argument "
297309
"'disable_evaluator_output' must be one "
298310
"of " + str(allowed_elements))
299-
if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
300-
'cv', 'partial-cv',
301-
'partial-cv-iterative-fit']:
311+
if self._resampling_strategy not in [
312+
'holdout', 'holdout-iterative-fit',
313+
'cv', 'partial-cv',
314+
'partial-cv-iterative-fit'] \
315+
and not issubclass(self._resampling_strategy, BaseCrossValidator)\
316+
and not issubclass(self._resampling_strategy, _RepeatedSplits)\
317+
and not issubclass(self._resampling_strategy, BaseShuffleSplit):
302318
raise ValueError('Illegal resampling strategy: %s' %
303319
self._resampling_strategy)
304320
if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit'] \
@@ -354,6 +370,8 @@ def _fit(self, datamanager, metric):
354370
exclude_estimators=self._exclude_estimators,
355371
include_preprocessors=self._include_preprocessors,
356372
exclude_preprocessors=self._exclude_preprocessors)
373+
if only_return_configuration_space:
374+
return self.configuration_space
357375

358376
# == RUN ensemble builder
359377
# Do this before calculating the meta-features to make sure that the
@@ -367,9 +385,13 @@ def _fit(self, datamanager, metric):
367385
self._logger.info(
368386
'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
369387
if time_left_for_ensembles <= 0:
370-
self._logger.warning("Not starting ensemble builder because there "
371-
"is no time left!")
372388
self._proc_ensemble = None
389+
# Fit only raises error when ensemble_size is not zero but
390+
# time_left_for_ensembles is zero.
391+
if self._ensemble_size > 0:
392+
raise ValueError("Not starting ensemble builder because there "
393+
"is no time left. Try increasing the value "
394+
"of time_left_for_this_task.")
373395
else:
374396
self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles)
375397
if self._ensemble_size > 0:
@@ -384,7 +406,7 @@ def _fit(self, datamanager, metric):
384406
del self._datamanager
385407
except Exception:
386408
pass
387-
409+
388410
# => RUN SMAC
389411
smac_task_name = 'runSMAC'
390412
self._stopwatch.start_task(smac_task_name)
@@ -465,14 +487,18 @@ def send_warnings_to_log(message, category, filename, lineno,
465487

466488
if self._keep_models is not True:
467489
raise ValueError(
468-
"Predict can only be called if 'keep_models==True'")
490+
"Refit can only be called if 'keep_models==True'")
469491
if self.models_ is None or len(self.models_) == 0 or \
470492
self.ensemble_ is None:
471493
self._load_models()
472494

495+
# Refit is not applicable when ensemble_size is set to zero.
496+
if self.ensemble_ is None:
497+
raise ValueError("Refit can only be called if 'ensemble_size != 0'")
498+
473499
random_state = np.random.RandomState(self._seed)
474500
for identifier in self.models_:
475-
if identifier in self.ensemble_.get_model_identifiers():
501+
if identifier in self.ensemble_.get_selected_model_identifiers():
476502
model = self.models_[identifier]
477503
# this updates the model inplace, it can then later be used in
478504
# predict method
@@ -528,11 +554,18 @@ def predict(self, X, batch_size=None, n_jobs=1):
528554
self.ensemble_ is None:
529555
self._load_models()
530556

557+
# If self.ensemble_ is None, it means that ensemble_size is set to zero.
558+
# In such cases, raise error because predict and predict_proba cannot
559+
# be called.
560+
if self.ensemble_ is None:
561+
raise ValueError("Predict and predict_proba can only be called "
562+
"if 'ensemble_size != 0'")
563+
531564
# Parallelize predictions across models with n_jobs processes.
532565
# Each process computes predictions in chunks of batch_size rows.
533566
all_predictions = joblib.Parallel(n_jobs=n_jobs)(
534567
joblib.delayed(_model_predict)(self, X, batch_size, identifier)
535-
for identifier in self.ensemble_.get_model_identifiers())
568+
for identifier in self.ensemble_.get_selected_model_identifiers())
536569

537570
if len(all_predictions) == 0:
538571
raise ValueError('Something went wrong generating the predictions. '
@@ -559,6 +592,7 @@ def fit_ensemble(self, y, task=None, metric=None, precision='32',
559592
ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
560593
self._proc_ensemble.main()
561594
self._proc_ensemble = None
595+
self._load_models()
562596
return self
563597

564598
def _get_ensemble_process(self, time_left_for_ensembles,
@@ -606,7 +640,8 @@ def _get_ensemble_process(self, time_left_for_ensembles,
606640
seed=self._seed,
607641
shared_mode=self._shared_mode,
608642
precision=precision,
609-
max_iterations=max_iterations)
643+
max_iterations=max_iterations,
644+
read_at_most=np.inf)
610645

611646
def _load_models(self):
612647
if self._shared_mode:
@@ -631,10 +666,10 @@ def _load_models(self):
631666
['partial-cv', 'partial-cv-iterative-fit']:
632667
raise ValueError('No models fitted!')
633668

634-
self.models = []
669+
self.models_ = []
635670

636671
else:
637-
self.models = []
672+
self.models_ = []
638673

639674
def score(self, X, y):
640675
# fix: Consider only index 1 of second dimension
@@ -747,10 +782,10 @@ def sprint_statistics(self):
747782
num_crash = sum([s == 'Crash' for s in cv_results['status']])
748783
sio.write(' Number of crashed target algorithm runs: %d\n' % num_crash)
749784
num_timeout = sum([s == 'Timeout' for s in cv_results['status']])
750-
sio.write(' Number of target algorithms that exceeded the memory '
785+
sio.write(' Number of target algorithms that exceeded the time '
751786
'limit: %d\n' % num_timeout)
752787
num_memout = sum([s == 'Memout' for s in cv_results['status']])
753-
sio.write(' Number of target algorithms that exceeded the time '
788+
sio.write(' Number of target algorithms that exceeded the memory '
754789
'limit: %d\n' % num_memout)
755790
return sio.getvalue()
756791

@@ -810,7 +845,8 @@ def __init__(self, *args, **kwargs):
810845

811846
def _perform_input_checks(self, X, y):
812847
X = self._check_X(X)
813-
y = self._check_y(y)
848+
if y is not None:
849+
y = self._check_y(y)
814850
return X, y
815851

816852
def _check_X(self, X):
@@ -864,12 +900,21 @@ def __init__(self, *args, **kwargs):
864900
'multiclass': MULTICLASS_CLASSIFICATION,
865901
'binary': BINARY_CLASSIFICATION}
866902

867-
def fit(self, X, y,
868-
metric=None,
869-
loss=None,
870-
feat_type=None,
871-
dataset_name=None):
903+
def fit(
904+
self, X, y,
905+
X_test=None,
906+
y_test=None,
907+
metric=None,
908+
feat_type=None,
909+
dataset_name=None,
910+
only_return_configuration_space=False,
911+
):
872912
X, y = self._perform_input_checks(X, y)
913+
if X_test is not None:
914+
X_test, y_test = self._perform_input_checks(X_test, y_test)
915+
if len(y.shape) != len(y_test.shape):
916+
raise ValueError('Target value shapes do not match: %s vs %s'
917+
% (y.shape, y_test.shape))
873918

874919
y_task = type_of_target(y)
875920
task = self._task_mapping.get(y_task)
@@ -883,8 +928,31 @@ def fit(self, X, y,
883928
metric = accuracy
884929

885930
y, self._classes, self._n_classes = self._process_target_classes(y)
886-
887-
return super().fit(X, y, task, metric, feat_type, dataset_name)
931+
if y_test is not None:
932+
# Map test values to actual values - TODO: copy to all kinds of
933+
# other parts in this code and test it!!!
934+
y_test_new = []
935+
for output_idx in range(len(self._classes)):
936+
mapping = {self._classes[output_idx][idx]: idx
937+
for idx in range(len(self._classes[output_idx]))}
938+
enumeration = y_test if len(self._classes) == 1 else y_test[output_idx]
939+
y_test_new.append(
940+
np.array([mapping[value] for value in enumeration])
941+
)
942+
y_test = np.array(y_test_new)
943+
if self._n_outputs == 1:
944+
y_test = y_test.flatten()
945+
946+
return super().fit(
947+
X, y,
948+
X_test=X_test,
949+
y_test=y_test,
950+
task=task,
951+
metric=metric,
952+
feat_type=feat_type,
953+
dataset_name=dataset_name,
954+
only_return_configuration_space=only_return_configuration_space,
955+
)
888956

889957
def fit_ensemble(self, y, task=None, metric=None, precision='32',
890958
dataset_name=None, ensemble_nbest=None,
@@ -917,7 +985,7 @@ def _process_target_classes(self, y):
917985
_classes.append(classes_k)
918986
_n_classes.append(classes_k.shape[0])
919987

920-
self._n_classes = np.array(_n_classes, dtype=np.int)
988+
_n_classes = np.array(_n_classes, dtype=np.int)
921989

922990
return y, _classes, _n_classes
923991

@@ -947,16 +1015,32 @@ def predict_proba(self, X, batch_size=None, n_jobs=1):
9471015

9481016

9491017
class AutoMLRegressor(BaseAutoML):
950-
def fit(self, X, y, metric=None, feat_type=None, dataset_name=None):
1018+
def fit(
1019+
self, X, y,
1020+
X_test=None,
1021+
y_test=None,
1022+
metric=None,
1023+
feat_type=None,
1024+
dataset_name=None,
1025+
only_return_configuration_space=False,
1026+
):
9511027
X, y = super()._perform_input_checks(X, y)
9521028
_n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
9531029
if _n_outputs > 1:
9541030
raise NotImplementedError(
9551031
'Multi-output regression is not implemented.')
9561032
if metric is None:
9571033
metric = r2
958-
return super().fit(X, y, task=REGRESSION, metric=metric,
959-
feat_type=feat_type, dataset_name=dataset_name)
1034+
return super().fit(
1035+
X, y,
1036+
X_test=X_test,
1037+
y_test=y_test,
1038+
task=REGRESSION,
1039+
metric=metric,
1040+
feat_type=feat_type,
1041+
dataset_name=dataset_name,
1042+
only_return_configuration_space=only_return_configuration_space,
1043+
)
9601044

9611045
def fit_ensemble(self, y, task=None, metric=None, precision='32',
9621046
dataset_name=None, ensemble_nbest=None,

0 commit comments

Comments (0)