Skip to content

Commit ad2b4a1

Browse files
committed
Merge pull request #37 from automl/development
Development
2 parents 39974ba + 2892f2f commit ad2b4a1

File tree

158 files changed

+5015
-2826
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

158 files changed

+5015
-2826
lines changed

autosklearn/automl.py

Lines changed: 58 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,8 @@ def __init__(self,
187187
self._metric = None
188188
self._label_num = None
189189
self.models_ = None
190-
self.ensemble_indices_ = None
190+
self.ensemble_ = None
191+
self._can_predict = False
191192

192193
self._debug_mode = debug_mode
193194
self._backend = Backend(self._output_dir, self._tmp_dir)
@@ -242,9 +243,14 @@ def fit(self, X, y,
242243
raise ValueError('Array feat_type does not have same number of '
243244
'variables as X has features. %d vs %d.' %
244245
(len(feat_type), X.shape[1]))
245-
if feat_type is not None and not all([isinstance(f, bool)
246+
if feat_type is not None and not all([isinstance(f, str)
246247
for f in feat_type]):
247-
raise ValueError('Array feat_type must only contain bools.')
248+
raise ValueError('Array feat_type must only contain strings.')
249+
if feat_type is not None:
250+
for ft in feat_type:
251+
if ft.lower() not in ['categorical', 'numerical']:
252+
raise ValueError('Only `Categorical` and `Numerical` are '
253+
'valid feature types, you passed `%s`' % ft)
248254

249255
loaded_data_manager = XYDataManager(X, y,
250256
task=task,
@@ -298,16 +304,19 @@ def _print_load_time(basename, time_left_for_this_task,
298304
return time_for_load_data
299305

300306
def _do_dummy_prediction(self, datamanager):
307+
self._logger.info("Starting to create dummy predictions.")
301308
autosklearn.cli.base_interface.main(datamanager,
302309
self._resampling_strategy,
303310
None,
304311
None,
305-
mode_args=self._resampling_strategy_arguments)
312+
mode_args=self._resampling_strategy_arguments,
313+
output_dir=self._tmp_dir)
314+
self._logger.info("Finished creating dummy predictions.")
306315

307316
def _fit(self, datamanager):
308317
# Reset learnt stuff
309318
self.models_ = None
310-
self.ensemble_indices_ = None
319+
self.ensemble_ = None
311320

312321
# Check arguments prior to doing anything!
313322
if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
@@ -352,7 +361,8 @@ def _fit(self, datamanager):
352361
self._logger)
353362

354363
# == Perform dummy predictions
355-
self._do_dummy_prediction(datamanager)
364+
if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']:
365+
self._do_dummy_prediction(datamanager)
356366

357367
# = Create a searchspace
358368
# Do this before One Hot Encoding to make sure that it creates a
@@ -371,6 +381,12 @@ def _fit(self, datamanager):
371381
self._include_preprocessors)
372382
self.configuration_space_created_hook(datamanager)
373383

384+
# == RUN ensemble builder
385+
# Do this before calculating the meta-features to make sure that the
386+
# dummy predictions are actually included in the ensemble even if
387+
# calculating the meta-features takes very long
388+
proc_ensembles = self.run_ensemble_builder()
389+
374390
# == Calculate metafeatures
375391
meta_features = _calculate_metafeatures(
376392
data_feat_type=datamanager.feat_type,
@@ -481,9 +497,6 @@ def _fit(self, datamanager):
481497
resampling_strategy_arguments=self._resampling_strategy_arguments,
482498
shared_mode=self._shared_mode)
483499

484-
# == RUN ensemble builder
485-
proc_ensembles = self.run_ensemble_builder()
486-
487500
procs = []
488501

489502
if proc_smac is not None:
@@ -554,26 +567,43 @@ def run_ensemble_builder(self,
554567
'size 0.')
555568
return None
556569

570+
def refit(self, X, y):
    """Refit every model that belongs to the ensemble on ``X`` and ``y``.

    Models are fitted in place, so they can be used by the prediction
    methods afterwards. On success ``self._can_predict`` is set to True.

    Parameters
    ----------
    X, y
        Training data and targets. Both must provide a ``copy()`` method;
        every model receives its own fresh copy.

    Raises
    ------
    ValueError
        If the instance was not configured with ``keep_models == True``.
    """
    if self._keep_models is not True:
        # The message names this method; it used to say "Predict", which
        # pointed callers at the wrong call site.
        raise ValueError(
            "Refit can only be called if 'keep_models==True'")

    # Load models and ensemble lazily if they are not in memory yet.
    if self.models_ is None or len(self.models_) == 0 or \
            self.ensemble_ is None:
        self._load_models()

    # Query the ensemble members once instead of once per stored model.
    ensemble_identifiers = self.ensemble_.get_model_identifiers()
    for identifier, model in self.models_.items():
        if identifier in ensemble_identifiers:
            # this updates the model inplace, it can then later be used in
            # predict method
            model.fit(X.copy(), y.copy())

    self._can_predict = True
586+
557587
def predict(self, X):
    """Return the index of the largest ``predict_proba`` entry per row.

    ``X`` is forwarded unchanged to ``self.predict_proba``; the argmax is
    taken along axis 1 of whatever that call returns.
    """
    probabilities = self.predict_proba(X)
    return np.argmax(probabilities, axis=1)
589+
590+
def predict_proba(self, X):
558591
if self._keep_models is not True:
559592
raise ValueError(
560593
"Predict can only be called if 'keep_models==True'")
561-
if self._resampling_strategy not in ['holdout',
562-
'holdout-iterative-fit']:
594+
if not self._can_predict and \
595+
self._resampling_strategy not in \
596+
['holdout', 'holdout-iterative-fit']:
563597
raise NotImplementedError(
564598
'Predict is currently only implemented for resampling '
565599
'strategy holdout.')
566600

567-
if self.models_ is None or len(self.models_) == 0 or len(
568-
self.ensemble_indices_) == 0:
601+
if self.models_ is None or len(self.models_) == 0 or \
602+
self.ensemble_ is None:
569603
self._load_models()
570604

571-
predictions = []
572-
for identifier in self.models_:
573-
if identifier not in self.ensemble_indices_:
574-
continue
575-
576-
weight = self.ensemble_indices_[identifier]
605+
all_predictions = []
606+
for identifier in self.ensemble_.get_model_identifiers():
577607
model = self.models_[identifier]
578608

579609
X_ = X.copy()
@@ -588,16 +618,16 @@ def predict(self, X):
588618
"while X_.shape is %s" %
589619
(model, str(prediction.shape),
590620
str(X_.shape)))
591-
predictions.append(prediction * weight)
621+
all_predictions.append(prediction)
592622

593-
if len(predictions) == 0:
623+
if len(all_predictions) == 0:
594624
raise ValueError('Something went wrong generating the predictions. '
595625
'The ensemble should consist of the following '
596626
'models: %s, the following models were loaded: '
597627
'%s' % (str(list(self.ensemble_indices_.keys())),
598628
str(list(self.models_.keys()))))
599629

600-
predictions = np.sum(np.array(predictions), axis=0)
630+
predictions = self.ensemble_.predict(all_predictions)
601631
return predictions
602632

603633
def _load_models(self):
@@ -610,42 +640,23 @@ def _load_models(self):
610640
if len(self.models_) == 0:
611641
raise ValueError('No models fitted!')
612642

613-
self.ensemble_indices_ = self._backend.load_ensemble_indices_weights(
614-
seed)
643+
self.ensemble_ = self._backend.load_ensemble(seed)
615644

616645
def score(self, X, y):
617646
# fix: Consider only index 1 of second dimension
618647
# Don't know if the reshaping should be done there or in calculate_score
619-
prediction = self.predict(X)
620-
if self._task == BINARY_CLASSIFICATION:
621-
prediction = prediction[:, 1].reshape((-1, 1))
648+
prediction = self.predict_proba(X)
622649
return calculate_score(y, prediction, self._task,
623650
self._metric, self._label_num,
624651
logger=self._logger)
625652

626653
def show_models(self):
627-
if self.models_ is None or len(self.models_) == 0 or len(
628-
self.ensemble_indices_) == 0:
629-
self._load_models()
630654

631-
output = []
632-
sio = six.StringIO()
633-
for identifier in self.models_:
634-
if identifier not in self.ensemble_indices_:
635-
continue
636-
637-
weight = self.ensemble_indices_[identifier]
638-
model = self.models_[identifier]
639-
output.append((weight, model))
640-
641-
output.sort(reverse=True)
642-
643-
sio.write("[")
644-
for weight, model in output:
645-
sio.write("(%f, %s),\n" % (weight, model))
646-
sio.write("]")
655+
if self.models_ is None or len(self.models_) == 0 or \
656+
self.ensemble_ is None:
657+
self._load_models()
647658

648-
return sio.getvalue()
659+
return self.ensemble_.pprint_ensemble_string(self.models_)
649660

650661
def _save_ensemble_data(self, X, y):
651662
"""Split dataset and store Data for the ensemble script.

autosklearn/cli/HPOlib_interface.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def parse_cli():
8282
return args, parameters
8383

8484

85-
def parse_args(dataset, mode, seed, params, fold, folds):
85+
def parse_args(dataset, mode, seed, params, fold, folds, output_dir=None):
8686
if seed is None:
8787
seed = 1
8888

@@ -107,10 +107,11 @@ def parse_args(dataset, mode, seed, params, fold, folds):
107107
mode_args = None
108108
else:
109109
raise ValueError(mode)
110-
base_interface.main(dataset, mode, seed, params, mode_args=mode_args)
110+
base_interface.main(dataset, mode, seed, params, mode_args=mode_args,
111+
output_dir=output_dir)
111112

112113

113-
def main():
114+
def main(output_dir=None):
114115
args, params = parse_cli()
115116
assert 'dataset' in args
116117
assert 'mode' in args
@@ -124,6 +125,7 @@ def main():
124125
params,
125126
int(args['fold']),
126127
int(args['folds']),
128+
output_dir=output_dir
127129
)
128130

129131

autosklearn/cli/SMAC_interface.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33

44
from autosklearn.cli import base_interface
55

6-
def main():
6+
7+
def main(output_dir=None):
78
instance_name = sys.argv[1]
89
instance_specific_information = sys.argv[2]
910
cutoff_time = float(sys.argv[3])
@@ -45,7 +46,7 @@ def main():
4546
raise ValueError(mode)
4647

4748
base_interface.main(instance_specific_information, mode,
48-
seed, params, mode_args=mode_args)
49+
seed, params, mode_args=mode_args, output_dir=output_dir)
4950

5051

5152
if __name__ == '__main__':

0 commit comments

Comments
 (0)