Skip to content

Commit 10c6b30

Browse files
authored
Merge pull request #717 from automl/development
Development
2 parents 1c6af59 + 47bbd11 commit 10c6b30

File tree

503 files changed

+39196
-47941
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

503 files changed

+39196
-47941
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ nosetests.xml
3535
# pycharm
3636
.idea
3737

38+
# VS code
39+
.vscode/
40+
3841
# Others
3942
*~
4043
*.dat
@@ -49,3 +52,4 @@ num_run
4952
number_submission
5053
.pypirc
5154
dmypy.json
55+
*.log

autosklearn/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
__MANDATORY_PACKAGES__ = '''
1010
numpy>=1.9
11-
scikit-learn>=0.19,<0.20
11+
scikit-learn>=0.21.0,<0.22
1212
lockfile>=0.10
1313
smac>=0.8,<0.9
1414
pyrfr>=0.6.1,<0.8

autosklearn/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""Version information."""
22

33
# The following line *must* be the last in the module, exactly as formatted:
4-
__version__ = "0.5.2"
4+
__version__ = "0.6.0"

autosklearn/automl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
BaseShuffleSplit, BaseCrossValidator
1616
from smac.tae.execute_ta_run import StatusType
1717
from smac.stats.stats import Stats
18-
from sklearn.externals import joblib
18+
import joblib
1919
import sklearn.utils
2020
import scipy.sparse
2121
from sklearn.metrics.classification import type_of_target
@@ -1059,7 +1059,7 @@ def predict_proba(self, X, batch_size=None, n_jobs=1):
10591059
class AutoMLRegressor(BaseAutoML):
10601060
def __init__(self, *args, **kwargs):
10611061
super().__init__(*args, **kwargs)
1062-
1062+
10631063
def fit(
10641064
self,
10651065
X: np.ndarray,

autosklearn/evaluation/abstract_evaluator.py

Lines changed: 13 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,13 @@
1616
from autosklearn.pipeline.implementations.util import (
1717
convert_multioutput_multiclass_to_multilabel
1818
)
19-
from autosklearn.metrics import calculate_score
19+
from autosklearn.metrics import calculate_score, CLASSIFICATION_METRICS
2020
from autosklearn.util.logging_ import get_logger
2121

2222
from ConfigSpace import Configuration
2323

2424

25+
2526
__all__ = [
2627
'AbstractEvaluator'
2728
]
@@ -213,13 +214,17 @@ def _loss(self, y_true, y_hat, all_scoring_functions=None):
213214
all_scoring_functions=all_scoring_functions)
214215

215216
if hasattr(score, '__len__'):
216-
err = {key: self.metric._optimum - score[key] for key in score}
217+
# TODO: instead of using self.metric, it should use all metrics given by key.
218+
# But for now this throws an error...
219+
220+
err = {key: metric._optimum - score[key] for key, metric in
221+
CLASSIFICATION_METRICS.items() if key in score}
217222
else:
218223
err = self.metric._optimum - score
219224

220225
return err
221226

222-
def finish_up(self, loss, train_pred, opt_pred, valid_pred, test_pred,
227+
def finish_up(self, loss, train_loss, opt_pred, valid_pred, test_pred,
223228
additional_run_info, file_output, final_call):
224229
"""This function does everything necessary after the fitting is done:
225230
@@ -233,14 +238,14 @@ def finish_up(self, loss, train_pred, opt_pred, valid_pred, test_pred,
233238

234239
if file_output:
235240
loss_, additional_run_info_ = self.file_output(
236-
train_pred, opt_pred, valid_pred, test_pred,
241+
opt_pred, valid_pred, test_pred,
237242
)
238243
else:
239244
loss_ = None
240245
additional_run_info_ = {}
241246

242-
train_loss, validation_loss, test_loss = self.calculate_auxiliary_losses(
243-
train_pred, valid_pred, test_pred,
247+
validation_loss, test_loss = self.calculate_auxiliary_losses(
248+
valid_pred, test_pred,
244249
)
245250

246251
if loss_ is not None:
@@ -276,42 +281,9 @@ def finish_up(self, loss, train_pred, opt_pred, valid_pred, test_pred,
276281

277282
def calculate_auxiliary_losses(
278283
self,
279-
Y_train_pred,
280284
Y_valid_pred,
281285
Y_test_pred
282286
):
283-
# Second check makes unit tests easier as it is not necessary to
284-
# actually inject data to compare against for calculating a loss
285-
if Y_train_pred is not None and self.Y_actual_train is not None:
286-
if len(self.Y_actual_train.shape) > 1:
287-
assert (
288-
np.sum(np.isfinite(self.Y_actual_train[:, 0]))
289-
== Y_train_pred.shape[0]
290-
), (
291-
np.sum(np.isfinite(self.Y_actual_train[:, 0])),
292-
Y_train_pred.shape[0],
293-
)
294-
else:
295-
assert (
296-
np.sum(np.isfinite(self.Y_actual_train))
297-
== Y_train_pred.shape[0]
298-
), (
299-
np.sum(np.isfinite(self.Y_actual_train)),
300-
Y_train_pred.shape[0],
301-
)
302-
Y_true_tmp = self.Y_actual_train
303-
if len(Y_true_tmp.shape) == 1:
304-
Y_true_tmp = Y_true_tmp[np.isfinite(self.Y_actual_train)]
305-
else:
306-
Y_true_tmp = Y_true_tmp[np.isfinite(self.Y_actual_train[:, 0])]
307-
train_loss = self._loss(
308-
Y_true_tmp,
309-
Y_train_pred,
310-
all_scoring_functions=False,
311-
)
312-
else:
313-
train_loss = None
314-
315287
if Y_valid_pred is not None:
316288
if self.y_valid is not None:
317289
validation_loss = self._loss(self.y_valid, Y_valid_pred)
@@ -332,11 +304,10 @@ def calculate_auxiliary_losses(
332304
else:
333305
test_loss = None
334306

335-
return train_loss, validation_loss, test_loss
307+
return validation_loss, test_loss
336308

337309
def file_output(
338310
self,
339-
Y_train_pred,
340311
Y_optimization_pred,
341312
Y_valid_pred,
342313
Y_test_pred
@@ -360,7 +331,7 @@ def file_output(
360331
)
361332

362333
for y, s in [
363-
[Y_train_pred, 'train'],
334+
# Y_train_pred deleted here. Fix unittest accordingly.
364335
[Y_optimization_pred, 'optimization'],
365336
[Y_valid_pred, 'validation'],
366337
[Y_test_pred, 'test']

autosklearn/evaluation/test_evaluator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def fit_predict_and_loss(self):
5151
loss, Y_pred, _, _ = self.predict_and_loss()
5252
self.finish_up(
5353
loss=loss,
54-
train_pred=None,
54+
train_loss=None,
5555
opt_pred=Y_pred,
5656
valid_pred=None,
5757
test_pred=None,

autosklearn/evaluation/train_evaluator.py

Lines changed: 69 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,13 @@
3535
'TimeSeriesSplit': {'n_splits': 3,
3636
'max_train_size': None},
3737
'GroupShuffleSplit': {'n_splits': 5,
38-
'test_size': 'default',
38+
'test_size': None,
3939
'random_state': None},
4040
'StratifiedShuffleSplit': {'n_splits': 10,
41-
'test_size': 'default',
41+
'test_size': None,
4242
'random_state': None},
4343
'ShuffleSplit': {'n_splits': 10,
44-
'test_size': 'default',
44+
'test_size': None,
4545
'random_state': None}
4646
}
4747

@@ -137,6 +137,12 @@ def fit_predict_and_loss(self, iterative=False):
137137
train_splits = [None] * self.cv_folds
138138

139139
y = _get_y_array(self.Y_train, self.task_type)
140+
141+
train_losses = [] # stores train loss of each fold.
142+
train_fold_weights = [] # used as weights when averaging train losses.
143+
opt_losses = [] # stores opt (validation) loss of each fold.
144+
opt_fold_weights = [] # weights for opt_losses.
145+
140146
# TODO: mention that no additional run info is possible in this
141147
# case! -> maybe remove full CV from the train evaluator anyway and
142148
# make the user implement this!
@@ -179,30 +185,57 @@ def fit_predict_and_loss(self, iterative=False):
179185
Y_test_pred[i] = test_pred
180186
train_splits[i] = train_split
181187

188+
# Compute train loss of this fold and store it. train_loss could
189+
# either be a scalar or a dict of scalars with metrics as keys.
190+
train_loss = self._loss(
191+
self.Y_train_targets[train_split],
192+
train_pred,
193+
)
194+
train_losses.append(train_loss)
195+
# number of training data points for this fold. Used for weighting
196+
# the average.
197+
train_fold_weights.append(len(train_split))
198+
199+
# Compute validation loss of this fold and store it.
200+
optimization_loss = self._loss(
201+
self.Y_targets[i],
202+
opt_pred,
203+
)
204+
opt_losses.append(optimization_loss)
205+
# number of optimization data points for this fold. Used for weighting
206+
# the average.
207+
opt_fold_weights.append(len(test_split))
208+
209+
# Compute weights of each fold based on the number of samples in each
210+
# fold.
211+
train_fold_weights = [w / sum(train_fold_weights) for w in train_fold_weights]
212+
opt_fold_weights = [w / sum(opt_fold_weights) for w in opt_fold_weights]
213+
214+
# train_losses is a list of either scalars or dicts. If it contains dicts,
215+
# then train_loss is computed using the target metric (self.metric).
216+
if all(isinstance(elem, dict) for elem in train_losses):
217+
train_loss = np.average([train_losses[i][str(self.metric)]
218+
for i in range(self.cv_folds)],
219+
weights=train_fold_weights,
220+
)
221+
else:
222+
train_loss = np.average(train_losses, weights=train_fold_weights)
223+
224+
# If self.all_scoring_functions is True, return a dict of opt_loss. Otherwise,
225+
# return a scalar.
226+
if self.all_scoring_functions is True:
227+
opt_loss = {}
228+
for metric in opt_losses[0].keys():
229+
opt_loss[metric] = np.average([opt_losses[i][metric]
230+
for i in range(self.cv_folds)],
231+
weights=opt_fold_weights,
232+
)
233+
else:
234+
opt_loss = np.average(opt_losses, weights=opt_fold_weights)
235+
182236
Y_targets = self.Y_targets
183237
Y_train_targets = self.Y_train_targets
184238

185-
Y_train_pred_full = np.array(
186-
[
187-
np.ones(
188-
(self.Y_train.shape[0], Y_train_pred[i].shape[1])
189-
) * np.NaN
190-
for _ in range(self.cv_folds) if Y_train_pred[i] is not None
191-
]
192-
)
193-
for i in range(self.cv_folds):
194-
if Y_train_pred[i] is None:
195-
continue
196-
Y_train_pred_full[i][train_splits[i]] = Y_train_pred[i]
197-
Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)
198-
if self.cv_folds == 1:
199-
Y_train_pred = Y_train_pred[
200-
# if the first column is np.NaN, all other columns have
201-
# to be np.NaN as well
202-
np.isfinite(Y_train_pred[:, 0])
203-
]
204-
205-
206239
Y_optimization_pred = np.concatenate(
207240
[Y_optimization_pred[i] for i in range(self.cv_folds)
208241
if Y_optimization_pred[i] is not None])
@@ -240,8 +273,8 @@ def fit_predict_and_loss(self, iterative=False):
240273
self._added_empty_model = True
241274

242275
self.finish_up(
243-
loss=loss,
244-
train_pred=Y_train_pred,
276+
loss=opt_loss,
277+
train_loss=train_loss,
245278
opt_pred=Y_optimization_pred,
246279
valid_pred=Y_valid_pred,
247280
test_pred=Y_test_pred,
@@ -282,6 +315,7 @@ def partial_fit_predict_and_loss(self, fold, iterative=False):
282315
iterative=iterative,
283316
)
284317
)
318+
train_loss = self._loss(self.Y_actual_train, train_pred)
285319
loss = self._loss(self.Y_targets[fold], opt_pred)
286320

287321
if self.cv_folds > 1:
@@ -292,7 +326,7 @@ def partial_fit_predict_and_loss(self, fold, iterative=False):
292326

293327
self.finish_up(
294328
loss=loss,
295-
train_pred=train_pred,
329+
train_loss=train_loss,
296330
opt_pred=opt_pred,
297331
valid_pred=valid_pred,
298332
test_pred=test_pred,
@@ -345,6 +379,9 @@ def _partial_fit_and_predict(self, fold, train_indices, test_indices,
345379
if self.cv_folds == 1:
346380
self.model = model
347381

382+
train_loss = self._loss(self.Y_train[train_indices],
383+
Y_train_pred,
384+
)
348385
loss = self._loss(self.Y_train[test_indices], Y_optimization_pred)
349386
additional_run_info = model.get_additional_run_info()
350387

@@ -354,7 +391,7 @@ def _partial_fit_and_predict(self, fold, train_indices, test_indices,
354391
final_call = False
355392
self.finish_up(
356393
loss=loss,
357-
train_pred=Y_train_pred,
394+
train_loss=train_loss,
358395
opt_pred=Y_optimization_pred,
359396
valid_pred=Y_valid_pred,
360397
test_pred=Y_test_pred,
@@ -386,11 +423,14 @@ def _partial_fit_and_predict(self, fold, train_indices, test_indices,
386423
train_indices=train_indices,
387424
test_indices=test_indices
388425
)
426+
train_loss = self._loss(self.Y_train[train_indices],
427+
Y_train_pred,
428+
)
389429
loss = self._loss(self.Y_train[test_indices], Y_optimization_pred)
390430
additional_run_info = model.get_additional_run_info()
391431
self.finish_up(
392432
loss=loss,
393-
train_pred=Y_train_pred,
433+
train_loss=train_loss,
394434
opt_pred=Y_optimization_pred,
395435
valid_pred=Y_valid_pred,
396436
test_pred=Y_test_pred,

0 commit comments

Comments
 (0)