Commit ffead2b

Mb disc space limit for ensemble (#874)

* Mb disc space limit for ensemble
* track disc consumption
* Solved artifacts of rebase
* py3.5 compatible print message
* Don't be pessimistic in Gb calc
* Incorporate comments
* Handle failure cases in ensemble disk space
1 parent 536a16c commit ffead2b
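In short, max_models_on_disc now accepts three kinds of values. A minimal sketch of the distinction, built from the constructor arguments that appear in the tests below (the backend object is assumed to exist; the concrete values are illustrative, not part of this commit):

    from autosklearn.ensemble_builder import EnsembleBuilder
    from autosklearn.metrics import roc_auc

    # None  -> feature disabled, no limit on stored models
    # int   -> keep at most that many models on disc
    # float -> a megabyte budget; worst-performing models are
    #          deleted until the candidates fit within it
    builder = EnsembleBuilder(
        backend=backend,            # assumed: an auto-sklearn Backend instance
        dataset_name="TEST",
        task_type=1,                # binary classification
        metric=roc_auc,
        limit=-1,
        seed=0,
        ensemble_nbest=4,
        max_models_on_disc=800.0,   # e.g. an 800 MB disc budget
    )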

6 files changed (+162 additions, -12 deletions)


autosklearn/ensemble_builder.py

Lines changed: 96 additions & 10 deletions
@@ -1,4 +1,5 @@
 # -*- encoding: utf-8 -*-
+import math
 import numbers
 import multiprocessing
 import glob
@@ -71,7 +72,13 @@ def __init__(
             If performance_range_threshold > 0, might return less models
         max_models_on_disc: int
             Defines the maximum number of models that are kept in the disc.
-            If int, it must be greater or equal than 1. If None, feature is disabled.
+            If int, it must be greater than or equal to 1, and dictates the
+            maximum number of models to keep.
+            If float, it is interpreted as the maximum megabytes of disc space
+            allowed. That is, if the ensemble candidates require more disc
+            space than this value, the models and predictions of the
+            worst-performing models are deleted to keep within this budget.
+            If None, the feature is disabled.
             It defines an upper bound on the models that can be used in the ensemble.
         performance_range_threshold: float
             Keep only models that are better than:
@@ -118,11 +125,17 @@ def __init__(
 
         self.ensemble_nbest = ensemble_nbest
 
-        if max_models_on_disc is not None and max_models_on_disc < 1:
+        # max_models_on_disc can be a float, in such case we need to
+        # remember the user specified Megabytes and translate this to
+        # max number of ensemble models. max_resident_models keeps the
+        # maximum number of models in disc
+        if max_models_on_disc is not None and max_models_on_disc < 0:
             raise ValueError(
                 "max_models_on_disc has to be a positive number or None"
             )
         self.max_models_on_disc = max_models_on_disc
+        self.max_resident_models = None
+
         self.seed = seed
         self.shared_mode = shared_mode  # pSMAC?
         self.max_iterations = max_iterations
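Note the relaxed bound above: because a float budget smaller than one megabyte (or even 0.0, as the last test below exercises) is legitimate, only negative values are rejected now, whereas the old check demanded values >= 1. A quick illustration of what the new check accepts (values hypothetical):

    for value in (None, 1, 5, 0.5, 0.0, 800.0):
        assert value is None or value >= 0   # all accepted after this change

    # EnsembleBuilder(..., max_models_on_disc=-1) would raise
    # ValueError("max_models_on_disc has to be a positive number or None")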
@@ -319,6 +332,38 @@ def main(self, return_pred=False):
         if return_pred:
             return valid_pred, test_pred
 
+    def get_disk_consumption(self, pred_path):
+        """
+        gets the cost of a model being on disc
+        """
+
+        match = self.model_fn_re.search(pred_path)
+        if not match:
+            raise ValueError("Invalid path format %s" % pred_path)
+        _full_name = match.group(0)
+        _seed = match.group(1)
+        _num_run = match.group(2)
+        _budget = match.group(3)
+
+        # Besides the prediction, we have to take care of three other files:
+        # model, validation and test.
+        model_name = '%s.%s.%s.model' % (_seed, _num_run, _budget)
+        model_path = os.path.join(self.dir_models, model_name)
+        pred_valid_name = 'predictions_valid' + _full_name
+        pred_valid_path = os.path.join(self.dir_valid, pred_valid_name)
+        pred_test_name = 'predictions_test' + _full_name
+        pred_test_path = os.path.join(self.dir_test, pred_test_name)
+
+        paths = [model_path, pred_path]
+        if os.path.exists(pred_valid_path):
+            paths.append(pred_valid_path)
+        if os.path.exists(pred_test_path):
+            paths.append(pred_test_path)
+        this_model_cost = sum([os.path.getsize(path) for path in paths])
+
+        # get the megabytes
+        return round(this_model_cost / math.pow(1024, 2), 2)
+
     def score_ensemble_preds(self):
         """
         score predictions on ensemble building data set;
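The conversion at the end is plain bytes-to-mebibytes arithmetic over whichever of the four files exist. A self-contained sketch of the same bookkeeping, detached from the class (file paths hypothetical):

    import math
    import os

    def disk_cost_mb(paths):
        # Sum the sizes of the files that exist, reported in MiB
        total_bytes = sum(os.path.getsize(p) for p in paths if os.path.exists(p))
        return round(total_bytes / math.pow(1024, 2), 2)

    # e.g. a 100 MiB model file plus a 100 MiB prediction file -> 200.0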
@@ -395,6 +440,7 @@ def score_ensemble_preds(self):
                     "seed": _seed,
                     "num_run": _num_run,
                     "budget": _budget,
+                    "disc_space_cost_mb": None,
                     Y_ENSEMBLE: None,
                     Y_VALID: None,
                     Y_TEST: None,
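Each read_preds entry thus reserves a disc_space_cost_mb field next to its score, filled in during scoring below; the hand-built entries in the new test at the bottom of this diff have the same shape. Illustrative values (not from the commit):

    read_preds_entry = {
        "ens_score": 0.9,             # hypothetical validation score
        "num_run": 3,
        "seed": 1,
        "loaded": 2,
        "disc_space_cost_mb": 400.0,  # filled in by get_disk_consumption
    }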
@@ -438,6 +484,9 @@ def score_ensemble_preds(self):
                         y_ens_fn
                     )
                 self.read_preds[y_ens_fn]["loaded"] = 2
+                self.read_preds[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption(
+                    y_ens_fn
+                )
 
                 n_read_files += 1

@@ -514,14 +563,51 @@ def get_n_best_preds(self):
         self.logger.debug("Library Pruning: using for ensemble only "
                           " %d (out of %d) models" % (keep_nbest, len(sorted_keys)))
 
+        # If max_models_on_disc is None, do nothing
         # One can only read at most max_models_on_disc models
-        if self.max_models_on_disc is not None and keep_nbest > self.max_models_on_disc:
+        if self.max_models_on_disc is not None:
+            if not isinstance(self.max_models_on_disc, numbers.Integral):
+                consumption = [
+                    [
+                        v["ens_score"],
+                        v["disc_space_cost_mb"],
+                    ] for v in self.read_preds.values() if v["disc_space_cost_mb"] is not None
+                ]
+                max_consumption = max(i[1] for i in consumption)
+
+                # We are pessimistic with the consumption limit indicated by
+                # max_models_on_disc by 1 model. Such model is assumed to spend
+                # max_consumption megabytes
+                if (sum(i[1] for i in consumption) + max_consumption) > self.max_models_on_disc:
+
+                    # just leave the best -- higher is better!
+                    # This list is in descending order, to preserve the best models
+                    sorted_cum_consumption = np.cumsum([
+                        i[1] for i in list(reversed(sorted(consumption)))
+                    ])
+                    max_models = np.argmax(sorted_cum_consumption > self.max_models_on_disc)
+
+                    # Make sure that at least 1 model survives
+                    self.max_resident_models = max(1, max_models)
+                    self.logger.warning(
+                        "Limiting num of models via float max_models_on_disc={}"
+                        " as accumulated={} worst={} num_models={}".format(
+                            self.max_models_on_disc,
+                            (sum(i[1] for i in consumption) + max_consumption),
+                            max_consumption,
+                            self.max_resident_models
+                        )
+                    )
+            else:
+                self.max_resident_models = self.max_models_on_disc
+
+        if self.max_resident_models is not None and keep_nbest > self.max_resident_models:
             self.logger.debug(
                 "Restricting the number of models to %d instead of %d due to argument "
                 "max_models_on_disc",
-                self.max_models_on_disc, keep_nbest,
+                self.max_resident_models, keep_nbest,
             )
-            keep_nbest = self.max_models_on_disc
+            keep_nbest = self.max_resident_models
 
         # consider performance_range_threshold
         if self.performance_range_threshold > 0:
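The float branch above boils down to: sort candidates best-first, accumulate their disc usage, and cut at the first model whose running total breaks the budget. A toy rerun of that logic with made-up numbers:

    import numpy as np

    budget_mb = 250.0
    # hypothetical [ens_score, disc_space_cost_mb] pairs
    consumption = [[0.7, 120.0], [0.9, 80.0], [0.5, 200.0]]

    # best scores first, then the running disc total in that order
    ordered = list(reversed(sorted(consumption)))    # [[0.9, 80], [0.7, 120], [0.5, 200]]
    running = np.cumsum([mb for _, mb in ordered])   # [80, 200, 400]
    # index of the first entry over budget; np.argmax returns 0 when even
    # the very first model exceeds it, and max(1, ...) still keeps one alive
    max_models = max(1, int(np.argmax(running > budget_mb)))   # -> 2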
@@ -692,7 +778,7 @@ def fit_ensemble(self, selected_keys: list):
         )
 
         # Delete files of non-candidate models
-        if self.max_models_on_disc is not None:
+        if self.max_resident_models is not None:
             self._delete_excess_models()
 
         return None
@@ -734,7 +820,7 @@ def fit_ensemble(self, selected_keys: list):
             return None
 
         # Delete files of non-candidate models
-        if self.max_models_on_disc is not None:
+        if self.max_resident_models is not None:
             self._delete_excess_models()
 
         return ensemble
@@ -839,14 +925,14 @@ def _delete_excess_models(self):
         sorted_keys = self._get_list_of_sorted_preds()
         sorted_keys = list(map(lambda x: x[0], sorted_keys))
 
-        if len(sorted_keys) <= self.max_models_on_disc:
+        if len(sorted_keys) <= self.max_resident_models:
            # Don't waste time if not enough models to delete
            return
 
-        # The top self.max_models_on_disc models would be the candidates
+        # The top self.max_resident_models models would be the candidates
         # Any other low performance model will be deleted
         # The list is in ascending order of score
-        candidates = sorted_keys[:self.max_models_on_disc]
+        candidates = sorted_keys[:self.max_resident_models]
 
         # Loop through the files currently in the directory
         for pred_path in self.y_ens_files:
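The deletion pass is then a membership test: every scored key outside the leading max_resident_models slice has its model and prediction files removed. Schematically, with hypothetical keys:

    max_resident_models = 2
    sorted_keys = ['pred49', 'pred48', 'pred47']   # as returned by the sorter
    candidates = sorted_keys[:max_resident_models]
    for key in sorted_keys:
        if key not in candidates:
            print('would delete files for', key)   # os.remove(...) in the real code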

test/test_ensemble_builder/data/.auto-sklearn/models/0.1.0.0.model

Whitespace-only changes.

test/test_ensemble_builder/data/.auto-sklearn/models/0.2.0.0.model

Whitespace-only changes.

test/test_ensemble_builder/data/.auto-sklearn/models/0.3.0.0.model

Whitespace-only changes.

test/test_ensemble_builder/data/.auto-sklearn/models/0.3.100.0.model

Whitespace-only changes.

test/test_ensemble_builder/test_ensemble.py

Lines changed: 66 additions & 2 deletions
@@ -74,7 +74,7 @@ def testRead(self):
 
     @unittest.skipIf(sys.version_info[0:2] <= (3, 5), "Only works with Python > 3.5")
     def testNBest(self):
-        for ensemble_nbest, models_in_disc, exp in (
+        for ensemble_nbest, models_on_disc, exp in (
             (1, None, 1),
             (1.0, None, 2),
             (0.1, None, 1),
@@ -90,7 +90,7 @@ def testNBest(self):
                 limit=-1,  # not used,
                 seed=0,  # important to find the test files
                 ensemble_nbest=ensemble_nbest,
-                max_models_on_disc=models_in_disc,
+                max_models_on_disc=models_on_disc,
             )
 
             ensbuilder.score_ensemble_preds()
@@ -104,6 +104,70 @@ def testNBest(self):
             )
             self.assertEqual(sel_keys[0], fixture)
 
+    @unittest.skipIf(sys.version_info[0:2] <= (3, 5), "Only works with Python > 3.5")
+    def testMaxModelsOnDisc(self):
+
+        ensemble_nbest = 4
+        for (test_case, exp) in [
+            # If None, no reduction
+            (None, 2),
+            # If Int, limit only on exceed
+            (4, 2),
+            (1, 1),
+            # If Float, translate float to # models.
+            # below, mock of each file is 100 Mb and
+            # 4 files .model and .npy (test/val/pred) exist
+            (700.0, 1),
+            (800.0, 2),
+            (9999.0, 2),
+        ]:
+            ensbuilder = EnsembleBuilder(
+                backend=self.backend,
+                dataset_name="TEST",
+                task_type=1,  # Binary Classification
+                metric=roc_auc,
+                limit=-1,  # not used,
+                seed=0,  # important to find the test files
+                ensemble_nbest=ensemble_nbest,
+                max_models_on_disc=test_case,
+            )
+
+            with unittest.mock.patch('os.path.getsize') as mock:
+                mock.return_value = 100*1024*1024
+                ensbuilder.score_ensemble_preds()
+                sel_keys = ensbuilder.get_n_best_preds()
+                self.assertEqual(len(sel_keys), exp)
+
+        # Test for Extreme scenarios
+        # Make sure that the best predictions are kept
+        ensbuilder = EnsembleBuilder(
+            backend=self.backend,
+            dataset_name="TEST",
+            task_type=1,  # Binary Classification
+            metric=roc_auc,
+            limit=-1,  # not used,
+            seed=0,  # important to find the test files
+            ensemble_nbest=50,
+            max_models_on_disc=10000.0,
+        )
+        ensbuilder.read_preds = {}
+        for i in range(50):
+            ensbuilder.read_preds['pred' + str(i)] = {
+                'ens_score': i * 10,
+                'num_run': i,
+                0: True,
+                'loaded': 1,
+                "seed": 1,
+                "disc_space_cost_mb": 50 * i,
+            }
+        sel_keys = ensbuilder.get_n_best_preds()
+        self.assertListEqual(['pred49', 'pred48', 'pred47', 'pred46'], sel_keys)
+
+        # Make sure at least one model is kept alive
+        ensbuilder.max_models_on_disc = 0.0
+        sel_keys = ensbuilder.get_n_best_preds()
+        self.assertListEqual(['pred49'], sel_keys)
+
     @unittest.skipIf(sys.version_info[0:2] <= (3, 5), "Only works with Python > 3.5")
     def testPerformanceRangeThreshold(self):
         to_test = ((0.0, 4), (0.1, 4), (0.3, 3), (0.5, 2), (0.6, 2), (0.8, 1),
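The os.path.getsize patch is what makes the float thresholds in the table above deterministic: every file reports the same mocked size, so each model's cost is exactly 4 x 100 Mb per the comment in the test. The same pattern works for experimenting with other budgets:

    import unittest.mock

    with unittest.mock.patch('os.path.getsize') as mock_size:
        mock_size.return_value = 100 * 1024 * 1024   # every file claims 100 MiB
        # any code that stats files inside this context sees the fixed size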
