Skip to content

Commit 66d9f09

Browse files
committed
FIXES #517 - add seed to ensemble builder
1 parent 8bdcba1 commit 66d9f09

File tree

3 files changed

+81
-63
lines changed

3 files changed

+81
-63
lines changed

autosklearn/automl.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,7 @@ def _get_ensemble_process(self, time_left_for_ensembles,
651651
max_iterations=max_iterations,
652652
read_at_most=np.inf,
653653
memory_limit=self._ensemble_memory_limit,
654+
random_state=self._seed,
654655
)
655656

656657
def _load_models(self):

autosklearn/ensemble_builder.py

Lines changed: 67 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@
66
import re
77
import time
88
import traceback
9+
from typing import Optional, Union
910

1011
import numpy as np
1112
import pynisher
13+
from sklearn.utils.validation import check_random_state
1214

1315
from autosklearn.util.backend import Backend
1416
from autosklearn.constants import BINARY_CLASSIFICATION
15-
from autosklearn.metrics import calculate_score
17+
from autosklearn.metrics import calculate_score, Scorer
1618
from autosklearn.ensembles.ensemble_selection import EnsembleSelection
1719
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
1820
from autosklearn.util.logging_ import get_logger
@@ -28,7 +30,7 @@ def __init__(
2830
backend: Backend,
2931
dataset_name: str,
3032
task_type: int,
31-
metric: str,
33+
metric: Scorer,
3234
limit: int,
3335
ensemble_size: int=10,
3436
ensemble_nbest: int=100,
@@ -39,10 +41,11 @@ def __init__(
3941
sleep_duration: int=2,
4042
memory_limit: int=1000,
4143
read_at_most: int=5,
44+
random_state: Optional[Union[int, np.random.RandomState]]=None,
4245
):
4346
"""
4447
Constructor
45-
48+
4649
Parameters
4750
----------
4851
backend: util.backend.Backend
@@ -68,12 +71,12 @@ def __init__(
6871
maximal number of iterations to run this script
6972
(default None --> deactivated)
7073
precision: ["16","32","64","128"]
71-
precision of floats to read the predictions
74+
precision of floats to read the predictions
7275
sleep_duration: int
7376
duration of sleeping time between two iterations of this script (in sec)
7477
memory_limit: int
7578
memory limit in mb
76-
read_at_most: int
79+
read_at_most: int
7780
read at most n new prediction files in each iteration
7881
"""
7982

@@ -93,7 +96,8 @@ def __init__(
9396
self.sleep_duration = sleep_duration
9497
self.memory_limit = memory_limit
9598
self.read_at_most = read_at_most
96-
99+
self.random_state = check_random_state(random_state)
100+
97101
# part of the original training set
98102
# used to build the ensemble
99103
self.dir_ensemble = os.path.join(
@@ -120,7 +124,7 @@ def __init__(
120124

121125
self.start_time = 0
122126
self.model_fn_re = re.compile(r'_([0-9]*)_([0-9]*)\.npy')
123-
127+
124128
# already read prediction files
125129
# {"file name": {
126130
# "ens_score": float
@@ -167,7 +171,7 @@ def main(self):
167171

168172
self.start_time = time.time()
169173
iteration = 0
170-
174+
171175
while True:
172176

173177
#maximal number of iterations
@@ -176,29 +180,29 @@ def main(self):
176180
self.logger.info("Terminate ensemble building because of max iterations: %d of %d",
177181
self.max_iterations,
178182
iteration)
179-
break
180-
183+
break
184+
181185
used_time = time.time() - self.start_time
182186
self.logger.debug(
183187
'Starting iteration %d, time left: %f',
184188
iteration,
185189
self.time_limit - used_time,
186190
)
187-
191+
188192
# populates self.read_preds
189193
if not self.read_ensemble_preds():
190194
time.sleep(self.sleep_duration)
191195
continue
192-
196+
193197
selected_models = self.get_n_best_preds()
194198
if not selected_models: # nothing selected
195199
continue
196-
200+
197201
# populates predictions in self.read_preds
198202
# reduces selected models if file reading failed
199203
n_sel_valid, n_sel_test = self.\
200204
get_valid_test_preds(selected_keys=selected_models)
201-
205+
202206
selected_models_set = set(selected_models)
203207
if selected_models_set.intersection(n_sel_test):
204208
selected_models = list(selected_models_set.intersection(n_sel_test))
@@ -207,35 +211,35 @@ def main(self):
207211
else:
208212
# use selected_models only defined by ensemble data set
209213
pass
210-
214+
211215
# train ensemble
212216
ensemble = self.fit_ensemble(selected_keys=selected_models)
213-
217+
214218
if ensemble is not None:
215-
219+
216220
self.predict(set_="valid",
217221
ensemble=ensemble,
218222
selected_keys=n_sel_valid,
219223
n_preds=len(selected_models),
220224
index_run=iteration)
221225
# TODO if predictions fails, build the model again during the
222226
# next iteration!
223-
self.predict(set_="test",
224-
ensemble=ensemble,
225-
selected_keys=n_sel_test,
226-
n_preds=len(selected_models),
227+
self.predict(set_="test",
228+
ensemble=ensemble,
229+
selected_keys=n_sel_test,
230+
n_preds=len(selected_models),
227231
index_run=iteration)
228232
iteration += 1
229233
else:
230234
time.sleep(self.sleep_duration)
231-
235+
232236
def read_ensemble_preds(self):
233237
"""
234-
reading predictions on ensemble building data set;
238+
reading predictions on ensemble building data set;
235239
populates self.read_preds
236240
"""
237241
self.logger.debug("Read ensemble data set predictions")
238-
242+
239243
if self.y_true_ensemble is None:
240244
try:
241245
self.y_true_ensemble = self.backend.load_targets_ensemble()
@@ -245,12 +249,12 @@ def read_ensemble_preds(self):
245249
traceback.format_exc(),
246250
)
247251
return False
248-
252+
249253
# no validation predictions so far -- no dir
250254
if not os.path.isdir(self.dir_ensemble):
251255
self.logger.debug("No ensemble dataset prediction directory found")
252256
return False
253-
257+
254258
if self.shared_mode is False:
255259
pred_path = os.path.join(
256260
self.dir_ensemble,
@@ -267,23 +271,23 @@ def read_ensemble_preds(self):
267271
self.logger.debug("Found no prediction files on ensemble data set:"
268272
" %s" % pred_path)
269273
return False
270-
274+
271275
n_read_files = 0
272276
for y_ens_fn in y_ens_files:
273-
277+
274278
if self.read_at_most and n_read_files >= self.read_at_most:
275-
# limit the number of files that will be read
279+
# limit the number of files that will be read
276280
# to limit memory consumption
277281
break
278-
282+
279283
if not y_ens_fn.endswith(".npy"):
280284
self.logger.info('Error loading file (not .npy): %s', y_ens_fn)
281285
continue
282-
286+
283287
match = self.model_fn_re.search(y_ens_fn)
284288
_seed = int(match.group(1))
285289
_num_run = int(match.group(2))
286-
290+
287291
if not self.read_preds.get(y_ens_fn):
288292
self.read_preds[y_ens_fn] = {
289293
"ens_score": -1,
@@ -301,7 +305,7 @@ def read_ensemble_preds(self):
301305
# 2 - loaded but dropped again
302306
"loaded": 0
303307
}
304-
308+
305309
if self.read_preds[y_ens_fn]["mtime_ens"] == os.path.getmtime(y_ens_fn):
306310
# same time stamp; nothing changed;
307311
continue
@@ -351,13 +355,13 @@ def read_ensemble_preds(self):
351355
np.sum([pred["loaded"] > 0 for pred in self.read_preds.values()])
352356
)
353357
return True
354-
358+
355359
def get_n_best_preds(self):
356360
"""
357361
get best n predictions (i.e., keys of self.read_preds)
358-
according to score on "ensemble set"
362+
according to score on "ensemble set"
359363
n: self.ensemble_nbest
360-
364+
361365
Side effect: delete predictions of non-winning models
362366
"""
363367

@@ -377,7 +381,7 @@ def get_n_best_preds(self):
377381
sorted_keys = filter(lambda x: x[1] > dummy_score[1], sorted_keys)
378382
# remove Dummy Classifier
379383
sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys))
380-
if not sorted_keys:
384+
if not sorted_keys:
381385
# no model left; try to use dummy score (num_run==0)
382386
self.logger.warning("No models better than random - "
383387
"using Dummy Score!")
@@ -446,7 +450,7 @@ def get_valid_test_preds(self, selected_keys: list):
446450
"""
447451
success_keys_valid = []
448452
success_keys_test = []
449-
453+
450454
for k in selected_keys:
451455
valid_fn = glob.glob(
452456
os.path.join(self.dir_valid, 'predictions_valid_%d_%d.npy'
@@ -456,7 +460,7 @@ def get_valid_test_preds(self, selected_keys: list):
456460
os.path.join(self.dir_test, 'predictions_test_%d_%d.npy' %
457461
(self.read_preds[k]["seed"],
458462
self.read_preds[k]["num_run"])))
459-
463+
460464
# TODO don't read valid and test if not changed
461465
if len(valid_fn) == 0:
462466
# self.logger.debug("Not found validation prediction file "
@@ -478,7 +482,7 @@ def get_valid_test_preds(self, selected_keys: list):
478482
except Exception as e:
479483
self.logger.warning('Error loading %s: %s',
480484
valid_fn, traceback.format_exc())
481-
485+
482486
if len(test_fn) == 0:
483487
# self.logger.debug("Not found test prediction file (although "
484488
# "ensemble predictions available):%s" %
@@ -500,18 +504,18 @@ def get_valid_test_preds(self, selected_keys: list):
500504
except Exception as e:
501505
self.logger.warning('Error loading %s: %s',
502506
test_fn, traceback.format_exc())
503-
507+
504508
return success_keys_valid, success_keys_test
505-
509+
506510
def fit_ensemble(self, selected_keys:list):
507511
"""
508-
fit ensemble
509-
512+
fit ensemble
513+
510514
Parameters
511515
----------
512516
selected_keys: list
513517
list of selected keys of self.read_preds
514-
518+
515519
Returns
516520
-------
517521
ensemble: EnsembleSelection
@@ -520,7 +524,7 @@ def fit_ensemble(self, selected_keys:list):
520524

521525
predictions_train = np.array([self.read_preds[k][Y_ENSEMBLE] for k in selected_keys])
522526
include_num_runs = [(self.read_preds[k]["seed"], self.read_preds[k]["num_run"]) for k in selected_keys]
523-
527+
524528
# check hash if ensemble training data changed
525529
current_hash = hash(predictions_train.data.tobytes())
526530
if self.last_hash == current_hash:
@@ -531,11 +535,14 @@ def fit_ensemble(self, selected_keys:list):
531535
)
532536
return None
533537
self.last_hash = current_hash
534-
535-
ensemble = EnsembleSelection(ensemble_size=self.ensemble_size,
536-
task_type=self.task_type,
537-
metric=self.metric)
538-
538+
539+
ensemble = EnsembleSelection(
540+
ensemble_size=self.ensemble_size,
541+
task_type=self.task_type,
542+
metric=self.metric,
543+
random_state=self.random_state,
544+
)
545+
539546
try:
540547
self.logger.debug(
541548
"Fitting the ensemble on %d models.",
@@ -563,17 +570,17 @@ def fit_ensemble(self, selected_keys:list):
563570
self.logger.error('Caught IndexError: %s' + traceback.format_exc())
564571
time.sleep(self.sleep_duration)
565572
return None
566-
573+
567574
return ensemble
568-
575+
569576
def predict(self, set_: str,
570577
ensemble: AbstractEnsemble,
571-
selected_keys: list,
572-
n_preds:int,
578+
selected_keys: list,
579+
n_preds:int,
573580
index_run:int):
574581
"""
575582
save predictions on ensemble, validation and test data on disc
576-
583+
577584
Parameters
578585
----------
579586
set_: ["valid","test"]
@@ -587,13 +594,13 @@ def predict(self, set_: str,
587594
same number of predictions on valid and test are necessary
588595
index_run: int
589596
n-th time that ensemble predictions are written to disc
590-
597+
591598
Returns
592599
-------
593600
y: np.ndarray
594601
"""
595602
self.logger.debug("Predicting the %s set with the ensemble!", set_)
596-
603+
597604
# Save the ensemble for later use in the main auto-sklearn module!
598605
if self.SAVE2DISC:
599606
self.backend.save_ensemble(ensemble, index_run, self.seed)
@@ -602,7 +609,7 @@ def predict(self, set_: str,
602609
self.read_preds[k][Y_VALID if set_ == 'valid' else Y_TEST]
603610
for k in selected_keys
604611
])
605-
612+
606613
if n_preds == predictions.shape[0]:
607614
y = ensemble.predict(predictions)
608615
if self.task_type == BINARY_CLASSIFICATION:
@@ -626,7 +633,7 @@ def predict(self, set_: str,
626633
)
627634
return None
628635
# TODO: ADD saving of predictions on "ensemble data"
629-
636+
630637
def _read_np_fn(self, fp):
631638
if self.precision is "16":
632639
predictions = np.load(fp).astype(dtype=np.float16)

0 commit comments

Comments
 (0)