66import re
77import time
88import traceback
9+ from typing import Optional , Union
910
1011import numpy as np
1112import pynisher
13+ from sklearn .utils .validation import check_random_state
1214
1315from autosklearn .util .backend import Backend
1416from autosklearn .constants import BINARY_CLASSIFICATION
15- from autosklearn .metrics import calculate_score
17+ from autosklearn .metrics import calculate_score , Scorer
1618from autosklearn .ensembles .ensemble_selection import EnsembleSelection
1719from autosklearn .ensembles .abstract_ensemble import AbstractEnsemble
1820from autosklearn .util .logging_ import get_logger
@@ -28,7 +30,7 @@ def __init__(
2830 backend : Backend ,
2931 dataset_name : str ,
3032 task_type : int ,
31- metric : str ,
33+ metric : Scorer ,
3234 limit : int ,
3335 ensemble_size : int = 10 ,
3436 ensemble_nbest : int = 100 ,
@@ -39,10 +41,11 @@ def __init__(
3941 sleep_duration : int = 2 ,
4042 memory_limit : int = 1000 ,
4143 read_at_most : int = 5 ,
44+ random_state : Optional [Union [int , np .random .RandomState ]]= None ,
4245 ):
4346 """
4447 Constructor
45-
48+
4649 Parameters
4750 ----------
4851 backend: util.backend.Backend
@@ -68,12 +71,12 @@ def __init__(
6871 maximal number of iterations to run this script
6972 (default None --> deactivated)
7073 precision: ["16","32","64","128"]
71- precision of floats to read the predictions
74+ precision of floats to read the predictions
7275 sleep_duration: int
7376 duration of sleeping time between two iterations of this script (in sec)
7477 memory_limit: int
7578 memory limit in mb
76- read_at_most: int
79+ read_at_most: int
7780 read at most n new prediction files in each iteration
7881 """
7982
@@ -93,7 +96,8 @@ def __init__(
9396 self .sleep_duration = sleep_duration
9497 self .memory_limit = memory_limit
9598 self .read_at_most = read_at_most
96-
99+ self .random_state = check_random_state (random_state )
100+
97101 # part of the original training set
98102 # used to build the ensemble
99103 self .dir_ensemble = os .path .join (
@@ -120,7 +124,7 @@ def __init__(
120124
121125 self .start_time = 0
122126 self .model_fn_re = re .compile (r'_([0-9]*)_([0-9]*)\.npy' )
123-
127+
124128 # already read prediction files
125129 # {"file name": {
126130 # "ens_score": float
@@ -167,7 +171,7 @@ def main(self):
167171
168172 self .start_time = time .time ()
169173 iteration = 0
170-
174+
171175 while True :
172176
173177 #maximal number of iterations
@@ -176,29 +180,29 @@ def main(self):
176180 self .logger .info ("Terminate ensemble building because of max iterations: %d of %d" ,
177181 self .max_iterations ,
178182 iteration )
179- break
180-
183+ break
184+
181185 used_time = time .time () - self .start_time
182186 self .logger .debug (
183187 'Starting iteration %d, time left: %f' ,
184188 iteration ,
185189 self .time_limit - used_time ,
186190 )
187-
191+
188192 # populates self.read_preds
189193 if not self .read_ensemble_preds ():
190194 time .sleep (self .sleep_duration )
191195 continue
192-
196+
193197 selected_models = self .get_n_best_preds ()
194198 if not selected_models : # nothing selected
195199 continue
196-
200+
197201 # populates predictions in self.read_preds
198202 # reduces selected models if file reading failed
199203 n_sel_valid , n_sel_test = self .\
200204 get_valid_test_preds (selected_keys = selected_models )
201-
205+
202206 selected_models_set = set (selected_models )
203207 if selected_models_set .intersection (n_sel_test ):
204208 selected_models = list (selected_models_set .intersection (n_sel_test ))
@@ -207,35 +211,35 @@ def main(self):
207211 else :
208212 # use selected_models only defined by ensemble data set
209213 pass
210-
214+
211215 # train ensemble
212216 ensemble = self .fit_ensemble (selected_keys = selected_models )
213-
217+
214218 if ensemble is not None :
215-
219+
216220 self .predict (set_ = "valid" ,
217221 ensemble = ensemble ,
218222 selected_keys = n_sel_valid ,
219223 n_preds = len (selected_models ),
220224 index_run = iteration )
221225 # TODO if predictions fails, build the model again during the
222226 # next iteration!
223- self .predict (set_ = "test" ,
224- ensemble = ensemble ,
225- selected_keys = n_sel_test ,
226- n_preds = len (selected_models ),
227+ self .predict (set_ = "test" ,
228+ ensemble = ensemble ,
229+ selected_keys = n_sel_test ,
230+ n_preds = len (selected_models ),
227231 index_run = iteration )
228232 iteration += 1
229233 else :
230234 time .sleep (self .sleep_duration )
231-
235+
232236 def read_ensemble_preds (self ):
233237 """
234- reading predictions on ensemble building data set;
238+ reading predictions on ensemble building data set;
235239 populates self.read_preds
236240 """
237241 self .logger .debug ("Read ensemble data set predictions" )
238-
242+
239243 if self .y_true_ensemble is None :
240244 try :
241245 self .y_true_ensemble = self .backend .load_targets_ensemble ()
@@ -245,12 +249,12 @@ def read_ensemble_preds(self):
245249 traceback .format_exc (),
246250 )
247251 return False
248-
252+
249253 # no validation predictions so far -- no dir
250254 if not os .path .isdir (self .dir_ensemble ):
251255 self .logger .debug ("No ensemble dataset prediction directory found" )
252256 return False
253-
257+
254258 if self .shared_mode is False :
255259 pred_path = os .path .join (
256260 self .dir_ensemble ,
@@ -267,23 +271,23 @@ def read_ensemble_preds(self):
267271 self .logger .debug ("Found no prediction files on ensemble data set:"
268272 " %s" % pred_path )
269273 return False
270-
274+
271275 n_read_files = 0
272276 for y_ens_fn in y_ens_files :
273-
277+
274278 if self .read_at_most and n_read_files >= self .read_at_most :
275- # limit the number of files that will be read
279+ # limit the number of files that will be read
276280 # to limit memory consumption
277281 break
278-
282+
279283 if not y_ens_fn .endswith (".npy" ):
280284 self .logger .info ('Error loading file (not .npy): %s' , y_ens_fn )
281285 continue
282-
286+
283287 match = self .model_fn_re .search (y_ens_fn )
284288 _seed = int (match .group (1 ))
285289 _num_run = int (match .group (2 ))
286-
290+
287291 if not self .read_preds .get (y_ens_fn ):
288292 self .read_preds [y_ens_fn ] = {
289293 "ens_score" : - 1 ,
@@ -301,7 +305,7 @@ def read_ensemble_preds(self):
301305 # 2 - loaded but dropped again
302306 "loaded" : 0
303307 }
304-
308+
305309 if self .read_preds [y_ens_fn ]["mtime_ens" ] == os .path .getmtime (y_ens_fn ):
306310 # same time stamp; nothing changed;
307311 continue
@@ -351,13 +355,13 @@ def read_ensemble_preds(self):
351355 np .sum ([pred ["loaded" ] > 0 for pred in self .read_preds .values ()])
352356 )
353357 return True
354-
358+
355359 def get_n_best_preds (self ):
356360 """
357361 get best n predictions (i.e., keys of self.read_preds)
358- according to score on "ensemble set"
362+ according to score on "ensemble set"
359363 n: self.ensemble_nbest
360-
364+
361365 Side effect: delete predictions of non-winning models
362366 """
363367
@@ -377,7 +381,7 @@ def get_n_best_preds(self):
377381 sorted_keys = filter (lambda x : x [1 ] > dummy_score [1 ], sorted_keys )
378382 # remove Dummy Classifier
379383 sorted_keys = list (filter (lambda x : x [2 ] > 1 , sorted_keys ))
380- if not sorted_keys :
384+ if not sorted_keys :
381385 # no model left; try to use dummy score (num_run==0)
382386 self .logger .warning ("No models better than random - "
383387 "using Dummy Score!" )
@@ -446,7 +450,7 @@ def get_valid_test_preds(self, selected_keys: list):
446450 """
447451 success_keys_valid = []
448452 success_keys_test = []
449-
453+
450454 for k in selected_keys :
451455 valid_fn = glob .glob (
452456 os .path .join (self .dir_valid , 'predictions_valid_%d_%d.npy'
@@ -456,7 +460,7 @@ def get_valid_test_preds(self, selected_keys: list):
456460 os .path .join (self .dir_test , 'predictions_test_%d_%d.npy' %
457461 (self .read_preds [k ]["seed" ],
458462 self .read_preds [k ]["num_run" ])))
459-
463+
460464 # TODO don't read valid and test if not changed
461465 if len (valid_fn ) == 0 :
462466 # self.logger.debug("Not found validation prediction file "
@@ -478,7 +482,7 @@ def get_valid_test_preds(self, selected_keys: list):
478482 except Exception as e :
479483 self .logger .warning ('Error loading %s: %s' ,
480484 valid_fn , traceback .format_exc ())
481-
485+
482486 if len (test_fn ) == 0 :
483487 # self.logger.debug("Not found test prediction file (although "
484488 # "ensemble predictions available):%s" %
@@ -500,18 +504,18 @@ def get_valid_test_preds(self, selected_keys: list):
500504 except Exception as e :
501505 self .logger .warning ('Error loading %s: %s' ,
502506 test_fn , traceback .format_exc ())
503-
507+
504508 return success_keys_valid , success_keys_test
505-
509+
506510 def fit_ensemble (self , selected_keys :list ):
507511 """
508- fit ensemble
509-
512+ fit ensemble
513+
510514 Parameters
511515 ----------
512516 selected_keys: list
513517 list of selected keys of self.read_preds
514-
518+
515519 Returns
516520 -------
517521 ensemble: EnsembleSelection
@@ -520,7 +524,7 @@ def fit_ensemble(self, selected_keys:list):
520524
521525 predictions_train = np .array ([self .read_preds [k ][Y_ENSEMBLE ] for k in selected_keys ])
522526 include_num_runs = [(self .read_preds [k ]["seed" ], self .read_preds [k ]["num_run" ]) for k in selected_keys ]
523-
527+
524528 # check hash if ensemble training data changed
525529 current_hash = hash (predictions_train .data .tobytes ())
526530 if self .last_hash == current_hash :
@@ -531,11 +535,14 @@ def fit_ensemble(self, selected_keys:list):
531535 )
532536 return None
533537 self .last_hash = current_hash
534-
535- ensemble = EnsembleSelection (ensemble_size = self .ensemble_size ,
536- task_type = self .task_type ,
537- metric = self .metric )
538-
538+
539+ ensemble = EnsembleSelection (
540+ ensemble_size = self .ensemble_size ,
541+ task_type = self .task_type ,
542+ metric = self .metric ,
543+ random_state = self .random_state ,
544+ )
545+
539546 try :
540547 self .logger .debug (
541548 "Fitting the ensemble on %d models." ,
@@ -563,17 +570,17 @@ def fit_ensemble(self, selected_keys:list):
563570 self .logger .error ('Caught IndexError: %s' + traceback .format_exc ())
564571 time .sleep (self .sleep_duration )
565572 return None
566-
573+
567574 return ensemble
568-
575+
569576 def predict (self , set_ : str ,
570577 ensemble : AbstractEnsemble ,
571- selected_keys : list ,
572- n_preds :int ,
578+ selected_keys : list ,
579+ n_preds :int ,
573580 index_run :int ):
574581 """
575582 save predictions on ensemble, validation and test data on disc
576-
583+
577584 Parameters
578585 ----------
579586 set_: ["valid","test"]
@@ -587,13 +594,13 @@ def predict(self, set_: str,
587594 same number of predictions on valid and test are necessary
588595 index_run: int
589596 n-th time that ensemble predictions are written to disc
590-
597+
591598 Returns
592599 -------
593600 y: np.ndarray
594601 """
595602 self .logger .debug ("Predicting the %s set with the ensemble!" , set_ )
596-
603+
597604 # Save the ensemble for later use in the main auto-sklearn module!
598605 if self .SAVE2DISC :
599606 self .backend .save_ensemble (ensemble , index_run , self .seed )
@@ -602,7 +609,7 @@ def predict(self, set_: str,
602609 self .read_preds [k ][Y_VALID if set_ == 'valid' else Y_TEST ]
603610 for k in selected_keys
604611 ])
605-
612+
606613 if n_preds == predictions .shape [0 ]:
607614 y = ensemble .predict (predictions )
608615 if self .task_type == BINARY_CLASSIFICATION :
@@ -626,7 +633,7 @@ def predict(self, set_: str,
626633 )
627634 return None
628635 # TODO: ADD saving of predictions on "ensemble data"
629-
636+
630637 def _read_np_fn (self , fp ):
631638 if self .precision is "16" :
632639 predictions = np .load (fp ).astype (dtype = np .float16 )
0 commit comments