1919from autosklearn .data .data_manager_factory import get_data_manager
2020from autosklearn .data .competition_data_manager import CompetitionDataManager
2121from autosklearn .data .xy_data_manager import XYDataManager
22- from autosklearn .evaluation import resampling , eval_with_limits
22+ from autosklearn .evaluation import resampling , ExecuteTaFuncWithQueue
2323from autosklearn .evaluation import calculate_score
2424from autosklearn .util import StopWatch , get_logger , setup_logger , \
2525 pipeline
2626from autosklearn .ensemble_builder import EnsembleBuilder
2727from autosklearn .smbo import AutoMLSMBO
28+ from autosklearn .util .hash import hash_numpy_array
2829
2930
3031class AutoML (BaseEstimator ):
@@ -71,7 +72,8 @@ def __init__(self,
7172 self ._include_estimators = include_estimators
7273 self ._include_preprocessors = include_preprocessors
7374 self ._resampling_strategy = resampling_strategy
74- self ._resampling_strategy_arguments = resampling_strategy_arguments
75+ self ._resampling_strategy_arguments = resampling_strategy_arguments \
76+ if resampling_strategy_arguments is not None else {}
7577 self ._max_iter_smac = max_iter_smac
7678 #self.delete_tmp_folder_after_terminate = \
7779 # delete_tmp_folder_after_terminate
@@ -147,9 +149,7 @@ def fit(self, X, y,
147149 self ._backend .context .create_directories ()
148150
149151 if dataset_name is None :
150- m = hashlib .md5 ()
151- m .update (X .data )
152- dataset_name = m .hexdigest ()
152+ dataset_name = hash_numpy_array (X )
153153
154154 self ._backend .save_start_time (self ._seed )
155155 self ._stopwatch = StopWatch ()
@@ -232,37 +232,32 @@ def _print_load_time(basename, time_left_for_this_task,
def _do_dummy_prediction(self, datamanager, num_run):
    """Create the dummy prediction and return the next run number.

    Builds an ``ExecuteTaFuncWithQueue`` from this object's backend, seed,
    resampling strategy and logger (the resampling strategy arguments are
    forwarded as keyword arguments), executes a single run with it and
    logs whether that run succeeded.

    Parameters
    ----------
    datamanager :
        Not read by this method; kept so existing callers keep working.
    num_run : int
        Passed to the executor as ``initial_num_run``.

    Returns
    -------
    The executor's ``num_run`` attribute after the run.
    """
    self._logger.info("Starting to create dummy predictions.")
    memory_limit = int(self._ml_memory_limit)

    ta = ExecuteTaFuncWithQueue(
        backend=self._backend,
        autosklearn_seed=self._seed,
        resampling_strategy=self._resampling_strategy,
        initial_num_run=num_run,
        logger=self._logger,
        **self._resampling_strategy_arguments)

    # NOTE(review): the literal 1 presumably selects the dummy
    # configuration -- confirm against ExecuteTaFuncWithQueue.run.
    # The full task time is used as the cutoff for this single run.
    status, _cost, _runtime, additional_info = ta.run(
        1, cutoff=self._time_for_task, memory_limit=memory_limit)

    if status == StatusType.SUCCESS:
        self._logger.info("Finished creating dummy predictions.")
    else:
        # A failed dummy run is reported but does not abort fitting.
        self._logger.error('Error creating dummy predictions:%s ',
                           additional_info)

    return ta.num_run
266261
267262 def _fit (self , datamanager ):
268263 # Reset learnt stuff
@@ -374,7 +369,7 @@ def _fit(self, datamanager):
374369 if time_left_for_smac <= 0 :
375370 self ._logger .warning ("Not starting SMAC because there is no time "
376371 "left." )
377- self . _proc_smac = None
372+ _proc_smac = None
378373 else :
379374 if self ._per_run_time_limit is None or \
380375 self ._per_run_time_limit > time_left_for_smac :
@@ -385,25 +380,25 @@ def _fit(self, datamanager):
385380 else :
386381 per_run_time_limit = self ._per_run_time_limit
387382
388- self . _proc_smac = AutoMLSMBO (config_space = self .configuration_space ,
389- dataset_name = self ._dataset_name ,
390- backend = self ._backend ,
391- total_walltime_limit = time_left_for_smac ,
392- func_eval_time_limit = per_run_time_limit ,
393- memory_limit = self ._ml_memory_limit ,
394- data_memory_limit = self ._data_memory_limit ,
395- watcher = self ._stopwatch ,
396- start_num_run = num_run ,
397- num_metalearning_cfgs = self ._initial_configurations_via_metalearning ,
398- config_file = configspace_path ,
399- smac_iters = self ._max_iter_smac ,
400- seed = self ._seed ,
401- metadata_directory = self ._metadata_directory ,
402- resampling_strategy = self ._resampling_strategy ,
403- resampling_strategy_args = self ._resampling_strategy_arguments ,
404- acquisition_function = self .acquisition_function ,
405- shared_mode = self ._shared_mode )
406- self ._proc_smac .run_smbo ()
383+ _proc_smac = AutoMLSMBO (config_space = self .configuration_space ,
384+ dataset_name = self ._dataset_name ,
385+ backend = self ._backend ,
386+ total_walltime_limit = time_left_for_smac ,
387+ func_eval_time_limit = per_run_time_limit ,
388+ memory_limit = self ._ml_memory_limit ,
389+ data_memory_limit = self ._data_memory_limit ,
390+ watcher = self ._stopwatch ,
391+ start_num_run = num_run ,
392+ num_metalearning_cfgs = self ._initial_configurations_via_metalearning ,
393+ config_file = configspace_path ,
394+ smac_iters = self ._max_iter_smac ,
395+ seed = self ._seed ,
396+ metadata_directory = self ._metadata_directory ,
397+ resampling_strategy = self ._resampling_strategy ,
398+ resampling_strategy_args = self ._resampling_strategy_arguments ,
399+ acquisition_function = self .acquisition_function ,
400+ shared_mode = self ._shared_mode )
401+ self .runhistory_ = _proc_smac .run_smbo ()
407402
408403 self ._proc_ensemble = None
409404 self ._load_models ()
@@ -418,12 +413,25 @@ def refit(self, X, y):
418413 self .ensemble_ is None :
419414 self ._load_models ()
420415
416+ random_state = np .random .RandomState (self ._seed )
421417 for identifier in self .models_ :
422418 if identifier in self .ensemble_ .get_model_identifiers ():
423419 model = self .models_ [identifier ]
424420 # this updates the model inplace, it can then later be used in
425421 # predict method
426- model .fit (X .copy (), y .copy ())
422+
423+ # try to fit the model. If it fails, shuffle the data. This
424+ # could alleviate the problem in algorithms that depend on
425+ # the ordering of the data.
426+ for i in range (10 ):
427+ try :
428+ model .fit (X .copy (), y .copy ())
429+ break
430+ except ValueError :
431+ indices = list (range (X .shape [0 ]))
432+ random_state .shuffle (indices )
433+ X = X [indices ]
434+ y = y [indices ]
427435
428436 self ._can_predict = True
429437 return self
@@ -561,8 +569,8 @@ def grid_scores_(self):
561569 scores_per_config = defaultdict (list )
562570 config_list = list ()
563571
564- for run_key in self ._proc_smac . runhistory .data :
565- run_value = self ._proc_smac . runhistory .data [run_key ]
572+ for run_key in self .runhistory_ .data :
573+ run_value = self .runhistory_ .data [run_key ]
566574
567575 config_id = run_key .config_id
568576 cost = run_value .cost
@@ -575,7 +583,7 @@ def grid_scores_(self):
575583 for config_id in config_list :
576584 scores = [1 - score for score in scores_per_config [config_id ]]
577585 mean_score = np .mean (scores )
578- config = self ._proc_smac . runhistory .ids_config [config_id ]
586+ config = self .runhistory_ .ids_config [config_id ]
579587
580588 grid_score = _CVScoreTuple (config .get_dictionary (), mean_score ,
581589 scores )
@@ -616,10 +624,10 @@ def cv_results_(self):
616624 mean_fit_time = []
617625 params = []
618626 status = []
619- for run_key in self ._proc_smac . runhistory .data :
620- run_value = self ._proc_smac . runhistory .data [run_key ]
627+ for run_key in self .runhistory_ .data :
628+ run_value = self .runhistory_ .data [run_key ]
621629 config_id = run_key .config_id
622- config = self ._proc_smac . runhistory .ids_config [config_id ]
630+ config = self .runhistory_ .ids_config [config_id ]
623631
624632 param_dict = config .get_dictionary ()
625633 params .append (param_dict )
0 commit comments