 import numpy.ma as ma
 import scipy.stats
 from sklearn.base import BaseEstimator
+from sklearn.model_selection._split import _RepeatedSplits, \
+    BaseShuffleSplit, BaseCrossValidator
 from smac.tae.execute_ta_run import StatusType
 from smac.stats.stats import Stats
 from sklearn.externals import joblib
@@ -133,14 +135,17 @@ def __init__(self,
         # After assignging and checking variables...
         #self._backend = Backend(self._output_dir, self._tmp_dir)
 
-    def fit(self, X, y,
-            task=MULTICLASS_CLASSIFICATION,
-            metric=None,
-            feat_type=None,
-            dataset_name=None):
-        if not self._shared_mode:
-            self._backend.context.delete_directories()
-        else:
+    def fit(
+        self, X, y,
+        task,
+        metric,
+        X_test=None,
+        y_test=None,
+        feat_type=None,
+        dataset_name=None,
+        only_return_configuration_space=False,
+    ):
+        if self._shared_mode:
             # If this fails, it's likely that this is the first call to get
             # the data manager
             try:
@@ -149,8 +154,6 @@ def fit(self, X, y,
             except IOError:
                 pass
 
-        self._backend.context.create_directories()
-
         if dataset_name is None:
             dataset_name = hash_array_or_matrix(X)
 
@@ -181,13 +184,22 @@ def fit(self, X, y,
                                      'valid feature types, you passed `%s`' % ft)
 
         self._data_memory_limit = None
-        loaded_data_manager = XYDataManager(X, y,
-                                            task=task,
-                                            feat_type=feat_type,
-                                            dataset_name=dataset_name)
+        loaded_data_manager = XYDataManager(
+            X, y,
+            X_test=X_test,
+            y_test=y_test,
+            task=task,
+            feat_type=feat_type,
+            dataset_name=dataset_name,
+        )
 
-        return self._fit(loaded_data_manager, metric)
+        return self._fit(
+            loaded_data_manager,
+            metric,
+            only_return_configuration_space,
+        )
 
+    # TODO this is very old code which can be dropped!
     def fit_automl_dataset(self, dataset, metric):
         self._stopwatch = StopWatch()
         self._backend.save_start_time(self._seed)
@@ -280,7 +292,7 @@ def _do_dummy_prediction(self, datamanager, num_run):
 
         return ta.num_run
 
-    def _fit(self, datamanager, metric):
+    def _fit(self, datamanager, metric, only_return_configuration_space=False):
         # Reset learnt stuff
         self.models_ = None
         self.ensemble_ = None
@@ -296,9 +308,13 @@ def _fit(self, datamanager, metric):
                     raise ValueError("List member '%s' for argument "
                                      "'disable_evaluator_output' must be one "
                                      "of " + str(allowed_elements))
-        if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit',
-                                             'cv', 'partial-cv',
-                                             'partial-cv-iterative-fit']:
+        if self._resampling_strategy not in [
+                'holdout', 'holdout-iterative-fit',
+                'cv', 'partial-cv',
+                'partial-cv-iterative-fit'] \
+                and not issubclass(self._resampling_strategy, BaseCrossValidator)\
+                and not issubclass(self._resampling_strategy, _RepeatedSplits)\
+                and not issubclass(self._resampling_strategy, BaseShuffleSplit):
             raise ValueError('Illegal resampling strategy: %s' %
                              self._resampling_strategy)
         if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit'] \
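The relaxed check above allows `resampling_strategy` to be a scikit-learn splitter class (a subclass of `BaseCrossValidator`, `_RepeatedSplits` or `BaseShuffleSplit`) in addition to the string values; since `issubclass` is used, a class rather than an instance is expected. A minimal usage sketch at the estimator level, assuming the public wrapper forwards the class unchanged and accepts a `resampling_strategy_arguments` dict for the splitter's constructor arguments (both assumptions, not shown in this diff):

from sklearn.datasets import load_digits
from sklearn.model_selection import KFold
from autosklearn.classification import AutoSklearnClassifier

X, y = load_digits(return_X_y=True)

# Pass the splitter class itself (not KFold(...)), because the check above
# calls issubclass() on self._resampling_strategy.
automl = AutoSklearnClassifier(
    time_left_for_this_task=120,
    resampling_strategy=KFold,
    resampling_strategy_arguments={'n_splits': 5},  # assumed argument name
)
automl.fit(X, y)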
@@ -354,6 +370,8 @@ def _fit(self, datamanager, metric):
             exclude_estimators=self._exclude_estimators,
             include_preprocessors=self._include_preprocessors,
             exclude_preprocessors=self._exclude_preprocessors)
+        if only_return_configuration_space:
+            return self.configuration_space
 
         # == RUN ensemble builder
         # Do this before calculating the meta-features to make sure that the
@@ -367,9 +385,13 @@ def _fit(self, datamanager, metric):
         self._logger.info(
             'Start Ensemble with %5.2fsec time left' % time_left_for_ensembles)
         if time_left_for_ensembles <= 0:
-            self._logger.warning("Not starting ensemble builder because there "
-                                 "is no time left!")
             self._proc_ensemble = None
+            # Fit only raises error when ensemble_size is not zero but
+            # time_left_for_ensembles is zero.
+            if self._ensemble_size > 0:
+                raise ValueError("Not starting ensemble builder because there "
+                                 "is no time left. Try increasing the value "
+                                 "of time_left_for_this_task.")
         else:
             self._proc_ensemble = self._get_ensemble_process(time_left_for_ensembles)
             if self._ensemble_size > 0:
@@ -384,7 +406,7 @@ def _fit(self, datamanager, metric):
             del self._datamanager
         except Exception:
             pass
-
+
         # => RUN SMAC
         smac_task_name = 'runSMAC'
         self._stopwatch.start_task(smac_task_name)
@@ -465,14 +487,18 @@ def send_warnings_to_log(message, category, filename, lineno,
 
         if self._keep_models is not True:
             raise ValueError(
-                "Predict can only be called if 'keep_models==True'")
+                "Refit can only be called if 'keep_models==True'")
         if self.models_ is None or len(self.models_) == 0 or \
                 self.ensemble_ is None:
             self._load_models()
 
+        # Refit is not applicable when ensemble_size is set to zero.
+        if self.ensemble_ is None:
+            raise ValueError("Refit can only be called if 'ensemble_size != 0'")
+
         random_state = np.random.RandomState(self._seed)
         for identifier in self.models_:
-            if identifier in self.ensemble_.get_model_identifiers():
+            if identifier in self.ensemble_.get_selected_model_identifiers():
                 model = self.models_[identifier]
                 # this updates the model inplace, it can then later be used in
                 # predict method
@@ -528,11 +554,18 @@ def predict(self, X, batch_size=None, n_jobs=1):
                 self.ensemble_ is None:
             self._load_models()
 
+        # If self.ensemble_ is None, it means that ensemble_size is set to zero.
+        # In such cases, raise error because predict and predict_proba cannot
+        # be called.
+        if self.ensemble_ is None:
+            raise ValueError("Predict and predict_proba can only be called "
+                             "if 'ensemble_size != 0'")
+
         # Parallelize predictions across models with n_jobs processes.
         # Each process computes predictions in chunks of batch_size rows.
         all_predictions = joblib.Parallel(n_jobs=n_jobs)(
             joblib.delayed(_model_predict)(self, X, batch_size, identifier)
-            for identifier in self.ensemble_.get_model_identifiers())
+            for identifier in self.ensemble_.get_selected_model_identifiers())
 
         if len(all_predictions) == 0:
             raise ValueError('Something went wrong generating the predictions. '
@@ -559,6 +592,7 @@ def fit_ensemble(self, y, task=None, metric=None, precision='32',
             ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
         self._proc_ensemble.main()
         self._proc_ensemble = None
+        self._load_models()
         return self
 
     def _get_ensemble_process(self, time_left_for_ensembles,
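Because `fit_ensemble` now ends with `self._load_models()`, the freshly built ensemble can be used for prediction right away. A rough post-hoc ensembling sketch, assuming the public estimator exposes the `fit_ensemble` signature shown in this hunk (the constructor arguments are illustrative only):

from sklearn.datasets import load_digits
from autosklearn.classification import AutoSklearnClassifier

X, y = load_digits(return_X_y=True)

# Fit the individual models with ensembling disabled, build the ensemble
# afterwards, then predict; no manual reloading of models is needed.
automl = AutoSklearnClassifier(time_left_for_this_task=120, ensemble_size=0)
automl.fit(X, y)
automl.fit_ensemble(y, ensemble_size=50)
predictions = automl.predict(X)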
@@ -606,7 +640,8 @@ def _get_ensemble_process(self, time_left_for_ensembles,
                                seed=self._seed,
                                shared_mode=self._shared_mode,
                                precision=precision,
-                               max_iterations=max_iterations)
+                               max_iterations=max_iterations,
+                               read_at_most=np.inf)
 
     def _load_models(self):
         if self._shared_mode:
@@ -631,10 +666,10 @@ def _load_models(self):
                     ['partial-cv', 'partial-cv-iterative-fit']:
                 raise ValueError('No models fitted!')
 
-            self.models = []
+            self.models_ = []
 
         else:
-            self.models = []
+            self.models_ = []
 
     def score(self, X, y):
         # fix: Consider only index 1 of second dimension
@@ -747,10 +782,10 @@ def sprint_statistics(self):
         num_crash = sum([s == 'Crash' for s in cv_results['status']])
         sio.write('  Number of crashed target algorithm runs: %d\n' % num_crash)
         num_timeout = sum([s == 'Timeout' for s in cv_results['status']])
-        sio.write('  Number of target algorithms that exceeded the memory '
+        sio.write('  Number of target algorithms that exceeded the time '
                   'limit: %d\n' % num_timeout)
         num_memout = sum([s == 'Memout' for s in cv_results['status']])
-        sio.write('  Number of target algorithms that exceeded the time '
+        sio.write('  Number of target algorithms that exceeded the memory '
                   'limit: %d\n' % num_memout)
         return sio.getvalue()
 
@@ -810,7 +845,8 @@ def __init__(self, *args, **kwargs):
 
     def _perform_input_checks(self, X, y):
         X = self._check_X(X)
-        y = self._check_y(y)
+        if y is not None:
+            y = self._check_y(y)
         return X, y
 
     def _check_X(self, X):
@@ -864,12 +900,21 @@ def __init__(self, *args, **kwargs):
                               'multiclass': MULTICLASS_CLASSIFICATION,
                               'binary': BINARY_CLASSIFICATION}
 
-    def fit(self, X, y,
-            metric=None,
-            loss=None,
-            feat_type=None,
-            dataset_name=None):
+    def fit(
+        self, X, y,
+        X_test=None,
+        y_test=None,
+        metric=None,
+        feat_type=None,
+        dataset_name=None,
+        only_return_configuration_space=False,
+    ):
         X, y = self._perform_input_checks(X, y)
+        if X_test is not None:
+            X_test, y_test = self._perform_input_checks(X_test, y_test)
+            if len(y.shape) != len(y_test.shape):
+                raise ValueError('Target value shapes do not match: %s vs %s'
+                                 % (y.shape, y_test.shape))
 
         y_task = type_of_target(y)
         task = self._task_mapping.get(y_task)
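The classifier-level `fit` now also accepts optional `X_test`/`y_test`, which run through the same input checks and are shape-checked against `y`; further down in this hunk the test targets are re-encoded with the training class mapping before everything is passed to the base class. A brief call sketch, assuming the public estimator forwards these keyword arguments (not shown in this diff):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from autosklearn.classification import AutoSklearnClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# The held-out split travels with the training data via the XYDataManager.
automl = AutoSklearnClassifier(time_left_for_this_task=120)
automl.fit(X_train, y_train, X_test=X_test, y_test=y_test)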
@@ -883,8 +928,31 @@ def fit(self, X, y,
             metric = accuracy
 
         y, self._classes, self._n_classes = self._process_target_classes(y)
-
-        return super().fit(X, y, task, metric, feat_type, dataset_name)
+        if y_test is not None:
+            # Map test values to actual values - TODO: copy to all kinds of
+            # other parts in this code and test it!!!
+            y_test_new = []
+            for output_idx in range(len(self._classes)):
+                mapping = {self._classes[output_idx][idx]: idx
+                           for idx in range(len(self._classes[output_idx]))}
+                enumeration = y_test if len(self._classes) == 1 else y_test[output_idx]
+                y_test_new.append(
+                    np.array([mapping[value] for value in enumeration])
+                )
+            y_test = np.array(y_test_new)
+            if self._n_outputs == 1:
+                y_test = y_test.flatten()
+
+        return super().fit(
+            X, y,
+            X_test=X_test,
+            y_test=y_test,
+            task=task,
+            metric=metric,
+            feat_type=feat_type,
+            dataset_name=dataset_name,
+            only_return_configuration_space=only_return_configuration_space,
+        )
 
     def fit_ensemble(self, y, task=None, metric=None, precision='32',
                      dataset_name=None, ensemble_nbest=None,
@@ -917,7 +985,7 @@ def _process_target_classes(self, y):
             _classes.append(classes_k)
             _n_classes.append(classes_k.shape[0])
 
-        self._n_classes = np.array(_n_classes, dtype=np.int)
+        _n_classes = np.array(_n_classes, dtype=np.int)
 
         return y, _classes, _n_classes
 
@@ -947,16 +1015,32 @@ def predict_proba(self, X, batch_size=None, n_jobs=1):
 
 
 class AutoMLRegressor(BaseAutoML):
-    def fit(self, X, y, metric=None, feat_type=None, dataset_name=None):
+    def fit(
+        self, X, y,
+        X_test=None,
+        y_test=None,
+        metric=None,
+        feat_type=None,
+        dataset_name=None,
+        only_return_configuration_space=False,
+    ):
         X, y = super()._perform_input_checks(X, y)
         _n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
         if _n_outputs > 1:
             raise NotImplementedError(
                 'Multi-output regression is not implemented.')
         if metric is None:
             metric = r2
-        return super().fit(X, y, task=REGRESSION, metric=metric,
-                           feat_type=feat_type, dataset_name=dataset_name)
+        return super().fit(
+            X, y,
+            X_test=X_test,
+            y_test=y_test,
+            task=REGRESSION,
+            metric=metric,
+            feat_type=feat_type,
+            dataset_name=dataset_name,
+            only_return_configuration_space=only_return_configuration_space,
+        )
 
     def fit_ensemble(self, y, task=None, metric=None, precision='32',
                      dataset_name=None, ensemble_nbest=None,