 from gpytorch.priors import LogNormalPrior
 from scipy.stats.qmc import Sobol
 from scipy.spatial import KDTree
+from sklearn.linear_model import LogisticRegression

 EPSILON = 1e-6

@@ -126,17 +127,24 @@ def unnormalize(self, value):
         log_spaced = zero_one * (math.log(1 - self.max, self.base) - math.log(1 - self.min, self.base)) + math.log(1 - self.min, self.base)
         return 1 - self.base ** log_spaced

-def _params_from_puffer_sweep(sweep_config):
+def _params_from_puffer_sweep(sweep_config, only_include=None):
     param_spaces = {}
+
+    if 'sweep_only' in sweep_config:
+        only_include = [p.strip() for p in sweep_config['sweep_only'].split(',')]
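+    # Only params whose name contains one of these substrings are swept; everything else keeps its configured value.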
+
     for name, param in sweep_config.items():
-        if name in ('method', 'metric', 'goal', 'downsample', 'use_gpu', 'prune_pareto'):
+        if name in ('method', 'metric', 'goal', 'downsample', 'use_gpu', 'prune_pareto', 'sweep_only', 'max_suggestion_cost'):
             continue

         assert isinstance(param, dict)
         if any(isinstance(param[k], dict) for k in param):
-            param_spaces[name] = _params_from_puffer_sweep(param)
+            param_spaces[name] = _params_from_puffer_sweep(param, only_include)
             continue

+        if only_include and not any(k in name for k in only_include):
+            continue
+
         assert 'distribution' in param
         distribution = param['distribution']
         search_center = param['mean']
@@ -232,8 +240,8 @@ def _fill(self, params, spaces, flat_sample, idx=0):
         return idx

     def get_flat_idx(self, flat_key):
-        return list(self.flat_spaces.keys()).index(flat_key)
-
+        keys = list(self.flat_spaces.keys())
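+        # Return None when flat_key is not a swept hyperparameter (e.g., excluded via sweep_only).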
+        return keys.index(flat_key) if flat_key in keys else None

def pareto_points(observations):
     if not observations:
@@ -421,7 +429,7 @@ def __init__(self,
         sweep_config,
         max_suggestion_cost=3600,
         resample_frequency=0,
-        num_random_samples=30,
+        num_random_samples=10,
         global_search_scale=1,
         suggestions_per_pareto=256,
         seed_with_search_center=True,
@@ -435,21 +443,27 @@ def __init__(self,
         cost_param="train/total_timesteps",
         prune_pareto=True,
     ):
-        self.device = torch.device("cuda:0" if use_gpu and torch.cuda.is_available() else "cpu")
+        # Process sweep config. NOTE: sweep_config takes precedence over the constructor args, which is not ideal.
+        _use_gpu = sweep_config['use_gpu'] if 'use_gpu' in sweep_config else use_gpu
+        _prune_pareto = sweep_config['prune_pareto'] if 'prune_pareto' in sweep_config else prune_pareto
+        _max_suggestion_cost = sweep_config['max_suggestion_cost'] if 'max_suggestion_cost' in sweep_config else max_suggestion_cost
+
+        self.device = torch.device("cuda:0" if _use_gpu and torch.cuda.is_available() else "cpu")
         self.hyperparameters = Hyperparameters(sweep_config)
         self.global_search_scale = global_search_scale
         self.suggestions_per_pareto = suggestions_per_pareto
         self.seed_with_search_center = seed_with_search_center
         self.resample_frequency = resample_frequency
-        self.max_suggestion_cost = max_suggestion_cost
+        self.max_suggestion_cost = _max_suggestion_cost
         self.expansion_rate = expansion_rate
         self.gp_training_iter = gp_training_iter
         self.gp_learning_rate = gp_learning_rate
         self.optimizer_reset_frequency = optimizer_reset_frequency
-        self.prune_pareto = prune_pareto
+        self.prune_pareto = _prune_pareto

         self.success_observations = []
         self.failure_observations = []
+
         self.suggestion_idx = 0
         self.min_score, self.max_score = math.inf, -math.inf
         self.log_c_min, self.log_c_max = math.inf, -math.inf
@@ -462,11 +476,17 @@ def __init__(self,
         # self.num_random_samples = 3 * points_per_run * self.hyperparameters.num

         self.cost_param_idx = self.hyperparameters.get_flat_idx(cost_param)
-        self.cost_random_suggestion = self.hyperparameters.search_centers[self.cost_param_idx]
+        self.cost_random_suggestion = None
+        if self.cost_param_idx is not None:
+            self.cost_random_suggestion = self.hyperparameters.search_centers[self.cost_param_idx]

         self.gp_max_obs = gp_max_obs  # train time bumps after 800?
         self.infer_batch_size = infer_batch_size

+        # Probably useful only when downsample=1 and each run is expensive.
+        self.use_success_prob = sweep_config['downsample'] == 1
+        self.success_classifier = LogisticRegression(class_weight='balanced')
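+        # class_weight='balanced' reweights classes inversely to their frequency, so a handful of failures still carries signal.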
+
         # Use 64 bit for GP regression
         with default_tensor_dtype(torch.float64):
             # Params taken from HEBO: https://arxiv.org/abs/2012.03826
@@ -514,17 +534,29 @@ def _sample_observations(self, max_size=None, recent_ratio=0.5):
         if not self.success_observations:
             return []

+        observations = self.success_observations.copy()
+
         # Update the stats using the full data
-        y = np.array([e['output'] for e in self.success_observations])
+        y = np.array([e['output'] for e in observations])
         self.min_score, self.max_score = y.min(), y.max()

-        c = np.array([e['cost'] for e in self.success_observations])
+        c = np.array([e['cost'] for e in observations])
         log_c = np.log(np.maximum(c, EPSILON))
         self.log_c_min, self.log_c_max = log_c.min(), log_c.max()

-        params = np.array([e['input'] for e in self.success_observations])
+        # When the data is scarce, also use failed observations
+        if len(observations) < 100 and self.failure_observations:
+            # Give the failed obs the current min score, so this value will keep changing.
+            for e in self.failure_observations:
+                e['output'] = self.min_score
+
+            # NOTE: the order of obs matters since recent obs are always fed into gp training,
+            # so put the failure obs first.
+            observations = self.failure_observations + observations
+
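+        # Appending output and cost means near-duplicate filtering also keys on outcome, not just on inputs.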
+        params = np.array([np.append(e['input'], [e['output'], e['cost']]) for e in observations])
         dedup_indices = self._filter_near_duplicates(params)
-        observations = [self.success_observations[i] for i in dedup_indices]
+        observations = [observations[i] for i in dedup_indices]

         if max_size is None:
             max_size = self.gp_max_obs
@@ -574,16 +606,19 @@ def _train_gp_models(self):
     def suggest(self, fill):
         info = {}
         self.suggestion_idx += 1
-        if len(self.success_observations) == 0 and self.seed_with_search_center:
-            suggestion = self.hyperparameters.search_centers
-            return self.hyperparameters.to_dict(suggestion, fill), info
+
+        # NOTE: Changed pufferl to use the train args, NOT the sweep hyperparam search center
+        # if len(self.success_observations) == 0 and self.seed_with_search_center:
+        #     suggestion = self.hyperparameters.search_centers
+        #     return self.hyperparameters.to_dict(suggestion, fill), info

-        elif len(self.success_observations) < self.num_random_samples:
+        if self.suggestion_idx <= self.num_random_samples:
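+            # The random phase is keyed on suggestion_idx, so failed runs still consume the random budget.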
             # Suggest the next point in the Sobol sequence
             zero_one = self.sobol.random(1)[0]
             suggestion = 2 * zero_one - 1  # Scale from [0, 1) to [-1, 1)
-            cost_suggestion = self.cost_random_suggestion + 0.1 * np.random.randn()
-            suggestion[self.cost_param_idx] = np.clip(cost_suggestion, -1, 1)  # limit the cost
+            if self.cost_param_idx is not None:
+                cost_suggestion = self.cost_random_suggestion + 0.1 * np.random.randn()
+                suggestion[self.cost_param_idx] = np.clip(cost_suggestion, -1, 1)  # limit the cost
             return self.hyperparameters.to_dict(suggestion, fill), info

         elif self.resample_frequency and self.suggestion_idx % self.resample_frequency == 0:
@@ -601,14 +636,13 @@ def suggest(self, fill):
             self.cost_opt = torch.optim.Adam(self.gp_cost.parameters(), lr=self.gp_learning_rate, amsgrad=True)

         candidates, pareto_idxs = pareto_points(self.success_observations)
-
         if self.prune_pareto:
             candidates = prune_pareto_front(candidates)

         ### Sample suggestions
         search_centers = np.stack([e['input'] for e in candidates])
-        suggestions = self.hyperparameters.sample(
-            len(candidates) * self.suggestions_per_pareto, mu=search_centers)
+        num_sample = len(candidates) * self.suggestions_per_pareto
+        suggestions = self.hyperparameters.sample(num_sample, mu=search_centers)

         dedup_indices = self._filter_near_duplicates(suggestions)
         suggestions = suggestions[dedup_indices]
@@ -655,16 +689,31 @@ def suggest(self, fill):
         gp_log_c = gp_log_c_norm * (self.log_c_max - self.log_c_min) + self.log_c_min
         gp_c = np.exp(gp_log_c)

-        max_c_mask = gp_c < self.max_suggestion_cost
+        # Maximize score. (Tried upper confidence bounds, but it did more harm because the GP was noisy.)
+        suggestion_scores = self.hyperparameters.optimize_direction * gp_y_norm

+        # Then, draw a cost budget for this suggestion and favor candidates whose predicted cost is close to it
+        max_c_mask = gp_c < self.max_suggestion_cost
         target = (1 + self.expansion_rate) * np.random.rand()
         weight = 1 - abs(target - gp_log_c_norm)
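+        # 'weight' peaks where the normalized predicted log-cost matches the random target; expansion_rate lets the target exceed the observed cost range.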
-
-        # NOTE: Tried upper confidence bounds, but it did more harm because gp was noisy
-        score = gp_y_norm
-
-        suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * (
-            score * weight)
+        suggestion_scores *= max_c_mask * weight
+
+        # Then, consider the probability of training success, only when downsample == 1
+        # NOTE: Useful only in limited scenarios where each data point is expensive, so it stays off otherwise.
+        if self.use_success_prob and len(self.success_observations) > 9 and len(self.failure_observations) > 9:
+            success_params = np.array([e['input'] for e in self.success_observations])
+            failure_params = np.array([e['input'] for e in self.failure_observations])
+            X_train = np.vstack([success_params, failure_params])
+            y_train = np.concatenate([
+                np.ones(len(success_params)),
+                np.zeros(len(failure_params))
+            ])
+            if len(np.unique(y_train)) > 1:
+                self.success_classifier.fit(X_train, y_train)
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore", UserWarning)
+                    p_success = self.success_classifier.predict_proba(suggestions)[:, 1]
+                suggestion_scores *= p_success

         best_idx = np.argmax(suggestion_scores)
         info = dict(
@@ -708,7 +757,7 @@ def observe(self, hypers, score, cost, is_failure=False):
             return

         # Ignore obs that are below the minimum cost
-        if params[self.cost_param_idx] <= -1:
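+        # cost_param_idx may be None when the cost param is excluded from the sweep (e.g., via sweep_only).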
+        if self.cost_param_idx is not None and params[self.cost_param_idx] <= -1:
             return

         self.success_observations.append(new_observation)