Commit e90546b

transfer sweep improvements from g2048
1 parent a9c02db

4 files changed: +94, -39 lines

pufferlib/config/default.ini
Lines changed: 7 additions & 6 deletions

@@ -3,7 +3,6 @@ package = None
 env_name = None
 policy_name = Policy
 rnn_name = None
-max_suggestion_cost = 3600

 [vec]
 backend = Multiprocessing
@@ -26,10 +25,11 @@ torch_deterministic = True
 cpu_offload = False
 device = cuda
 optimizer = muon
-anneal_lr = True
 precision = float32
 total_timesteps = 10_000_000
 learning_rate = 0.015
+anneal_lr = True
+min_learning_rate = 0.0
 gamma = 0.995
 gae_lambda = 0.90
 update_epochs = 1
@@ -64,6 +64,7 @@ prio_beta0 = 0.2
 method = Protein
 metric = score
 goal = maximize
+max_suggestion_cost = 3600
 downsample = 5
 use_gpu = True
 prune_pareto = True
@@ -127,14 +128,14 @@ scale = auto

 [sweep.train.vtrace_rho_clip]
 distribution = uniform
-min = 0.0
+min = 0.1
 max = 5.0
 mean = 1.0
 scale = auto

 [sweep.train.vtrace_c_clip]
 distribution = uniform
-min = 0.0
+min = 0.1
 max = 5.0
 mean = 1.0
 scale = auto
@@ -164,14 +165,14 @@ scale = auto

 [sweep.train.vf_coef]
 distribution = uniform
-min = 0.0
+min = 0.1
 max = 5.0
 mean = 2.0
 scale = auto

 [sweep.train.max_grad_norm]
 distribution = uniform
-min = 0.0
+min = 0.1
 mean = 1.0
 max = 5.0
 scale = auto
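For context, each [sweep.*] block is a plain INI section describing one hyperparameter's distribution. A minimal, self-contained sketch of how the tightened bound could be consumed; this is illustrative only and not pufferlib's actual parsing code:

import configparser

import numpy as np

# Excerpt mirroring the updated [sweep.train.vf_coef] block above.
INI_EXCERPT = """
[sweep.train.vf_coef]
distribution = uniform
min = 0.1
max = 5.0
mean = 2.0
scale = auto
"""

config = configparser.ConfigParser(interpolation=None)
config.read_string(INI_EXCERPT)
section = config["sweep.train.vf_coef"]

lo, hi = float(section["min"]), float(section["max"])

# With min raised from 0.0 to 0.1, a sampled coefficient can no longer
# collapse to zero and silently disable the corresponding loss term.
vf_coef = float(np.random.uniform(lo, hi))
print(f"sampled vf_coef: {vf_coef:.3f}")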

pufferlib/pufferl.py
Lines changed: 6 additions & 2 deletions

@@ -184,7 +184,8 @@ def __init__(self, config, vecenv, policy, logger=None):

         # Learning rate scheduler
         epochs = config['total_timesteps'] // config['batch_size']
-        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
+        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+            optimizer, T_max=epochs, eta_min=config['min_learning_rate'])
         self.total_epochs = epochs

         # Automatic mixed precision
@@ -1055,7 +1056,10 @@ def sweep(args=None, env_name=None):
         np.random.seed(seed)
         torch.manual_seed(seed)

-        sweep.suggest(args)
+        # In the first run, skip sweep and use the train args specified in the config
+        if i > 0:
+            sweep.suggest(args)
+
         all_logs = train(env_name, args=args, should_stop_early=stop_if_loss_nan)
         all_logs = [e for e in all_logs if target_key in e]

pufferlib/sweep.py
Lines changed: 80 additions & 31 deletions

@@ -17,6 +17,7 @@
 from gpytorch.priors import LogNormalPrior
 from scipy.stats.qmc import Sobol
 from scipy.spatial import KDTree
+from sklearn.linear_model import LogisticRegression

 EPSILON = 1e-6

@@ -126,17 +127,24 @@ def unnormalize(self, value):
         log_spaced = zero_one*(math.log(1-self.max, self.base) - math.log(1-self.min, self.base)) + math.log(1-self.min, self.base)
         return 1 - self.base**log_spaced

-def _params_from_puffer_sweep(sweep_config):
+def _params_from_puffer_sweep(sweep_config, only_include=None):
     param_spaces = {}
+
+    if 'sweep_only' in sweep_config:
+        only_include = [p.strip() for p in sweep_config['sweep_only'].split(',')]
+
     for name, param in sweep_config.items():
-        if name in ('method', 'metric', 'goal', 'downsample', 'use_gpu', 'prune_pareto'):
+        if name in ('method', 'metric', 'goal', 'downsample', 'use_gpu', 'prune_pareto', 'sweep_only', 'max_suggestion_cost'):
             continue

         assert isinstance(param, dict)
         if any(isinstance(param[k], dict) for k in param):
-            param_spaces[name] = _params_from_puffer_sweep(param)
+            param_spaces[name] = _params_from_puffer_sweep(param, only_include)
             continue

+        if only_include and not any(k in name for k in only_include):
+            continue
+
         assert 'distribution' in param
         distribution = param['distribution']
         search_center = param['mean']
@@ -232,8 +240,8 @@ def _fill(self, params, spaces, flat_sample, idx=0):
         return idx

     def get_flat_idx(self, flat_key):
-        return list(self.flat_spaces.keys()).index(flat_key)
-
+        keys = list(self.flat_spaces.keys())
+        return keys.index(flat_key) if flat_key in keys else None

 def pareto_points(observations):
     if not observations:
@@ -421,7 +429,7 @@ def __init__(self,
         sweep_config,
         max_suggestion_cost = 3600,
         resample_frequency = 0,
-        num_random_samples = 30,
+        num_random_samples = 10,
         global_search_scale = 1,
         suggestions_per_pareto = 256,
         seed_with_search_center = True,
@@ -435,21 +443,27 @@ def __init__(self,
         cost_param = "train/total_timesteps",
         prune_pareto = True,
     ):
-        self.device = torch.device("cuda:0" if use_gpu and torch.cuda.is_available() else "cpu")
+        # Process sweep config. NOTE: sweep_config takes precedence. It's not good.
+        _use_gpu = sweep_config['use_gpu'] if 'use_gpu' in sweep_config else use_gpu
+        _prune_pareto = sweep_config['prune_pareto'] if 'prune_pareto' in sweep_config else prune_pareto
+        _max_suggestion_cost = sweep_config['max_suggestion_cost'] if 'max_suggestion_cost' in sweep_config else max_suggestion_cost
+
+        self.device = torch.device("cuda:0" if _use_gpu and torch.cuda.is_available() else "cpu")
         self.hyperparameters = Hyperparameters(sweep_config)
         self.global_search_scale = global_search_scale
         self.suggestions_per_pareto = suggestions_per_pareto
         self.seed_with_search_center = seed_with_search_center
         self.resample_frequency = resample_frequency
-        self.max_suggestion_cost = max_suggestion_cost
+        self.max_suggestion_cost = _max_suggestion_cost
         self.expansion_rate = expansion_rate
         self.gp_training_iter = gp_training_iter
         self.gp_learning_rate = gp_learning_rate
         self.optimizer_reset_frequency = optimizer_reset_frequency
-        self.prune_pareto = prune_pareto
+        self.prune_pareto = _prune_pareto

         self.success_observations = []
         self.failure_observations = []
+
         self.suggestion_idx = 0
         self.min_score, self.max_score = math.inf, -math.inf
         self.log_c_min, self.log_c_max = math.inf, -math.inf
@@ -462,11 +476,17 @@ def __init__(self,
         # self.num_random_samples = 3 * points_per_run * self.hyperparameters.num

         self.cost_param_idx = self.hyperparameters.get_flat_idx(cost_param)
-        self.cost_random_suggestion = self.hyperparameters.search_centers[self.cost_param_idx]
+        self.cost_random_suggestion = None
+        if self.cost_param_idx is not None:
+            self.cost_random_suggestion = self.hyperparameters.search_centers[self.cost_param_idx]

         self.gp_max_obs = gp_max_obs # train time bumps after 800?
         self.infer_batch_size = infer_batch_size

+        # Probably useful only when downsample=1 and each run is expensive.
+        self.use_success_prob = sweep_config['downsample'] == 1
+        self.success_classifier = LogisticRegression(class_weight='balanced')
+
         # Use 64 bit for GP regression
         with default_tensor_dtype(torch.float64):
             # Params taken from HEBO: https://arxiv.org/abs/2012.03826
@@ -514,17 +534,29 @@ def _sample_observations(self, max_size=None, recent_ratio=0.5):
         if not self.success_observations:
             return []

+        observations = self.success_observations.copy()
+
         # Update the stats using the full data
-        y = np.array([e['output'] for e in self.success_observations])
+        y = np.array([e['output'] for e in observations])
         self.min_score, self.max_score = y.min(), y.max()

-        c = np.array([e['cost'] for e in self.success_observations])
+        c = np.array([e['cost'] for e in observations])
         log_c = np.log(np.maximum(c, EPSILON))
         self.log_c_min, self.log_c_max = log_c.min(), log_c.max()

-        params = np.array([e['input'] for e in self.success_observations])
+        # When the data is scarce, also use failed observations
+        if len(observations) < 100 and self.failure_observations:
+            # Give the min score for the failed obs, so this value will keep changing.
+            for e in self.failure_observations:
+                e['output'] = self.min_score
+
+            # NOTE: the order of obs matters since recent obs are always fed into gp training
+            # So, putting the failure obs first.
+            observations = self.failure_observations + observations
+
+        params = np.array([np.append(e['input'], [e['output'], e['cost']]) for e in observations])
         dedup_indices = self._filter_near_duplicates(params)
-        observations = [self.success_observations[i] for i in dedup_indices]
+        observations = [observations[i] for i in dedup_indices]

         if max_size is None:
             max_size = self.gp_max_obs
@@ -574,16 +606,19 @@ def _train_gp_models(self):
     def suggest(self, fill):
         info = {}
         self.suggestion_idx += 1
-        if len(self.success_observations) == 0 and self.seed_with_search_center:
-            suggestion = self.hyperparameters.search_centers
-            return self.hyperparameters.to_dict(suggestion, fill), info
+
+        # NOTE: Changed pufferl to use the train args, NOT the sweep hyperparam search center
+        # if len(self.success_observations) == 0 and self.seed_with_search_center:
+        #     suggestion = self.hyperparameters.search_centers
+        #     return self.hyperparameters.to_dict(suggestion, fill), info

-        elif len(self.success_observations) < self.num_random_samples:
+        if self.suggestion_idx <= self.num_random_samples:
             # Suggest the next point in the Sobol sequence
             zero_one = self.sobol.random(1)[0]
             suggestion = 2*zero_one - 1 # Scale from [0, 1) to [-1, 1)
-            cost_suggestion = self.cost_random_suggestion + 0.1 * np.random.randn()
-            suggestion[self.cost_param_idx] = np.clip(cost_suggestion, -1, 1) # limit the cost
+            if self.cost_param_idx is not None:
+                cost_suggestion = self.cost_random_suggestion + 0.1 * np.random.randn()
+                suggestion[self.cost_param_idx] = np.clip(cost_suggestion, -1, 1) # limit the cost
             return self.hyperparameters.to_dict(suggestion, fill), info

         elif self.resample_frequency and self.suggestion_idx % self.resample_frequency == 0:
@@ -601,14 +636,13 @@ def suggest(self, fill):
             self.cost_opt = torch.optim.Adam(self.gp_cost.parameters(), lr=self.gp_learning_rate, amsgrad=True)

         candidates, pareto_idxs = pareto_points(self.success_observations)
-
         if self.prune_pareto:
             candidates = prune_pareto_front(candidates)

         ### Sample suggestions
         search_centers = np.stack([e['input'] for e in candidates])
-        suggestions = self.hyperparameters.sample(
-            len(candidates)*self.suggestions_per_pareto, mu=search_centers)
+        num_sample = len(candidates) * self.suggestions_per_pareto
+        suggestions = self.hyperparameters.sample(num_sample, mu=search_centers)

         dedup_indices = self._filter_near_duplicates(suggestions)
         suggestions = suggestions[dedup_indices]
@@ -655,16 +689,31 @@ def suggest(self, fill):
         gp_log_c = gp_log_c_norm*(self.log_c_max - self.log_c_min) + self.log_c_min
         gp_c = np.exp(gp_log_c)

-        max_c_mask = gp_c < self.max_suggestion_cost
+        # Maximize score. (Tried upper confidence bounds, but it did more harm because gp was noisy)
+        suggestion_scores = self.hyperparameters.optimize_direction * gp_y_norm

+        # Then, decide the budget for this session and favor closer suggestions
+        max_c_mask = gp_c < self.max_suggestion_cost
         target = (1 + self.expansion_rate)*np.random.rand()
         weight = 1 - abs(target - gp_log_c_norm)
-
-        # NOTE: Tried upper confidence bounds, but it did more harm because gp was noisy
-        score = gp_y_norm
-
-        suggestion_scores = self.hyperparameters.optimize_direction * max_c_mask * (
-            score * weight)
+        suggestion_scores *= max_c_mask * weight
+
+        # Then, consider the prob of training success, only when downsample = 1
+        # NOTE: Useful only in limited scenarios, where each data point is expensive. So turn it off by default.
+        if self.use_success_prob and len(self.success_observations) > 9 and len(self.failure_observations) > 9:
+            success_params = np.array([e['input'] for e in self.success_observations])
+            failure_params = np.array([e['input'] for e in self.failure_observations])
+            X_train = np.vstack([success_params, failure_params])
+            y_train = np.concatenate([
+                np.ones(len(success_params)),
+                np.zeros(len(failure_params))
+            ])
+            if len(np.unique(y_train)) > 1:
+                self.success_classifier.fit(X_train, y_train)
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore", UserWarning)
+                    p_success = self.success_classifier.predict_proba(suggestions)[:, 1]
+                suggestion_scores *= p_success

         best_idx = np.argmax(suggestion_scores)
         info = dict(
@@ -708,7 +757,7 @@ def observe(self, hypers, score, cost, is_failure=False):
             return

         # Ignore obs that are below the minimum cost
-        if params[self.cost_param_idx] <= -1:
+        if self.cost_param_idx is not None and params[self.cost_param_idx] <= -1:
            return

         self.success_observations.append(new_observation)
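The largest addition is the success-probability reweighting in suggest(): when downsample = 1 and there are at least ten successful and ten failed runs, a balanced logistic regression is fit on the observed hyperparameter vectors, and its predicted success probability scales each candidate's score. A self-contained sketch of that idea, with synthetic data in place of the sweep's real observations (the array shapes and values below are invented for illustration):

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)

# Synthetic stand-ins for observed hyperparameter vectors in [-1, 1]^d.
success_params = rng.uniform(-1, 1, size=(30, 4))
failure_params = rng.uniform(-1, 1, size=(15, 4))

X_train = np.vstack([success_params, failure_params])
y_train = np.concatenate([np.ones(len(success_params)), np.zeros(len(failure_params))])

# class_weight='balanced' keeps the minority class (usually failures) from being drowned out.
classifier = LogisticRegression(class_weight="balanced")
classifier.fit(X_train, y_train)

# Candidate suggestions and their cost-weighted GP scores, also synthetic here.
suggestions = rng.uniform(-1, 1, size=(256, 4))
suggestion_scores = rng.uniform(0, 1, size=256)

# Downweight candidates that look likely to fail, as in the new suggest() logic.
p_success = classifier.predict_proba(suggestions)[:, 1]
suggestion_scores = suggestion_scores * p_success
best_idx = int(np.argmax(suggestion_scores))
print(best_idx, float(suggestion_scores[best_idx]))

The balanced class weighting matters because failed runs are typically the minority class; without it the classifier would tend to predict success everywhere and the reweighting would do little.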

setup.py
Lines changed: 1 addition & 0 deletions

@@ -288,6 +288,7 @@ def run(self):
         'rich_argparse',
         'imageio',
         'gpytorch',
+        'scikit-learn',
         'heavyball>=2.2.0', # contains relevant fixes compared to 1.7.2 and 2.1.1
         'neptune',
         'wandb',
