Skip to content

Commit 712b319

Browse files
committed
feat(sweep): auto hyperparameter discovery
1 parent 44094b6 commit 712b319

File tree

3 files changed

+133
-47
lines changed

3 files changed

+133
-47
lines changed

exp/exp_main.py

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
from pathlib import Path
22
import datetime
33
import warnings
4-
import yaml
54
import json
65
from collections import OrderedDict
76
from typing import Generator
8-
from dataclasses import asdict
97
import importlib
108

119
import numpy as np
@@ -195,37 +193,9 @@ def vali(
195193

196194
def train(self) -> None:
197195
logger.info('>>>>>>> training start <<<<<<<')
198-
# save training config file for reference
199196
path = Path(self.configs.checkpoints) / self.configs.dataset_name / self.configs.model_name / self.configs.model_id / f"{self.configs.seq_len}_{self.configs.pred_len}" / self.configs.subfolder_train / f"iter{self.configs.itr_i}"
200-
path.mkdir(parents=True, exist_ok=True)
201-
logger.info(f"Training iter{self.configs.itr_i} save to: {path}")
202-
with open(path / "configs.yaml", 'w', encoding='utf-8') as f:
203-
yaml.dump(asdict(self.configs), f, default_flow_style=False)
204-
205-
accelerator.project_configuration.set_directories(project_dir=path)
206-
207-
# init exp tracker
208197
if (self.configs.wandb and accelerator.is_main_process) or self.configs.sweep:
209198
import wandb
210-
run = wandb.init(
211-
# Set the project where this run will be logged
212-
project="YOUR_PROJECT_NAME",
213-
# Track hyperparameters and run metadata
214-
config={
215-
"model_name": self.configs.model_name,
216-
"model_id": self.configs.model_id,
217-
"dataset_name": self.configs.dataset_name,
218-
"seq_len": self.configs.seq_len,
219-
"pred_len": self.configs.pred_len,
220-
"learning_rate": self.configs.learning_rate,
221-
"batch_size": self.configs.batch_size
222-
},
223-
dir=path
224-
)
225-
if self.configs.sweep:
226-
# overwrite default configs by wandb.config when sweeping
227-
self.configs.learning_rate = wandb.config.learning_rate
228-
self.configs.batch_size = wandb.config.batch_size
229199

230200
train_data, train_loader = self._get_data(flag='train')
231201
vali_data, vali_loader = self._get_data(flag='val')
@@ -278,7 +248,7 @@ def train(self) -> None:
278248
outputs: dict[str, Tensor] = model_train(
279249
exp_stage="train",
280250
train_stage=train_stage,
281-
current_epoch=epoch
251+
current_epoch=epoch,
282252
**batch
283253
)
284254

main.py

Lines changed: 82 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
import random
2+
from pathlib import Path
23
import datetime
4+
import importlib
5+
import yaml
6+
from dataclasses import asdict
7+
import pprint
38

49
import torch
510
import numpy as np
@@ -8,6 +13,8 @@
813
from utils.globals import logger, accelerator
914
from utils.configs import configs
1015

16+
hyperparameters_sweep: dict[str, dict[str, list]] = {}
17+
1118
def main():
1219
# random seed
1320
fix_seed_list = range(2024, 2024 + configs.itr)
@@ -16,6 +23,41 @@ def main():
1623

1724
Exp = Exp_Main
1825

26+
def start_exp_train() -> Exp_Main:
27+
# save training config file for reference
28+
path = Path(configs.checkpoints) / configs.dataset_name / configs.model_name / configs.model_id / f"{configs.seq_len}_{configs.pred_len}" / configs.subfolder_train / f"iter{configs.itr_i}" # same as the one in Exp_Main.train()
29+
path.mkdir(parents=True, exist_ok=True)
30+
logger.info(f"Training iter{configs.itr_i} save to: {path}")
31+
with open(path / "configs.yaml", 'w', encoding='utf-8') as f:
32+
yaml.dump(asdict(configs), f, default_flow_style=False)
33+
# init exp tracker
34+
if (configs.wandb and accelerator.is_main_process) or configs.sweep:
35+
import wandb
36+
run = wandb.init(
37+
# Set the project where this run will be logged
38+
project="YOUR_PROJECT_NAME",
39+
# Track hyperparameters and run metadata
40+
config={
41+
"model_name": configs.model_name,
42+
"model_id": configs.model_id,
43+
"dataset_name": configs.dataset_name,
44+
"seq_len": configs.seq_len,
45+
"pred_len": configs.pred_len,
46+
"learning_rate": configs.learning_rate,
47+
"batch_size": configs.batch_size
48+
},
49+
dir=path
50+
)
51+
# overwrite model hyperparameters when sweeping
52+
for attribute_name in hyperparameters_sweep.keys():
53+
setattr(configs, attribute_name, getattr(wandb.config, attribute_name))
54+
55+
accelerator.project_configuration.set_directories(project_dir=path)
56+
57+
exp = Exp(configs)
58+
exp.train()
59+
return exp
60+
1961
if configs.sweep:
2062
'''
2163
Currently, wandb sweep with huggingface accelerate multi-GPU is tricky; use at your own risk.
@@ -42,12 +84,12 @@ def main():
4284
torch.manual_seed(fix_seed_list[configs.itr_i])
4385
np.random.seed(fix_seed_list[configs.itr_i])
4486

45-
exp = Exp(configs)
46-
47-
exp.train()
87+
exp = start_exp_train()
4888
exp.test()
49-
5089
elif configs.is_training:
90+
'''
91+
Normal train&test
92+
'''
5193
subfolder = datetime.datetime.now().strftime("%Y_%m%d_%H%M")
5294
configs.subfolder_train = subfolder
5395
for i in range(configs.itr):
@@ -57,12 +99,13 @@ def main():
5799
torch.manual_seed(fix_seed_list[i])
58100
np.random.seed(fix_seed_list[i])
59101

60-
exp = Exp(configs)
61-
exp.train()
62-
102+
exp = start_exp_train()
63103
torch.cuda.empty_cache()
64104
exp.test()
65105
else:
106+
'''
107+
test only
108+
'''
66109
exp = Exp(configs)
67110
exp.test()
68111
torch.cuda.empty_cache()
@@ -73,14 +116,40 @@ def main():
73116
if not configs.sweep:
74117
main()
75118
else:
119+
# first determine the hyperparameters actually accessed by model
120+
from utils.ExpConfigs import ExpConfigsTracker
121+
configs_tracker = ExpConfigsTracker(configs)
122+
model_module = importlib.import_module("models." + configs.model_name)
123+
model = model_module.Model(configs_tracker)
124+
del model
125+
accessed_configs: set[str] = configs_tracker.get_accessed_attributes()
126+
max_count = 1
127+
for accessed_config in accessed_configs:
128+
try:
129+
ref_values = configs.get_sweep_values(accessed_config)
130+
if ref_values and (type(ref_values) is list):
131+
hyperparameters_sweep[accessed_config] = {
132+
"values": ref_values
133+
}
134+
max_count *= len(ref_values)
135+
except Exception as e:
136+
continue
137+
138+
if hyperparameters_sweep == {}:
139+
logger.error(f"No hyperparameters to search, stopping now...")
140+
logger.debug(f"{configs.model_name} access these attributes in ExpConfigs:")
141+
configs_tracker.print_access_report()
142+
logger.debug("""Possible reasons: (1) The model does not access any hyperparameters in ExpConfigs; (2) The accessed hyperparameters have not set their metadata properly. Check the ExpConfigs class in utils/ExpConfigs.py. Example metadata setting:
143+
d_model: int = field(metadata={"sweep": [32, 64, 128, 256]})""")
144+
exit(0)
145+
else:
146+
logger.info(f"""{len(hyperparameters_sweep)} hyperparameters and {max_count} runs: \n{pprint.pformat(hyperparameters_sweep)}""")
147+
76148
import wandb
77149
sweep_configuration = {
78150
"method": "grid",
79-
"metric": {"goal": "minimize", "name": "loss_val"},
80-
"parameters": {
81-
"learning_rate": {"values": [0.01, 0.001, 0.0001, 0.00001]},
82-
"batch_size": {"values": [16, 32, 64, 128]},
83-
},
151+
"metric": {"goal": "minimize", "name": "loss_val_best"},
152+
"parameters": hyperparameters_sweep
84153
}
85154
temp_file_path = "storage/tmp.txt"
86155
if accelerator.is_main_process:
@@ -95,7 +164,7 @@ def main():
95164
sweep_id,
96165
function=main,
97166
project="YOUR_PROJECT_NAME",
98-
count=16
167+
count=max_count
99168
)
100169
except KeyboardInterrupt:
101170
if accelerator.is_main_process:

utils/ExpConfigs.py

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from dataclasses import dataclass
1+
from dataclasses import dataclass, field
2+
from typing import Any, Optional
23

34
@dataclass
45
class ExpConfigs:
@@ -7,6 +8,13 @@ class ExpConfigs:
78
89
Make sure to update this dataclass after adding new args in argparse
910
'''
11+
@classmethod
12+
def get_sweep_values(cls, attr_name: str) -> Optional[list]:
13+
for field_info in cls.__dataclass_fields__.values():
14+
if field_info.name == attr_name:
15+
return field_info.metadata.get('sweep')
16+
return None
17+
1018
# basic config
1119
task_name: str
1220
is_training: int
@@ -85,10 +93,10 @@ class ExpConfigs:
8593
enc_in: int
8694
dec_in: int
8795
c_out: int
88-
d_model: int
96+
d_model: int = field(metadata={"sweep": [32, 64, 128, 256]})
8997
d_timesteps: int
9098
n_heads: int
91-
n_layers: int
99+
n_layers: int = field(metadata={"sweep": [1, 2, 3, 4]})
92100
e_layers: int
93101
d_layers: int
94102
hidden_layers: int
@@ -155,3 +163,42 @@ class ExpConfigs:
155163
patch_len_max_irr: int | None = None # maximum number of observations along time dimension in a patch of x, set in irregular time series datasets
156164
subfolder_train: str = "" # timestamp of training in format %Y_%m%d_%H%M
157165
itr_i: int = 0 # current training iteration. [0, itr-1]
166+
167+
class ExpConfigsTracker:
168+
"""Wrapper that tracks which ExpConfigs attributes are accessed"""
169+
170+
def __init__(self, configs: ExpConfigs):
171+
object.__setattr__(self, '_config', configs)
172+
object.__setattr__(self, '_accessed_attrs', set())
173+
174+
def __getattr__(self, name: str) -> Any:
175+
if hasattr(self._config, name):
176+
self._accessed_attrs.add(name)
177+
return getattr(self._config, name)
178+
raise AttributeError(f"'{type(self._config).__name__}' object has no attribute '{name}'")
179+
180+
def __setattr__(self, name: str, value: Any) -> None:
181+
if name.startswith('_'):
182+
object.__setattr__(self, name, value)
183+
else:
184+
self._accessed_attrs.add(name)
185+
setattr(self._config, name, value)
186+
187+
def get_accessed_attributes(self) -> set[str]:
188+
"""Return set of accessed attribute names"""
189+
return self._accessed_attrs.copy()
190+
191+
def get_unused_attributes(self) -> set[str]:
192+
"""Return set of unused attribute names"""
193+
all_attrs = {field.name for field in self._config.__dataclass_fields__.values()}
194+
return all_attrs - self._accessed_attrs
195+
196+
def print_access_report(self):
197+
"""Print a report of accessed vs unused attributes"""
198+
accessed = self.get_accessed_attributes()
199+
unused = self.get_unused_attributes()
200+
201+
print("=== ExpConfigs Access Report ===")
202+
print(f"Accessed attributes ({len(accessed)}):")
203+
for attr in sorted(accessed):
204+
print(f" ✓ {attr}")

0 commit comments

Comments
 (0)