Skip to content

Commit 39ff2e4

Browse files
committed
add benchmark mode
1 parent 2086bec commit 39ff2e4

File tree

4 files changed

+34
-6
lines changed

4 files changed

+34
-6
lines changed

docs/sphinx_doc/source/tutorial/trinity_configs.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ global_config:
1010
total_epochs: 1
1111
batch_size: 96
1212
eval_interval: 1000
13+
eval_on_latest_ckp: true
1314
```
1415
1516
- `mode`: The mode of the experiment, chosen from `both`, `train`, `explore` or `bench`. `both` means both trainer and explorer are launched; `train` means only trainer is launched; `explore` means only explorer is launched; `bench` conducts benchmark evaluation. Default is `both`.
1617
- `global_config.total_epochs`: The total number of epochs. It should be checked manually.
1718
- `global_config.batch_size`: The batch size used for training. It should be checked manually.
1819
- `global_config.eval_interval`: The interval steps between two evaluations. Default is `1000`.
20+
- `global_config.eval_on_latest_ckp`: In bench mode, whether to evaluate only the latest checkpoint (`true`) or every checkpoint found in the checkpoint path (`false`). Default is `true`.
1921

2022

2123
## Monitor

trinity/cli/launcher.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import sys
44

55
import ray
6+
import wandb
67

78
from trinity.common.config import Config, load_config
89
from trinity.common.constants import AlgorithmType
@@ -19,11 +20,12 @@ def bench(config: Config) -> None:
1920
try:
2021
ray.get(explorer.prepare.remote())
2122
ray.get(explorer.sync_weight.remote())
22-
_, step = ray.get(explorer.eval.remote())
23-
logger.info("Evaluation finished.")
24-
ray.get(explorer.flush_log.remote(step=step))
23+
bm_finished, step = ray.get(explorer.benchmark.remote())
24+
logger.info("Benchmark finished.")
25+
if bm_finished:
26+
ray.get(explorer.flush_log.remote(step=step))
2527
except Exception as e:
26-
logger.error(f"Evaluation failed: {e}")
28+
logger.error(f"Benchmark failed: {e}")
2729
raise e
2830

2931

@@ -168,6 +170,9 @@ def run(config_path: str):
168170
elif config.mode == "bench":
169171
bench(config)
170172

173+
if config.monitor.monitor_type == "wandb":
174+
wandb.finish()
175+
171176

172177
def studio(port: int = 8501):
173178
from streamlit.web import cli as stcli

trinity/common/config.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ class GlobalConfig:
107107
total_epochs: int = 1
108108
batch_size: int = 1
109109
eval_interval: int = 100
110+
eval_on_latest_ckp: bool = True
110111

111112

112113
@dataclass
@@ -299,7 +300,8 @@ def _check_interval(self) -> None:
299300

300301
# check eval_interval
301302
if (
302-
self.trainer.algorithm_type != AlgorithmType.DPO
303+
self.mode != "bench"
304+
and self.trainer.algorithm_type != AlgorithmType.DPO
303305
and self.global_config.eval_interval % self.synchronizer.sync_interval != 0
304306
):
305307
self.global_config.eval_interval = (
@@ -316,7 +318,7 @@ def _check_interval(self) -> None:
316318
):
317319
if self.trainer.save_interval != self.synchronizer.sync_interval:
318320
logger.warning(
319-
f"When `trainer.algorithm_type != DPO` and `synchronizer.sync_method == checkpoint`, "
321+
f"When `trainer.algorithm_type` != `DPO` and `synchronizer.sync_method` == `checkpoint`, "
320322
f"`trainer.save_interval` will be set to "
321323
f"`synchronizer.sync_interval = {self.synchronizer.sync_interval}`."
322324
)

trinity/explorer/explorer.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,25 @@ def wait():
261261
self.monitor.log(log_metrics, step=self.step_num) # type: ignore
262262
return True, self.step_num
263263

264+
def benchmark(self) -> Tuple[bool, int]:
265+
"""Benchmark the model checkpoints."""
266+
latest_step = self.step_num
267+
268+
# benchmark on the latest checkpoint
269+
if self.config.global_config.eval_on_latest_ckp:
270+
self.eval()
271+
return True, self.step_num
272+
273+
# benchmark on all checkpoints
274+
for step_num in range(latest_step + 1):
275+
path = os.path.join(self.config.model.checkpoint_path, f"global_step_{step_num}")
276+
if os.path.isdir(path) and os.listdir(path):
277+
self.logger.info(f"{path} exists.")
278+
self.step_num = step_num
279+
self._checkpoint_weights_update(step_num=step_num)
280+
self.eval()
281+
return True, self.step_num
282+
264283
def sync_weight(self) -> None:
265284
"""Synchronize model weights."""
266285
# call this method before training start to load the latest model weights

0 commit comments

Comments
 (0)