
Commit 292795a

add docs
1 parent 2aa5f37 commit 292795a

8 files changed: +74 additions, −43 deletions

docs/sphinx_doc/source/tutorial/trinity_configs.md

Lines changed: 7 additions & 2 deletions

@@ -313,7 +313,7 @@ Controls the rollout models and workflow execution.
 ```yaml
 explorer:
   name: explorer
-  runner_num: 32
+  runner_per_model: 8
   max_timeout: 900
   max_retry_times: 2
   env_vars: {}
@@ -324,17 +324,22 @@ explorer:
   auxiliary_models:
     - model_path: /PATH/TO/MODEL
       tensor_parallel_size: 1
+  eval_interval: 100
+  eval_on_startup: True
 ```

 - `name`: Name of the explorer. This name will be used as the Ray actor's name, so it must be unique.
-- `runner_num`: Number of parallel workflow runners.
+- `runner_per_model`: Number of parallel workflow runners per rollout model.
 - `max_timeout`: Maximum time (in seconds) for a workflow to complete.
 - `max_retry_times`: Maximum number of retries for a workflow.
 - `env_vars`: Environment variables to be set for every workflow runner.
 - `rollout_model.engine_type`: Type of inference engine. Options: `vllm_async` (recommended), `vllm`.
 - `rollout_model.engine_num`: Number of inference engines.
 - `rollout_model.tensor_parallel_size`: Degree of tensor parallelism.
 - `auxiliary_models`: Additional models used for custom workflows.
+- `eval_interval`: Interval (in steps) for evaluating the model.
+- `eval_on_startup`: Whether to evaluate the model on startup. More precisely, the evaluation runs at step 0 with the original model, so it is not triggered when restarting.
+- `runner_num`: (*Deprecated*) Number of parallel workflow runners.

 ---

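For orientation, the sketch below shows how `runner_per_model` would translate into overall rollout concurrency if the total number of workflow runners equals `engine_num * runner_per_model`; this relation is implied by the option name but not spelled out in the commit, and the dataclasses are simplified, hypothetical stand-ins rather than the real Trinity config classes.

```python
from dataclasses import dataclass


@dataclass
class RolloutModelSketch:
    """Hypothetical, trimmed-down stand-in for the rollout_model section."""

    engine_type: str = "vllm_async"
    engine_num: int = 2
    tensor_parallel_size: int = 1


@dataclass
class ExplorerSketch:
    """Hypothetical, trimmed-down stand-in for the explorer section."""

    name: str = "explorer"
    runner_per_model: int = 8
    max_timeout: int = 900
    max_retry_times: int = 2


def total_runners(explorer: ExplorerSketch, rollout: RolloutModelSketch) -> int:
    # Assumed relation: each rollout engine gets `runner_per_model` runners.
    return rollout.engine_num * explorer.runner_per_model


# With runner_per_model: 8 and engine_num: 2, this yields 16 concurrent runners.
print(total_runners(ExplorerSketch(), RolloutModelSketch()))
```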
tests/template/config.yaml

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ buffer:
   default_reward_fn_type: ''
 explorer:
   eval_interval: 100
-  runner_num: 4
+  runner_per_model: 8
   rollout_model:
     engine_type: vllm_async
     engine_num: 2

tests/trainer/trainer_test.py

Lines changed: 11 additions & 10 deletions

@@ -60,7 +60,7 @@ def test_trainer(self):
         self.config.buffer.explorer_input.eval_tasksets.append(
             get_unittest_dataset_config("copy_countdown", "test")
         )
-        self.config.trainer.save_interval = 4
+        self.config.trainer.save_interval = 6
         self.config.check_and_update()
         self.config.trainer.trainer_config.trainer.max_actor_ckpt_to_keep = 2
         self.config.trainer.trainer_config.trainer.max_critic_ckpt_to_keep = 2
@@ -84,24 +84,25 @@ def test_trainer(self):
         self.assertEqual(parser.metric_max_step(response_metrics[0]), 8)
         ray.shutdown(_exiting_interpreter=True)
         # check checkpoint
-        checkpoint_step_4, _ = get_checkpoint_dir_with_step_num(
+        checkpoint_step_6, _ = get_checkpoint_dir_with_step_num(
             checkpoint_root_path=self.config.checkpoint_job_dir,
             trainer_type=self.config.trainer.trainer_type,
-            step_num=4,
+            step_num=6,
         )
-        checkpoint_step_8, _ = get_checkpoint_dir_with_step_num(
+        # check that the latest checkpoint is saved
+        checkpoint_step_8, step_num = get_checkpoint_dir_with_step_num(
             checkpoint_root_path=self.config.checkpoint_job_dir,
             trainer_type=self.config.trainer.trainer_type,
-            step_num=8,
         )
-        self.assertTrue(os.path.exists(checkpoint_step_4))
-        self.assertTrue(os.path.exists(checkpoint_step_8))
+        self.assertTrue(len(os.listdir(os.path.join(checkpoint_step_6, "actor"))) > 0)
+        self.assertTrue(len(os.listdir(os.path.join(checkpoint_step_8, "actor"))) > 0)
+        self.assertEqual(step_num, 8)
         # TODO: Reinit will fail when using v1 engine, find a way to fix it
         ray.init(ignore_reinit_error=True)
         # test bench mode
         self.config.mode = "bench"
         self.config.synchronizer.sync_method = SyncMethod.CHECKPOINT
-        self.config.explorer.eval_on_latest_checkpoint = False
+        self.config.explorer.bench_on_latest_checkpoint = False
         self.config.check_and_update()
         bench(self.config)
         parser = TensorBoardParser(os.path.join(self.config.monitor.cache_dir, "tensorboard"))
@@ -116,7 +117,8 @@ def test_trainer(self):

     def tearDown(self):
         # remove dir only when the test passed
-        shutil.rmtree(self.config.checkpoint_job_dir)
+        # shutil.rmtree(self.config.checkpoint_job_dir)
+        pass


 class TestStepAheadAsyncRL(BaseTrainerCase):
@@ -328,7 +330,6 @@ def test_fully_async_mode(self):
         config.cluster.node_num = 1
         explorer1_config.explorer.rollout_model.engine_num = 1
         explorer1_config.explorer.rollout_model.tensor_parallel_size = 1
-        explorer1_config.explorer.runner_num = 4
         explorer1_config.buffer.explorer_output = StorageConfig(
             name="exp_buffer",
             storage_type=StorageType.QUEUE,

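The updated test calls `get_checkpoint_dir_with_step_num` without `step_num` and expects both the latest checkpoint directory and its step number back. That helper is not part of this diff; the stand-in below is only a rough illustration of that behavior, and the `global_step_<n>` directory layout is an assumption, not the project's actual naming scheme.

```python
import os
import re
from typing import Optional, Tuple


def find_checkpoint_dir(checkpoint_root_path: str, step_num: Optional[int] = None) -> Tuple[str, int]:
    """Illustrative stand-in: resolve a checkpoint dir by step, or the latest one."""
    steps = {}
    for entry in os.listdir(checkpoint_root_path):
        match = re.fullmatch(r"global_step_(\d+)", entry)  # assumed directory layout
        if match:
            steps[int(match.group(1))] = os.path.join(checkpoint_root_path, entry)
    if not steps:
        raise FileNotFoundError(f"no checkpoints under {checkpoint_root_path}")
    chosen = step_num if step_num is not None else max(steps)
    return steps[chosen], chosen
```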
trinity/common/config.py

Lines changed: 8 additions & 5 deletions

@@ -301,12 +301,12 @@ class ExplorerConfig:
     name: str = EXPLORER_NAME
     # for workflow runner
     # number of workflow runners.
-    # For sync engine (vllm), it should be equal to `engine_num`.
-    # For async engine (vllm_async), it can be larger than `engine_num`, e.g. 16 * `engine_num`
-    runner_num: int = 1
+    # For sync engine (vllm), it should be `1`.
+    # For async engine (vllm_async), it could be a large number.
+    runner_per_model: int = 8  # number of runners per rollout model
     max_timeout: int = 900  # wait each task for 15 minutes
     max_retry_times: int = 2  # retry each task for 2 times if it fails or timeout
-    runner_per_model: int = 8
+    runner_num: Optional[int] = None  # deprecated, use `runner_per_model` instead

     # for inference models
     # for rollout model
@@ -316,7 +316,10 @@ class ExplorerConfig:

     # for evaluation
     eval_interval: int = 100
-    eval_on_latest_checkpoint: bool = False
+    eval_on_startup: bool = True  # evaluate at step 0
+
+    # for benchmark
+    bench_on_latest_checkpoint: bool = False  # only benchmark the latest checkpoint


 @dataclass

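The dataclass now keeps `runner_num` only as a deprecated `Optional[int]`; how the old value is migrated is not shown in this commit. A purely illustrative `__post_init__` hook (the warning text and behavior are assumptions) might look like:

```python
import warnings
from dataclasses import dataclass
from typing import Optional


@dataclass
class ExplorerConfigSketch:
    """Trimmed-down stand-in showing only the runner-related fields."""

    runner_per_model: int = 8
    runner_num: Optional[int] = None  # deprecated

    def __post_init__(self) -> None:
        # Assumed behavior: warn when a config still sets the deprecated field.
        if self.runner_num is not None:
            warnings.warn(
                "`runner_num` is deprecated; configure `runner_per_model` instead.",
                DeprecationWarning,
            )
```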
trinity/explorer/explorer.py

Lines changed: 9 additions & 7 deletions

@@ -39,7 +39,7 @@ def __init__(self, config: Config):
         self.cache = CacheManager(config)
         explorer_meta = self.cache.load_explorer()
         self.explore_step_num = explorer_meta.get("latest_iteration", 0)
-        self.last_sync_step = self.explore_step_num
+        self.last_sync_step = self.explore_step_num if self.explore_step_num > 0 else -1
         self.config = config
         self.algorithm_manager = AlgorithmManager(config)
         self.models, self.auxiliary_models = create_inference_models(config)
@@ -169,6 +169,8 @@ async def prepare(self) -> None:
             asyncio.create_task(self.setup_weight_sync_group(master_address, master_port))
         )
         asyncio.gather(*futures, return_exceptions=True)
+        if self.config.explorer.eval_on_startup and self.explore_step_num == 0:
+            self.eval()

     async def get_weight(self, name: str) -> torch.Tensor:
         """Get the weight of the loaded model (For checkpoint weights update)."""
@@ -177,21 +179,21 @@ async def get_weight(self, name: str) -> torch.Tensor:
     async def explore(self) -> str:
         """
         The timeline of the exploration process:
-        explorer | <--------------------------------- one period -------------------------------------> |
-                 | <------------------------------ eval -------------------------------> | <-- sync --> |
-                 | <---------------- step_1 --------------> |                                           |
+                 | <--------------------------------- one period -------------------------------------> |
+        explorer | <---------------- step_1 --------------> |                                           |
                  |           | <---------------- step_2 --------------> |                               |
                  |                                  ...                                                 |
                  |                          | <---------------- step_n ---------------> |               |
                  |                          | <---------------------- eval --------------------> | <-- sync --> |
-        trainer  |--------------------------------------------------------------------------------------|
-                 | <-- idle --> | <-- step_1 --> | <-- step_2 --> | ... | <-- step_n --> | <-- sync --> |
+                 |--------------------------------------------------------------------------------------|
+        trainer  | <-- idle --> | <-- step_1 --> | <-- step_2 --> | ... | <-- step_n --> | <-- sync --> |
         """
         while True:
             try:
                 self.logger.info(f"Explore step {self.explore_step_num + 1} started.")
                 explore_contionue = await self.explore_step()
                 if not explore_contionue:
+                    # TODO: support eval on the last checkpoint
                     break
                 if self.need_eval():
                     self.eval()
@@ -253,7 +255,7 @@ def eval(self):
     async def benchmark(self) -> bool:
         """Benchmark the model checkpoints."""
         # benchmark on the latest checkpoint
-        if self.config.explorer.eval_on_latest_checkpoint:
+        if self.config.explorer.bench_on_latest_checkpoint:
            self.explore_step_num = await self._checkpoint_weights_update()
            self.eval()
            await self._log_eval_metrics()

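`prepare` now triggers an evaluation when `eval_on_startup` is set and the explorer starts from step 0, while periodic evaluation still goes through `need_eval`, which is not shown in this diff. The sketch below mirrors that gating; the modulo-based `need_eval` rule is an assumption.

```python
class EvalGatingSketch:
    """Illustrative only: mirrors the eval gating visible in the diff above."""

    def __init__(self, eval_interval: int = 100, eval_on_startup: bool = True):
        self.eval_interval = eval_interval
        self.eval_on_startup = eval_on_startup
        self.explore_step_num = 0  # restored from the explorer cache on restart

    def should_eval_on_startup(self) -> bool:
        # Step 0 means the original model; a restarted run resumes at > 0
        # and therefore skips the startup evaluation.
        return self.eval_on_startup and self.explore_step_num == 0

    def need_eval(self) -> bool:
        # Assumed periodic rule: evaluate every `eval_interval` explore steps.
        return self.explore_step_num > 0 and self.explore_step_num % self.eval_interval == 0


gate = EvalGatingSketch()
assert gate.should_eval_on_startup()
gate.explore_step_num = 100
assert gate.need_eval()
```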
trinity/explorer/scheduler.py

Lines changed: 28 additions & 10 deletions

@@ -1,6 +1,7 @@
 """Scheduler for rollout tasks."""

 import asyncio
+import re
 import time
 import traceback
 from collections import defaultdict, deque
@@ -89,6 +90,20 @@ def restart_runner(self):
         pass


+def sort_batch_id(batch_id: Union[int, str]):
+    """Priority key of a batch_id."""
+    # TODO: avoid sorting the batch_id every time
+    if isinstance(batch_id, int):
+        return (batch_id, 0)
+    else:
+        match = re.match(r"^(\d+)", batch_id)
+        if match:
+            num = int(match.group(1))
+            return (num, 1)
+        else:
+            return (float("inf"), 1)
+
+
 class Scheduler:
     """Scheduler for rollout tasks."""

@@ -112,9 +127,14 @@ def __init__(
         self.idle_runners = set()  # runner_id
         self.busy_runners = dict()  # runner_id -> (task, batch_id)

-        self.pending_tasks: Dict[str, deque] = defaultdict(deque)  # batch_id -> tasks
-        self.running_tasks: Dict[str, set[asyncio.Future]] = defaultdict(set)  # batch_id -> futures
-        self.completed_tasks: Dict[str, deque[Status]] = defaultdict(deque)  # batch_id -> results
+        self.pending_tasks_heap = []
+        self.pending_tasks: Dict[Union[int, str], deque] = defaultdict(deque)  # batch_id -> tasks
+        self.running_tasks: Dict[Union[int, str], set[asyncio.Future]] = defaultdict(
+            set
+        )  # batch_id -> futures
+        self.completed_tasks: Dict[Union[int, str], deque[Status]] = defaultdict(
+            deque
+        )  # batch_id -> results

         self.scheduler_task: Optional[asyncio.Task] = None
         self.running = False
@@ -168,7 +188,7 @@ async def _schedule_pending_tasks(self) -> None:
             return

         # TODO: Support more advanced scheduling strategies
-        for batch_id in sorted(self.pending_tasks.keys()):
+        for batch_id in sorted(self.pending_tasks.keys(), key=sort_batch_id):
             task_queue = self.pending_tasks[batch_id]

             while task_queue and self.idle_runners:
@@ -205,7 +225,7 @@ async def _check_completed_tasks(self) -> None:
             if not futures:
                 del self.running_tasks[batch_id]

-    def _clear_timeout_tasks(self, batch_id: str) -> None:
+    def _clear_timeout_tasks(self, batch_id: Union[int, str]) -> None:
         if batch_id in self.pending_tasks:
             self.logger.info(f"Clear timeout pending tasks at batch_id {batch_id}.")
             del self.pending_tasks[batch_id]
@@ -252,11 +272,11 @@ def schedule(self, tasks: List[Task], batch_id: Union[int, str]) -> None:

         Args:
             tasks (`List[Task]`): The tasks to schedule.
-            batch_id (`Union[int, str]`): The id of provided tasks.
+            batch_id (`Union[int, str]`): The id of the provided tasks. It should be an integer or a string
+                starting with an integer (e.g., 123, "123/my_task").
         """
         if not tasks:
             return
-        batch_id = str(batch_id)
         for task in tasks:
             self.pending_tasks[batch_id].appendleft(task)
@@ -276,7 +296,6 @@ async def get_results(
             clear_timeout_tasks (`bool`): Whether to clear timeout tasks.
         """
         timeout = timeout or self.timeout
-        batch_id = str(batch_id)
         start_time = time.time()
         if min_num is None:
             min_num = 0
@@ -320,7 +339,6 @@ async def get_results(
         return results

     def has_step(self, batch_id: Union[int, str]) -> bool:
-        batch_id = str(batch_id)
         return (
             batch_id in self.completed_tasks
             or batch_id in self.pending_tasks
@@ -353,8 +371,8 @@ async def wait_all(
             running_count = sum(len(futures) for futures in self.running_tasks.values())

             self.logger.debug(f"Pending tasks: {pending_count}, Running tasks: {running_count}")
-
             await asyncio.sleep(0.1)
+
             pending_count = sum(len(tasks) for tasks in self.pending_tasks.values())
             running_count = sum(len(futures) for futures in self.running_tasks.values())
         error_msg = f"Timeout after {timeout} seconds. Still have {pending_count} pending tasks and {running_count} running tasks."

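Because `schedule`, `get_results`, and `has_step` no longer coerce `batch_id` to `str`, the new `sort_batch_id` key is what keeps mixed integer and string ids in step order. A small standalone usage example (the function body is a lightly condensed copy of the one added above):

```python
import re
from typing import Union


def sort_batch_id(batch_id: Union[int, str]):
    """Integer ids sort before string ids that share the same numeric prefix."""
    if isinstance(batch_id, int):
        return (batch_id, 0)
    match = re.match(r"^(\d+)", batch_id)
    if match:
        return (int(match.group(1)), 1)
    return (float("inf"), 1)


batch_ids = ["2/eval", 10, "10/bench", 2, "no_prefix"]
print(sorted(batch_ids, key=sort_batch_id))
# [2, '2/eval', 10, '10/bench', 'no_prefix']
```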
trinity/manager/config_manager.py

Lines changed: 6 additions & 4 deletions

@@ -199,9 +199,11 @@ def _expert_buffer_part(self):
     def _expert_explorer_part(self):
         self.get_configs("sync_method", "sync_interval", "sync_timeout")

-        self.get_configs("runner_num", "max_timeout", "explorer_max_retry_times", "eval_interval")
+        self.get_configs(
+            "runner_per_model", "max_timeout", "explorer_max_retry_times", "eval_interval"
+        )

-        self.get_configs("eval_on_latest_checkpoint")
+        self.get_configs("bench_on_latest_checkpoint")

         with st.expander("Rollout Model Config", expanded=True):
             self.get_configs("engine_type", "engine_num", "tensor_parallel_size")
@@ -571,7 +573,7 @@ def _gen_buffer_config(self):

     def _gen_explorer_config(self):
         explorer_config = {
-            "runner_num": st.session_state["runner_num"],
+            "runner_per_model": st.session_state["runner_per_model"],
             "max_timeout": st.session_state["max_timeout"],
             "max_retry_times": st.session_state["explorer_max_retry_times"],
             "rollout_model": {
@@ -584,7 +586,7 @@ def _gen_explorer_config(self):
             },
             "auxiliary_models": [],
             "eval_interval": st.session_state["eval_interval"],
-            "eval_on_latest_checkpoint": st.session_state["eval_on_latest_checkpoint"],
+            "bench_on_latest_checkpoint": st.session_state["bench_on_latest_checkpoint"],
         }
         for i in range(st.session_state["_auxiliary_models_num"]):
             auxiliary_model_config = {

trinity/manager/config_registry/explorer_config_manager.py

Lines changed: 4 additions & 4 deletions

@@ -9,9 +9,9 @@ def explorer_visible() -> bool:
     return st.session_state["mode"] == "both"


-@CONFIG_GENERATORS.register_config(default_value=32, visible=explorer_visible)
-def set_runner_num(**kwargs):
-    st.number_input("Runner Num", min_value=1, **kwargs)
+@CONFIG_GENERATORS.register_config(default_value=8, visible=explorer_visible)
+def set_runner_per_model(**kwargs):
+    st.number_input("Runner per Model", min_value=1, **kwargs)


 @CONFIG_GENERATORS.register_config(default_value=900, visible=explorer_visible)
@@ -30,7 +30,7 @@ def set_eval_interval(**kwargs):


 @CONFIG_GENERATORS.register_config(default_value=True, visible=explorer_visible)
-def set_eval_on_latest_checkpoint(**kwargs):
+def set_bench_on_latest_checkpoint(**kwargs):
     st.checkbox("Eval on Latest Checkpoint", **kwargs)