fix some comments

pan-x-c · pan-x-c · commit b8289adf378e · 2025-05-22T12:35:54.000+08:00
diff --git a/docs/sphinx_doc/source/tutorial/example_async_mode.md b/docs/sphinx_doc/source/tutorial/example_async_mode.md
@@ -7,7 +7,7 @@ Trinity-RFT supports an asynchronous mode by running the trainer and explorer in
 For this purpose, we prepare two main config files: `trainer.yaml` and `explorer.yaml`.
 The main difference between them is that in `trainer.yaml` we set `mode=train`, while in `explorer.yaml` we set `mode=explore`.
 In addition, we need to configure the following parameters in both files.
-The model weights of the explorer and trainer are synchronized once every `sync_iteration_interval * batch_size` tasks.
+The model weights of the explorer and trainer are synchronized once every `sync_interval * batch_size` tasks.
 
 ```yaml
 project: tutorial
@@ -24,7 +24,7 @@ buffer:
 
 synchronizer:
   sync_method: 'checkpoint'
-  sync_interval: <sync_iteration_interval>
+  sync_interval: <sync_interval>
 ```
 
 You may run this example with the following command:
diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -5,16 +5,16 @@ The following is the main config file for Trinity-RFT. Take `countdown.yaml` as
 ## Global Config
 
 ```yaml
-mode: both
 project: Trinity-RFT
 name: example
-checkpoint_root_dir: /PATH/TO/CHECKPOINT_DIR
+mode: both
+checkpoint_root_dir: /PATH/TO/CHECKPOINT
 ```
 
-- `mode`: The mode of the experiment, chosen from `both`, `train`, `explore` or `bench`. `both` means both trainer and explorer are launched; `train` means only trainer is launched; `explore` means only explorer is launched; `bench` conducts benchmark evaluation. Default is `both`.
 - `project`: The name of the project.
 - `name`: The name of the experiment.
-- `checkpoint_root_dir`: The root directory of the checkpoint.
+- `mode`: The mode of the experiment, chosen from `both`, `train`, `explore` or `bench`. `both` means both trainer and explorer are launched; `train` means only trainer is launched; `explore` means only explorer is launched; `bench` conducts benchmark evaluation. Default is `both`.
+- `checkpoint_root_dir`: The root directory to save the checkpoints. Specifically, the generated checkpoints will be saved in `<checkpoint_root_dir>/<project>/<name>/`.
 
 ## Algorithm
 
@@ -24,7 +24,7 @@ algorithm:
   repeat_times: 1
 ```
 
-- `algorithm.algorithm_type`: The type of the algorithm, Support `ppo`, `grpo`, `opmd` and `dpo`.
+- `algorithm.algorithm_type`: The type of the algorithm. Supports `ppo`, `grpo`, `opmd` and `dpo`.
 - `algorithm.repeat_times`: The number of times to repeat each task. Used for GRPO-like algorithm. Default is `1`.
 
 ## Monitor
diff --git a/examples/grpo_alfworld/alfworld.yaml b/examples/grpo_alfworld/alfworld.yaml
@@ -6,6 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
+  max_prompt_tokens: 4096
+  max_response_tokens: 16384
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -39,8 +41,6 @@ explorer:
     tensor_parallel_size: 2
     enable_prefix_caching: false
     enforce_eager: true
-    max_prompt_tokens: 4096
-    max_response_tokens: 16384
     dtype: bfloat16
     seed: 42
     gpu_memory_utilization: 0.7
diff --git a/examples/grpo_gsm8k/gsm8k.yaml b/examples/grpo_gsm8k/gsm8k.yaml
@@ -20,6 +20,8 @@ data_processor:
 
 model:
   model_path: '/PATH/TO/MODEL/'
+  max_prompt_tokens: 256
+  max_response_tokens: 1024
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -71,8 +73,6 @@ explorer:
     enable_prefix_caching: false
     enforce_eager: true
     dtype: bfloat16
-    max_prompt_tokens: 256
-    max_response_tokens: 1024
     seed: 42
 synchronizer:
   sync_method: 'nccl'
diff --git a/examples/grpo_sciworld/sciworld.yaml b/examples/grpo_sciworld/sciworld.yaml
@@ -6,6 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
+  max_prompt_tokens: 4096
+  max_response_tokens: 16384
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -41,8 +43,6 @@ explorer:
     enforce_eager: true
     dtype: bfloat16
     seed: 42
-    max_prompt_tokens: 4096
-    max_response_tokens: 16384
     gpu_memory_utilization: 0.7
     enable_chunked_prefill: true
 synchronizer:
diff --git a/examples/grpo_webshop/webshop.yaml b/examples/grpo_webshop/webshop.yaml
@@ -6,6 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
+  max_prompt_tokens: 4096
+  max_response_tokens: 16384
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -40,8 +42,6 @@ explorer:
     enable_prefix_caching: false
     enforce_eager: true
     dtype: bfloat16
-    max_prompt_tokens: 4096
-    max_response_tokens: 16384
     seed: 42
     gpu_memory_utilization: 0.7
     enable_chunked_prefill: true
diff --git a/examples/opmd_gsm8k/opmd_gsm8k.yaml b/examples/opmd_gsm8k/opmd_gsm8k.yaml
@@ -6,6 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
+  max_prompt_tokens: 4096
+  max_response_tokens: 16384
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -40,8 +42,6 @@ explorer:
     tensor_parallel_size: 1
     enable_prefix_caching: false
     enforce_eager: true
-    max_prompt_tokens: 4096
-    max_response_tokens: 16384
     dtype: bfloat16
     seed: 42
 synchronizer:
diff --git a/tests/template/config.yaml b/tests/template/config.yaml
@@ -7,6 +7,8 @@ algorithm:
   repeat_times: 1
 model:
   model_path: ''
+  max_prompt_tokens: 2048
+  max_response_tokens: 2048
 cluster:  # 2 for explorer, 2 for trainer
   node_num: 1
   gpu_per_node: 4
@@ -33,8 +35,6 @@ explorer:
     enable_prefix_caching: false
     enforce_eager: true
     dtype: bfloat16
-    max_prompt_tokens: 2048
-    max_response_tokens: 2048
     seed: 42
     use_v1: true
 trainer:
diff --git a/trinity/common/config.py b/trinity/common/config.py
@@ -118,8 +118,8 @@ class ModelConfig:
     # source model path
     model_path: str = ""
     critic_model_path: str = ""
-    max_prompt_tokens: int = 2048
-    max_response_tokens: int = 2048
+    max_prompt_tokens: Optional[int] = None
+    max_response_tokens: Optional[int] = None
 
 
 @dataclass
@@ -130,14 +130,14 @@ class InferenceModelConfig:
     engine_num: int = 1
     tensor_parallel_size: int = 1
     use_v1: bool = True
-    max_prompt_tokens: int = 2048
-    max_response_tokens: int = 2048
     enforce_eager: bool = True
     enable_prefix_caching: bool = False
     enable_chunked_prefill: bool = False
     gpu_memory_utilization: float = 0.9
     dtype: str = "bfloat16"
     seed: int = 42
+    max_prompt_tokens: Optional[int] = None
+    max_response_tokens: Optional[int] = None
     # override chat template in model
     chat_template: Optional[str] = None
     # For Qwen3
@@ -478,6 +478,10 @@ def check_and_update(self) -> None:  # noqa: C901
             and self.explorer.rollout_model.enable_openai_api
         ):
             raise ValueError("OpenAI API server only support `vllm_async` engine.")
+        if self.explorer.rollout_model.max_prompt_tokens is None:
+            self.explorer.rollout_model.max_prompt_tokens = self.model.max_prompt_tokens
+        if self.explorer.rollout_model.max_response_tokens is None:
+            self.explorer.rollout_model.max_response_tokens = self.model.max_response_tokens
 
         # check synchronizer
         self.synchronizer.explorer_world_size = (
diff --git a/trinity/common/models/vllm_async_model.py b/trinity/common/models/vllm_async_model.py
@@ -64,14 +64,17 @@ def __init__(
         )
         self.enable_thinking = config.enable_thinking
         self.request_id = 0
+        max_model_len = None
+        if config.max_prompt_tokens is not None and config.max_response_tokens is not None:
+            max_model_len = config.max_prompt_tokens + config.max_response_tokens
         engine_args = vllm.AsyncEngineArgs(
             model=config.model_path,
             enforce_eager=config.enforce_eager,
             worker_extension_cls="trinity.common.models.vllm_worker.WorkerExtension",
             tensor_parallel_size=config.tensor_parallel_size,
             seed=config.seed,
             distributed_executor_backend=("uni" if config.tensor_parallel_size == 1 else "ray"),
-            max_model_len=config.max_prompt_tokens + config.max_response_tokens,
+            max_model_len=max_model_len,
             enable_prefix_caching=config.enable_prefix_caching,
             dtype=config.dtype,
             trust_remote_code=True,
diff --git a/trinity/common/models/vllm_model.py b/trinity/common/models/vllm_model.py
@@ -51,6 +51,9 @@ def __init__(self, config: InferenceModelConfig):
             include_stop_str_in_output=False,
             logprobs=0,
         )
+        max_model_len = None
+        if config.max_prompt_tokens is not None and config.max_response_tokens is not None:
+            max_model_len = config.max_prompt_tokens + config.max_response_tokens
         self.llm = LLM(
             # TODO: check checkpoint path
             model=config.model_path,
@@ -59,7 +62,7 @@ def __init__(self, config: InferenceModelConfig):
             tensor_parallel_size=config.tensor_parallel_size,
             seed=config.seed,
             distributed_executor_backend=("uni" if config.tensor_parallel_size == 1 else "ray"),
-            max_model_len=config.max_prompt_tokens + config.max_response_tokens,
+            max_model_len=max_model_len,
             enable_prefix_caching=config.enable_prefix_caching,
             dtype=config.dtype,
             trust_remote_code=True,
diff --git a/trinity/trainer/trainer.py b/trinity/trainer/trainer.py
@@ -14,7 +14,7 @@
 
 from trinity.buffer import get_buffer_reader
 from trinity.common.config import Config
-from trinity.common.constants import AlgorithmType, ReadStrategy, SyncMethod
+from trinity.common.constants import AlgorithmType, SyncMethod
 from trinity.common.experience import Experiences
 from trinity.utils.log import get_logger
 
@@ -75,7 +75,7 @@ def train_step(self, algo_type: AlgorithmType = AlgorithmType.PPO) -> Tuple[bool
         """
         self.engine.set_mode(algo_type)
         if algo_type.is_rft() and self.config.buffer.trainer_input.read_experience_strategy:
-            strategy = ReadStrategy(self.config.buffer.trainer_input.read_experience_strategy)
+            strategy = self.config.buffer.trainer_input.read_experience_strategy
         else:
             strategy = None
         try: