Skip to content

Commit 15f8a8b

Browse files
authored
Update verl to v0.4.1, vllm to v0.9.2 (#125)
1 parent 90f4e91 commit 15f8a8b

File tree

13 files changed

+215
-164
lines changed

13 files changed

+215
-164
lines changed

docs/sphinx_doc/source/tutorial/faq.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ ray start --head
9696

9797
**A:** The following parameters may be helpful:
9898

99-
- For trainer, adjust `actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu` when `actor_rollout_ref.actor.use_dynamic_bsz=false`; adjust `actor_rollout_ref.actor.ppo_max_token_len_per_gpu` and `actor_rollout_ref.actor.ulysses_sequence_parallel_size` when `actor_rollout_ref.actor.use_dynamic_bsz=true`.
99+
- For trainer, adjust `actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu` when `actor_rollout_ref.actor.use_dynamic_bsz=false`; adjust `actor_rollout_ref.actor.ppo_max_token_len_per_gpu` and `actor_rollout_ref.actor.ulysses_sequence_parallel_size` when `actor_rollout_ref.actor.use_dynamic_bsz=true`. Setting `actor_rollout_ref.actor.entropy_from_logits_with_chunking=true` may also help.
100100
- For explorer, adjust `explorer.rollout_model.tensor_parallel_size`,
101101

102102

docs/sphinx_doc/source/tutorial/trinity_configs.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,8 +443,11 @@ actor_rollout_ref:
443443
ppo_epochs: 1
444444
shuffle: False
445445
ulysses_sequence_parallel_size: 1 # sp size
446+
entropy_from_logits_with_chunking: false
447+
entropy_checkpointing: false
446448
checkpoint:
447-
contents: ['model', 'hf_model', 'optimizer', 'extra'] # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
449+
load_contents: ['model', 'optimizer', 'extra']
450+
save_contents: ['model', 'optimizer', 'extra']
448451
optim:
449452
lr: 1e-6
450453
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
@@ -458,17 +461,22 @@ actor_rollout_ref:
458461
param_offload: False
459462
optimizer_offload: False
460463
fsdp_size: -1
464+
forward_prefetch: False
461465
ref:
462466
fsdp_config:
463467
param_offload: False
464468
wrap_policy:
465469
# transformer_layer_cls_to_wrap: None
466470
min_num_params: 0
471+
fsdp_size: -1
472+
forward_prefetch: False
467473
# log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
468474
log_prob_micro_batch_size_per_gpu: 8
469475
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
470476
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
471477
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
478+
entropy_from_logits_with_chunking: ${actor_rollout_ref.actor.entropy_from_logits_with_chunking}
479+
entropy_checkpointing: ${actor_rollout_ref.actor.entropy_checkpointing}
472480
473481
critic:
474482
strategy: fsdp
@@ -490,6 +498,7 @@ critic:
490498
# transformer_layer_cls_to_wrap: None
491499
min_num_params: 0
492500
fsdp_size: -1
501+
forward_prefetch: False
493502
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
494503
ppo_micro_batch_size_per_gpu: 8
495504
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
@@ -523,6 +532,9 @@ trainer:
523532
- `actor_rollout_ref.actor.use_dynamic_bsz`: Whether to reorganize the batch data, specifically to splice the shorter data to reduce the batch size in the actual training process.
524533
- `actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`: Batch size for one GPU in one forward pass.
525534
- `actor_rollout_ref.actor.ulysses_sequence_parallel_size`: Ulysses sequence parallel size.
535+
- `actor_rollout_ref.actor.entropy_from_logits_with_chunking`: Calculate entropy from the logits in chunks to reduce the peak memory usage.
536+
- `actor_rollout_ref.actor.entropy_checkpointing`: Recompute entropy via activation checkpointing instead of storing it, trading extra compute for lower memory usage.
537+
- `actor_rollout_ref.actor.checkpoint`: Contents to be loaded and saved. Including 'hf_model' saves the whole model in Hugging Face format; by default only the sharded model checkpoint is kept to save space.
526538
- `actor_rollout_ref.actor.optim.lr`: Learning rate for actor model.
527539
- `actor_rollout_ref.actor.optim.lr_warmup_steps_ratio`: Ratio of warmup steps for learning rate.
528540
- `actor_rollout_ref.actor.optim.warmup_style`: Warmup style for learning rate.

examples/ppo_countdown/train_countdown.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ actor_rollout_ref:
1515
shuffle: False
1616
ulysses_sequence_parallel_size: 1 # sp size
1717
checkpoint:
18-
contents: ['model', 'hf_model', 'optimizer', 'extra'] # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
18+
load_contents: ['model', 'hf_model', 'optimizer', 'extra'] # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
19+
save_contents: ['model', 'hf_model', 'optimizer', 'extra']
1920
optim:
2021
lr: 1e-6
2122
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ classifiers = [
2121
]
2222
requires-python = ">=3.10"
2323
dependencies = [
24-
"verl==0.4.0",
24+
"verl==0.4.1",
2525
"ray[default]>=2.45.0",
26-
"vllm==0.9.1",
26+
"vllm==0.9.2",
2727
"tensordict==0.6.2",
2828
"wandb",
2929
"omegaconf",

tests/template/verl_config.yaml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,11 @@ actor_rollout_ref:
1515
ppo_epochs: 1
1616
shuffle: False
1717
ulysses_sequence_parallel_size: 1 # sp size
18+
entropy_from_logits_with_chunking: false
19+
entropy_checkpointing: false
1820
checkpoint:
19-
contents: ["model", "optimizer", "extra"] # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
21+
load_contents: ['model', 'optimizer', 'extra'] # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
22+
save_contents: ['model', 'optimizer', 'extra']
2023
optim:
2124
lr: 1e-6
2225
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
@@ -30,16 +33,21 @@ actor_rollout_ref:
3033
param_offload: False
3134
optimizer_offload: False
3235
fsdp_size: -1
36+
forward_prefetch: False
3337
ref:
3438
fsdp_config:
3539
param_offload: False
3640
wrap_policy:
3741
# transformer_layer_cls_to_wrap: None
3842
min_num_params: 0
43+
fsdp_size: -1
44+
forward_prefetch: False
3945
log_prob_micro_batch_size_per_gpu: 1
4046
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
4147
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
4248
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
49+
entropy_from_logits_with_chunking: ${actor_rollout_ref.actor.entropy_from_logits_with_chunking}
50+
entropy_checkpointing: ${actor_rollout_ref.actor.entropy_checkpointing}
4351

4452
critic:
4553
strategy: fsdp
@@ -61,6 +69,7 @@ critic:
6169
# transformer_layer_cls_to_wrap: None
6270
min_num_params: 0
6371
fsdp_size: -1
72+
forward_prefetch: False
6473
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
6574
ppo_micro_batch_size_per_gpu: 1
6675
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
@@ -73,7 +82,8 @@ critic:
7382
grad_clip: 1.0
7483
cliprange_value: 0.5
7584
checkpoint:
76-
contents: ["model", "optimizer", "extra"] # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
85+
load_contents: ['model', 'optimizer', 'extra'] # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
86+
save_contents: ['model', 'optimizer', 'extra']
7787

7888
trainer:
7989
balance_batch: True

trinity/buffer/reader/file_reader.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def __init__(self, meta: StorageConfig, config: BufferConfig):
111111
self.response_key = meta.format.response_key
112112
self.read_batch_size = config.batch_size
113113
self.dataset = _HFBatchReader(
114-
load_dataset(meta.path, name=subset_name, split=self.split, trust_remote_code=True),
114+
load_dataset(meta.path, name=subset_name, split=self.split),
115115
name=meta.name,
116116
default_batch_size=self.read_batch_size,
117117
total_epochs=meta.total_epochs,
@@ -193,7 +193,7 @@ def __init__(self, meta: StorageConfig, config: BufferConfig):
193193
self.rejected_key = meta.format.rejected_key
194194
self.read_batch_size = config.batch_size
195195
self.dataset = _HFBatchReader(
196-
load_dataset(meta.path, name=subset_name, split=self.split, trust_remote_code=True),
196+
load_dataset(meta.path, name=subset_name, split=self.split),
197197
name=meta.name,
198198
default_batch_size=self.read_batch_size,
199199
total_epochs=meta.total_epochs,
@@ -272,7 +272,7 @@ def __init__(self, meta: StorageConfig, config: BufferConfig):
272272
datasets.disable_caching()
273273
self.read_batch_size = config.batch_size
274274
self.dataset = _HFBatchReader(
275-
load_dataset(meta.path, name=subset_name, split=self.split, trust_remote_code=True),
275+
load_dataset(meta.path, name=subset_name, split=self.split),
276276
name=meta.name,
277277
default_batch_size=self.read_batch_size,
278278
total_epochs=self.meta.total_epochs if meta.task_type == TaskType.EXPLORE else 1,
@@ -328,9 +328,7 @@ def read(
328328
class RawDataReader(BufferReader):
329329
def __init__(self, meta: StorageConfig, config: Optional[BufferConfig]):
330330
self.returned = False
331-
self.dataset = load_dataset(
332-
meta.path, name=meta.subset_name, split=meta.split, trust_remote_code=True
333-
)
331+
self.dataset = load_dataset(meta.path, name=meta.subset_name, split=meta.split)
334332

335333
def __len__(self):
336334
return len(self.dataset)

trinity/common/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ class ModelConfig:
177177
critic_model_path: str = ""
178178
max_prompt_tokens: Optional[int] = None
179179
max_response_tokens: Optional[int] = None
180+
custom_chat_template: Optional[str] = None
180181

181182

182183
@dataclass

trinity/common/verl_config.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class ActorModel:
2424
enable_gradient_checkpointing: bool = True
2525
use_remove_padding: bool = False
2626
use_fused_kernels: bool = False
27+
custom_chat_template: Optional[str] = None
2728

2829

2930
@dataclass
@@ -49,11 +50,13 @@ class FSDPConfig:
4950
param_offload: bool = False
5051
optimizer_offload: bool = False
5152
fsdp_size: int = -1
53+
forward_prefetch: bool = False
5254

5355

5456
@dataclass
5557
class Checkpoint:
56-
contents: List[str] = field(default_factory=lambda: ["model", "hf_model", "optimizer", "extra"])
58+
load_contents: List[str] = field(default_factory=lambda: ["model", "optimizer", "extra"])
59+
save_contents: List[str] = field(default_factory=lambda: ["model", "optimizer", "extra"])
5760

5861

5962
@dataclass
@@ -70,6 +73,8 @@ class Actor:
7073
ppo_epochs: int = 1
7174
shuffle: bool = False
7275
ulysses_sequence_parallel_size: int = 1
76+
entropy_from_logits_with_chunking: bool = False
77+
entropy_checkpointing: bool = False
7378
checkpoint: Checkpoint = field(default_factory=Checkpoint)
7479
optim: Optim = field(default_factory=Optim)
7580
fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
@@ -90,6 +95,11 @@ class Ref:
9095
log_prob_use_dynamic_bsz: bool = False
9196
log_prob_max_token_len_per_gpu: int = 0
9297
ulysses_sequence_parallel_size: int = 1
98+
entropy_from_logits_with_chunking: bool = False
99+
entropy_checkpointing: bool = False
100+
checkpoint: Checkpoint = field(
101+
default_factory=lambda: Checkpoint(load_contents=["model"], save_contents=["model"])
102+
)
93103

94104

95105
@dataclass
@@ -309,6 +319,7 @@ def synchronize_config(self, config: Config) -> None: # noqa: C901
309319

310320
# Actor / Critic config
311321
self.actor_rollout_ref.model.path = config.model.model_path
322+
self.actor_rollout_ref.model.custom_chat_template = config.model.custom_chat_template
312323
self.critic.model.path = config.model.critic_model_path
313324
self.critic.model.tokenizer_path = config.model.critic_model_path
314325
self.actor_rollout_ref.actor.ppo_mini_batch_size = (

trinity/manager/config_manager.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ def _expert_verl_training_part(self):
257257

258258
self.get_configs("ppo_epochs", "training_strategy", "resume_mode")
259259

260-
self.get_configs("param_offload", "optimizer_offload")
260+
self.get_configs("param_offload", "optimizer_offload", "forward_prefetch")
261261
self.get_configs("resume_from_path")
262262

263263
with st.expander("Advanced Config"):
@@ -275,6 +275,8 @@ def _expert_verl_actor_part(self):
275275
"actor_ppo_micro_batch_size_per_gpu",
276276
"ref_log_prob_micro_batch_size_per_gpu",
277277
"actor_ulysses_sequence_parallel_size",
278+
"actor_entropy_from_logits_with_chunking",
279+
"actor_entropy_checkpointing",
278280
)
279281

280282
self.get_configs("actor_lr", "actor_warmup_style", "actor_lr_warmup_steps_ratio")
@@ -335,6 +337,7 @@ def _generate_verl_config(self):
335337
"param_offload": st.session_state["param_offload"],
336338
"optimizer_offload": st.session_state["optimizer_offload"],
337339
"fsdp_size": -1,
340+
"forward_prefetch": st.session_state["forward_prefetch"],
338341
}
339342
else:
340343
fsdp_config = {}
@@ -363,6 +366,10 @@ def _generate_verl_config(self):
363366
"ulysses_sequence_parallel_size": st.session_state[
364367
"actor_ulysses_sequence_parallel_size"
365368
],
369+
"entropy_from_logits_with_chunking": st.session_state[
370+
"actor_entropy_from_logits_with_chunking"
371+
],
372+
"entropy_checkpointing": st.session_state["actor_entropy_checkpointing"],
366373
"checkpoint": {"contents": st.session_state["actor_checkpoint"]},
367374
"optim": {
368375
"lr": st.session_state["actor_lr"],
@@ -386,6 +393,10 @@ def _generate_verl_config(self):
386393
"ulysses_sequence_parallel_size": st.session_state[
387394
"actor_ulysses_sequence_parallel_size"
388395
],
396+
"entropy_from_logits_with_chunking": st.session_state[
397+
"actor_entropy_from_logits_with_chunking"
398+
],
399+
"entropy_checkpointing": st.session_state["actor_entropy_checkpointing"],
389400
},
390401
},
391402
"critic": {},

trinity/manager/config_registry/trainer_config_manager.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,11 @@ def set_optimizer_offload(**kwargs):
114114
st.checkbox("FSDP Optimizer Offload", **kwargs)
115115

116116

117+
@CONFIG_GENERATORS.register_config(default_value=False, visible=use_fsdp)
118+
def set_forward_prefetch(**kwargs):
119+
st.checkbox("FSDP Forward Prefetch", **kwargs)
120+
121+
117122
@CONFIG_GENERATORS.register_config(default_value="auto")
118123
def set_resume_mode(**kwargs):
119124
st.selectbox("Resume Mode", ["disable", "auto", "resume_path"], **kwargs)
@@ -235,6 +240,16 @@ def set_actor_ulysses_sequence_parallel_size(**kwargs):
235240
)
236241

237242

243+
@CONFIG_GENERATORS.register_config(default_value=False)
244+
def set_actor_entropy_from_logits_with_chunking(**kwargs):
245+
st.checkbox("Entropy from Logits with Chunking", **kwargs)
246+
247+
248+
@CONFIG_GENERATORS.register_config(default_value=False)
249+
def set_actor_entropy_checkpointing(**kwargs):
250+
st.checkbox("Entropy Checkpointing", **kwargs)
251+
252+
238253
@CONFIG_GENERATORS.register_config(default_value=1e-6)
239254
def set_actor_lr(**kwargs):
240255
st.number_input(

0 commit comments

Comments
 (0)