Commit 552fe2d

1. Implement serial save.
2. No longer set `max_model_len` from the model's `config.json`.
1 parent 894858b

File tree: 4 files changed (+20, −18 lines)

trinity/common/config.py

Lines changed: 1 addition & 15 deletions
```diff
@@ -843,21 +843,7 @@ def _check_model(self) -> None:
                 f"`max_model_len` is set to {model.max_model_len} from `max_prompt_tokens` and `max_response_tokens`."
             )
         else:
-            from transformers import AutoConfig, AutoTokenizer
-            from transformers.tokenization_utils_base import LARGE_INTEGER
-
-            tokenizer = AutoTokenizer.from_pretrained(model.model_path)
-            config = AutoConfig.from_pretrained(model.model_path)
-            max_model_len = min(
-                getattr(tokenizer, "model_max_length", LARGE_INTEGER),
-                getattr(config, "max_position_embeddings", LARGE_INTEGER),
-            )
-            if max_model_len >= LARGE_INTEGER:
-                max_model_len = MAX_MODEL_LEN
-                logger.warning(
-                    f"Failed to get `max_model_len` from model {model.model_path}, use {MAX_MODEL_LEN} instead."
-                )
-            model.max_model_len = max_model_len
+            raise ValueError("Unable to determine `max_model_len`, please set it manually.")
 
         # both max_prompt_tokens and max_response_tokens are None
         if model.max_prompt_tokens is None and model.max_response_tokens is None:
```
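With the fallback gone, a config that leaves `max_model_len` (and the token budgets) unset now fails fast with a `ValueError` instead of silently inferring a length from the checkpoint. For reference, the removed inference is equivalent to this standalone helper, a sketch assembled from the deleted lines; `fallback` stands in for the project's `MAX_MODEL_LEN` constant, whose value is not shown in this diff:

```python
from transformers import AutoConfig, AutoTokenizer
from transformers.tokenization_utils_base import LARGE_INTEGER


def infer_max_model_len(model_path: str, fallback: int = 4096) -> int:
    """Reproduce the removed fallback: take the tighter of the tokenizer's
    and the model config's length limits, or `fallback` if neither is set."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    config = AutoConfig.from_pretrained(model_path)
    max_model_len = min(
        getattr(tokenizer, "model_max_length", LARGE_INTEGER),
        getattr(config, "max_position_embeddings", LARGE_INTEGER),
    )
    # Tokenizers report LARGE_INTEGER (a huge sentinel) when no limit is set,
    # so a value at or above it means "no usable limit was found".
    return fallback if max_model_len >= LARGE_INTEGER else max_model_len
```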

trinity/trainer/verl/fsdp_checkpoint_manager.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -115,6 +115,7 @@ def _save_with_thread(
             thread.join()
 
         def _save():
+            ray.get(self.checkpoint_monitor.notify_started.remote())
             torch.save(obj, path)
             log_with_rank(
                 f"Saved {prefix} to {os.path.abspath(path)}",
@@ -357,6 +358,7 @@ def save_checkpoint( # noqa: C901
             self._save_model_thread.join()
 
         def _save_model():
+            ray.get(self.checkpoint_monitor.notify_started.remote())
             save_model.save_pretrained(hf_local_path, state_dict=state_dict)
             log_with_rank(
                 f"Saved hf_model to {os.path.abspath(hf_local_path)}",
```

trinity/trainer/verl/megatron_checkpoint_manager.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -125,6 +125,7 @@ def _save_state_dict(self, local_path, global_step):
 
         def finalize_save_fn():
             # Rank 0 uploads checkpoint to HDFS if hdfs_path is provided
+            ray.get(self.checkpoint_monitor.notify_started.remote())
             log_with_rank(
                 f"Dist checkpointing save completed for {dist_checkpoint_path}",
                 rank=self.rank,
```
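Taken together, the three insertions (the FSDP tensor save, the HF `save_pretrained` path, and this Megatron finalize callback) make every checkpoint writer block on `notify_started` before touching disk, so at most one write is in flight at a time; this appears to be the "serial save" referenced in the commit message.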

trinity/trainer/verl_trainer.py

Lines changed: 16 additions & 3 deletions
```diff
@@ -3,6 +3,7 @@
 
 Modified from verl/trainer/ppo/ray_trainer.py
 """
+import asyncio
 import os
 import sys
 from collections import defaultdict
@@ -57,6 +58,9 @@ def __init__(self, default_local_dir: str, default_hdfs_dir: str = None):
         self.latest_checkpoint_step = 0
         self.latest_state_dict_step = 0
 
+        self.condition = asyncio.Condition()
+        self.saving_count = 0
+
     def update_latest_checkpoint_step(self, step: int):
         assert step >= self.latest_checkpoint_step
         if step == self.latest_checkpoint_step:
@@ -87,7 +91,7 @@ def update_latest_state_dict_step(self, step: int):
         with open(self.local_latest_state_dict_iteration, "w") as f:
             f.write(str(step))
 
-    def register_thread_count(
+    async def register_thread_count(
         self,
         step: int,
         *,
@@ -99,7 +103,7 @@ def register_thread_count(
         if checkpoint_thread_count != 0:
             self.checkpoint_counter[step] += checkpoint_thread_count
 
-    def monitor_step(self, step: int, is_state_dict: bool = False):
+    async def monitor_step(self, step: int, is_state_dict: bool = False):
         if is_state_dict:
             self.state_dict_steps.add(step)
             if self.state_dict_counter[step] == 0:
@@ -109,7 +113,16 @@ def monitor_step(self, step: int, is_state_dict: bool = False):
         if self.checkpoint_counter[step] == 0 and self.state_dict_counter[step] == 0:
             self.update_latest_checkpoint_step(step)
 
-    def notify_finished(self, step: int, is_state_dict: bool = False):
+    async def notify_started(self):
+        async with self.condition:
+            while self.saving_count > 0:
+                await self.condition.wait_for(lambda: self.saving_count == 0)
+            self.saving_count += 1
+
+    async def notify_finished(self, step: int, is_state_dict: bool = False):
+        async with self.condition:
+            self.saving_count -= 1
+            self.condition.notify()
         if is_state_dict:
             self.state_dict_counter[step] -= 1
             if (
```
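The new `notify_started`/`notify_finished` pair amounts to a one-slot gate built on `asyncio.Condition`: `saving_count` admits a saver only when it is zero. A self-contained sketch of the same pattern (the `SaveGate` class and `demo` names are illustrative, not from the codebase):

```python
import asyncio


class SaveGate:
    """Sketch of the monitor's gate: an asyncio.Condition plus a counter
    that admits at most one saver at a time."""

    def __init__(self):
        self.condition = asyncio.Condition()
        self.saving_count = 0

    async def notify_started(self):
        async with self.condition:
            # Wait until no save is in flight, then claim the single slot.
            await self.condition.wait_for(lambda: self.saving_count == 0)
            self.saving_count += 1

    async def notify_finished(self):
        async with self.condition:
            self.saving_count -= 1
            self.condition.notify()  # wake one waiting saver


async def demo():
    gate = SaveGate()

    async def save(name: str):
        await gate.notify_started()
        print(name, "saving")
        await asyncio.sleep(0.1)  # stand-in for torch.save / save_pretrained
        await gate.notify_finished()
        print(name, "done")

    # The second save cannot start until the first one finishes.
    await asyncio.gather(save("fsdp"), save("hf_model"))


asyncio.run(demo())
```

Because Ray runs an async actor's methods as coroutines on a single event loop, concurrent `notify_started` calls queue up inside the actor rather than blocking it, and each checkpoint manager's `ray.get(...)` blocks only its own saver thread.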
