
Commit cdfd70e

add CheckpointMonitor and refactor checkpoint manager
1 parent 3490c2c commit cdfd70e

6 files changed: +169 additions, -86 deletions

trinity/manager/synchronizer.py
Lines changed: 29 additions & 1 deletion

@@ -8,7 +8,7 @@
 import ray

 from trinity.common.config import Config
-from trinity.common.constants import RunningStatus
+from trinity.common.constants import RunningStatus, SyncMethod
 from trinity.common.models.utils import (
     get_checkpoint_dir_with_step_num,
     load_state_dict,
@@ -43,6 +43,8 @@ def __init__(self, config: Config, module_ref: ray.actor.ActorHandle):
         self._modules = {module_ref}
         self._modules_lock = asyncio.Lock()
         asyncio.create_task(self._check_modules())
+        if self.config.synchronizer.sync_method == SyncMethod.CHECKPOINT:
+            asyncio.create_task(self._find_latest_state_dict())

     async def add_module(self, module_ref: ray.actor.ActorHandle) -> None:
         """Adds a module to be tracked by the synchronizer.
@@ -72,6 +74,32 @@ async def _check_modules(self) -> None:
             except Exception:
                 pass

+    async def _find_latest_state_dict(self) -> None:
+        assert self.config.trainer.trainer_type == "verl"
+        default_local_dir = self.config.trainer.trainer_config.trainer.default_local_dir
+        local_latest_state_dict_iteration = os.path.join(
+            default_local_dir, "latest_state_dict_iteration.txt"
+        )
+        while True:
+            if os.path.exists(local_latest_state_dict_iteration):
+                with open(local_latest_state_dict_iteration, "r") as f:
+                    latest_model_version = int(f.read().strip())
+                if latest_model_version > self.model_version:
+                    self.logger.info(
+                        f"Synchronizer has found a new model state dict at step {latest_model_version}."
+                    )
+                    model_state_dict = load_state_dict(
+                        os.path.join(
+                            default_local_dir, f"global_step_{latest_model_version}", "actor"
+                        ),
+                        self.config.trainer,
+                    )
+                    self.logger.info(
+                        f"Synchronizer has loaded model state dict from checkpoint {self.model_version}."
+                    )
+                    await self.set_model_state_dict(model_state_dict, latest_model_version)
+            await asyncio.sleep(1)
+
     async def set_trainer_status(self, status: RunningStatus):
         """Update the status of the trainer."""
         async with self._ready_condition:
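The new _find_latest_state_dict loop polls latest_state_dict_iteration.txt once per second and parses it with int(f.read().strip()), so whichever component writes that file (the producer is not shown in this diff; presumably the trainer side or the new CheckpointMonitor) should write the step number atomically so the reader never sees a half-written value. A minimal sketch of such a writer follows; the helper name write_latest_state_dict_iteration and the temp-file-plus-rename strategy are assumptions for illustration, not part of this commit.

import os
import tempfile


def write_latest_state_dict_iteration(default_local_dir: str, global_step: int) -> None:
    """Hypothetical producer for latest_state_dict_iteration.txt (not part of this commit).

    Writes the step number to a temporary file first, then renames it into place so the
    Synchronizer's polling loop never reads a partially written value.
    """
    target = os.path.join(default_local_dir, "latest_state_dict_iteration.txt")
    fd, tmp_path = tempfile.mkstemp(dir=default_local_dir, suffix=".tmp")
    try:
        with os.fdopen(fd, "w") as f:
            f.write(str(global_step))
        # os.replace is atomic on POSIX: the reader sees either the old or the new content.
        os.replace(tmp_path, target)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

With an atomic rename, a torn read cannot happen and a stale read only delays the synchronization by one polling interval.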

trinity/trainer/trainer.py
Lines changed: 0 additions & 1 deletion

@@ -189,7 +189,6 @@ def save_checkpoint(self, block_until_saved: bool = False, save_as_hf: bool = Fa
             current_exp_index=self.engine.train_step_num * self.config.buffer.train_batch_size,
             current_step=self.train_step_num,
         )
-        self.logger.info(f"Checkpoint at step {self.train_step_num} saved.")
         return metrics

     async def shutdown(self) -> None:

trinity/trainer/verl/fsdp_checkpoint_manager.py
Lines changed: 35 additions & 40 deletions

@@ -46,8 +46,8 @@
 )
 from verl.utils.logger import log_with_rank

-from trinity.common.constants import SyncMethod
 from trinity.manager.synchronizer import Synchronizer
+from trinity.trainer.verl_trainer import CheckpointMonitor


 class FSDPCheckpointManager(OldFSDPCheckpointManager):
@@ -60,15 +60,12 @@ class FSDPCheckpointManager(OldFSDPCheckpointManager):
     This class is useful in distributed training scenarios where synchronization and non-blocking I/O are important.
     """

-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, ray_namespace: str = "", **kwargs):
         super().__init__(*args, **kwargs)
-        config = kwargs.pop("config", None)
-        self.synchronizer_config = config
-        if config is not None:
-            # Retrieve the remote Synchronizer actor using the provided namespace
-            self.synchronizer = Synchronizer.get_actor(namespace=config.ray_namespace)
-        else:
-            self.synchronizer = None
+        self.synchronizer = Synchronizer.get_actor(namespace=ray_namespace)
+        self.checkpoint_monitor = CheckpointMonitor.get_actor(
+            namespace=ray_namespace,
+        )

         # Threads for asynchronous saving of different components
         self._model_state_dict_thread = None
@@ -77,21 +74,6 @@ def __init__(self, *args, **kwargs):
         self._save_model_thread = None
         self.previous_state_dict_step = None

-    def _notify_synchronizer_with_step_num(self, global_step):
-        """
-        Notifies the Synchronizer actor about the current training step number,
-        used when SyncMethod is CHECKPOINT.
-
-        Args:
-            global_step (int): The current global training step.
-        """
-        if getattr(self.synchronizer_config, "sync_method", None) == SyncMethod.CHECKPOINT:
-            ray.get(
-                self.synchronizer.set_model_state_dict_with_step_num.remote(
-                    global_step, self.world_size
-                )
-            )
-
     def _upload_state_dict(self, state_dict: Union[dict, None], global_step: int):
         """
         Internal method to upload a state dict to the Synchronizer actor.
@@ -131,14 +113,16 @@ def _save_model_state_dict():
                 rank=self.rank,
                 logger=logger,
            )
-            self._notify_synchronizer_with_step_num(global_step)
+            ray.get(self.checkpoint_monitor.notify_finished.remote(global_step, True))

        self._model_state_dict_thread = threading.Thread(
            target=_save_model_state_dict,
        )
        self._model_state_dict_thread.start()

-    def _save_optimizer(self, local_path):
+        self.previous_state_dict_step = global_step
+
+    def _save_optimizer(self, local_path, global_step):
         optim_path = os.path.join(
             local_path, f"optim_world_size_{self.world_size}_rank_{self.rank}.pt"
         )
@@ -153,13 +137,14 @@ def _save_optimizer_state_dict():
                 rank=self.rank,
                 logger=logger,
             )
+            ray.get(self.checkpoint_monitor.notify_finished.remote(global_step))

        self._optimizer_state_dict_thread = threading.Thread(
            target=_save_optimizer_state_dict,
        )
        self._optimizer_state_dict_thread.start()

-    def _save_extra_state(self, local_path):
+    def _save_extra_state(self, local_path, global_step):
         extra_path = os.path.join(
             local_path, f"extra_state_world_size_{self.world_size}_rank_{self.rank}.pt"
         )
@@ -180,6 +165,7 @@ def _save_extra_state_dict():
                 rank=self.rank,
                 logger=logger,
             )
+            ray.get(self.checkpoint_monitor.notify_finished.remote(global_step))

        self._extra_state_dict_thread = threading.Thread(
            target=_save_extra_state_dict,
@@ -193,11 +179,12 @@ def save_state_dict( # noqa: C901
         global_step: int = 0,
     ):
         if self.previous_state_dict_step is None:
+            # First sync in trainer.prepare
             self.previous_state_dict_step = global_step
             self._upload_state_dict(None, global_step)
             return
         elif self.previous_state_dict_step == global_step:
-            self._notify_synchronizer_with_step_num(global_step)
+            # No need to save for sync again
             return
         if local_path is None:
             return
@@ -213,8 +200,7 @@ def save_state_dict( # noqa: C901
             self.model, StateDictType.SHARDED_STATE_DICT, state_dict_cfg, optim_cfg
         ):
             self._save_model(local_path, global_step)
-
-        self.previous_state_dict_step = global_step
+        ray.get(self.checkpoint_monitor.register_state_dict_save_count.remote(global_step, 1))

     def save_checkpoint( # noqa: C901
         self,
@@ -239,12 +225,14 @@ def save_checkpoint( # noqa: C901
             hdfs_path (str, optional): HDFS path for saving the checkpoint (not implemented here).
             global_step (int): Current training step.
             max_ckpt_to_keep (int, optional): Maximum number of checkpoints to keep locally.
-            model_state_dict_only (bool): Whether to only save the model state dict (no optimizer, etc.).
             save_as_hf (bool): Whether to force save the model in Hugging Face format.
         """
         if local_path is None:
             return

+        # record the previous global step
+        self.previous_global_step = global_step
+
         # remove previous local_path, only rank 0 should do this
         if (
             self.rank == 0
@@ -270,6 +258,9 @@ def save_checkpoint( # noqa: C901
                 self.optimizer is not None
             ), "optimizer must be provided when checkpoint_contents.save includes ['optimizer']"

+        state_dict_thread_count = 0
+        other_thread_count = 0
+
         # every rank will save its own model and optim shard
         state_dict_cfg = ShardedStateDictConfig(offload_to_cpu=True if is_cuda_available else False)
         optim_cfg = ShardedOptimStateDictConfig(offload_to_cpu=True if is_cuda_available else False)
@@ -279,16 +270,17 @@ def save_checkpoint( # noqa: C901
             self.model, StateDictType.SHARDED_STATE_DICT, state_dict_cfg, optim_cfg
         ):
             if self.should_save_model:
-                if self.previous_state_dict_step == global_step:
-                    self._notify_synchronizer_with_step_num(global_step)
-                else:
+                if self.previous_state_dict_step != global_step:
+                    state_dict_thread_count += 1
                     self._save_model(local_path, global_step)

             if self.should_save_optimizer:
-                self._save_optimizer(local_path)
+                other_thread_count += 1
+                self._save_optimizer(local_path, global_step)

             if self.should_save_extra:
-                self._save_extra_state(local_path)
+                other_thread_count += 1
+                self._save_extra_state(local_path, global_step)

         if self.rank == 0:
             # Save HF tokenizer/processor and model config on rank 0 to huggingface/ directory, no matter whether
@@ -341,6 +333,7 @@ def save_checkpoint( # noqa: C901
             state_dict = get_fsdp_full_state_dict(self.model, offload_to_cpu=True, rank0_only=True)

             if self.rank == 0:
+                other_thread_count += 1
                 hf_local_path = os.path.join(local_path, "huggingface")
                 os.makedirs(hf_local_path, exist_ok=True)

@@ -386,19 +379,21 @@ def _save_model():
                         logger=logger,
                         log_only_rank_0=True,
                     )
+                    ray.get(self.checkpoint_monitor.notify_finished.remote(global_step))

                 self._save_model_thread = threading.Thread(
                     target=_save_model,
                 )
                 self._save_model_thread.start()
-                self.processing_class.save_pretrained(hf_local_path)

         # wait for rank0 to dump hf_model to local
         torch.distributed.barrier()

-        # record the previous global step
-        self.previous_global_step = global_step
-        self.previous_state_dict_step = global_step
+        ray.get(
+            self.checkpoint_monitor.register_checkpoint_save_count.remote(
+                global_step, state_dict_thread_count, other_thread_count
+            )
+        )
         self.previous_saved_paths.append(local_path)

     def wait_on_save_thread(self) -> None:
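The CheckpointMonitor imported from trinity.trainer.verl_trainer is the sixth changed file and its implementation is not shown in this view. From the call sites above, the checkpoint managers announce how many save threads a step will spawn (register_state_dict_save_count / register_checkpoint_save_count) and each worker thread calls notify_finished when its shard is on disk, with a flag marking the model-state-dict thread. The following is a rough sketch of a Ray actor with that shape, reconstructed from these call sites alone; the method bodies, the get_actor helper, the actor name, and the completion handling are assumptions, not the actual implementation in verl_trainer.py.

from collections import defaultdict

import ray


class CheckpointMonitor:
    """Hypothetical reconstruction of the monitor interface, inferred from its call sites."""

    def __init__(self):
        # per-step counters: how many save threads were announced vs. have finished
        self.expected = defaultdict(int)
        self.finished = defaultdict(int)
        self.state_dict_ready_step = None

    @classmethod
    def get_actor(cls, namespace: str = ""):
        # assumed helper: fetch (or lazily create) a named, detached actor in the given namespace
        return (
            ray.remote(cls)
            .options(
                name="checkpoint_monitor",
                namespace=namespace,
                get_if_exists=True,
                lifetime="detached",
            )
            .remote()
        )

    def register_state_dict_save_count(self, global_step: int, count: int):
        # save_state_dict path: each rank announces `count` model-state-dict threads for this step
        self.expected[global_step] += count

    def register_checkpoint_save_count(
        self, global_step: int, state_dict_count: int, other_count: int
    ):
        # save_checkpoint path: model-state-dict threads plus optimizer/extra/HF threads
        self.expected[global_step] += state_dict_count + other_count

    def notify_finished(self, global_step: int, is_state_dict: bool = False):
        # called from each worker thread once its shard is written; threads may report
        # before registration lands, so only the final comparison is meaningful
        self.finished[global_step] += 1
        if is_state_dict:
            self.state_dict_ready_step = global_step
        return self.finished[global_step] >= self.expected[global_step]

The real actor presumably does more than return a boolean once the counts match (for example, marking the checkpoint complete for the Synchronizer's CHECKPOINT-based weight sync); that follow-up is intentionally left out of this sketch.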

trinity/trainer/verl/fsdp_workers.py
Lines changed: 3 additions & 3 deletions

@@ -569,6 +569,7 @@ def init_model(self):
             lr_scheduler=None,
             processing_class=self.processor if self.processor is not None else self.tokenizer,
             checkpoint_config=self.config.ref.checkpoint,
+            ray_namespace=self.config.synchronizer.ray_namespace,
         )

         if self._is_actor:
@@ -579,7 +580,7 @@ def init_model(self):
             lr_scheduler=self.actor_lr_scheduler,
             processing_class=self.processor if self.processor is not None else self.tokenizer,
             checkpoint_config=self.config.actor.checkpoint,
-            config=self.config.synchronizer,
+            ray_namespace=self.config.synchronizer.ray_namespace,
         )

     @register(dispatch_mode=Dispatch.ONE_TO_ALL)
@@ -870,7 +871,6 @@ def save_checkpoint(
         hdfs_path=None,
         global_step=0,
         max_ckpt_to_keep=None,
-        model_state_dict_only=False,
         save_as_hf: bool = False,
     ):
         # only support save and load ckpt for actor
@@ -882,7 +882,6 @@ def save_checkpoint(
             hdfs_path=hdfs_path,
             global_step=global_step,
             max_ckpt_to_keep=max_ckpt_to_keep,
-            model_state_dict_only=model_state_dict_only,
             save_as_hf=save_as_hf,
         )
         dist.barrier()
@@ -1233,6 +1232,7 @@ def init_model(self):
             lr_scheduler=self.critic_lr_scheduler,
             processing_class=self.processor if self.processor is not None else self.tokenizer,
             checkpoint_config=self.config.checkpoint,
+            ray_namespace=self.config.synchronizer.ray_namespace,
         )

     @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)

trinity/trainer/verl/megatron_checkpoint_manager.py
Lines changed: 9 additions & 25 deletions

@@ -38,9 +38,8 @@
     get_transformer_config_checkpoint_path,
 )

-from trinity.common.config import SynchronizerConfig
-from trinity.common.constants import SyncMethod
 from trinity.manager.synchronizer import Synchronizer
+from trinity.trainer.verl_trainer import CheckpointMonitor


 class MegatronCheckpointManager(OldMegatronCheckpointManager):
@@ -53,34 +52,17 @@ class MegatronCheckpointManager(OldMegatronCheckpointManager):
     def __init__(
         self,
         *args,
-        sync_config: SynchronizerConfig = None,
+        ray_namespace: str = "",
         **kwargs,
     ):
         super().__init__(
             *args,
             **kwargs,
         )
-        self.synchronizer_config = sync_config
-        if sync_config is not None:
-            # Retrieve the remote Synchronizer actor using the provided namespace
-            self.synchronizer = Synchronizer.get_actor(namespace=sync_config.ray_namespace)
-        else:
-            self.synchronizer = None
-
-    def _notify_synchronizer_with_step_num(self, global_step):
-        """
-        Notifies the Synchronizer actor about the current training step number,
-        used when SyncMethod is CHECKPOINT.
-
-        Args:
-            global_step (int): The current global training step.
-        """
-        if getattr(self.synchronizer_config, "sync_method", None) == SyncMethod.CHECKPOINT:
-            ray.get(
-                self.synchronizer.set_model_state_dict_with_step_num.remote(
-                    global_step, self.world_size
-                )
-            )
+        self.synchronizer = Synchronizer.get_actor(namespace=ray_namespace)
+        self.checkpoint_monitor = CheckpointMonitor.get_actor(
+            namespace=ray_namespace,
+        )

     def save_checkpoint( # noqa: C901
         self,
@@ -260,14 +242,16 @@ def save_checkpoint( # noqa: C901
             log_only_rank_0=True,
         )

+        ray.get(self.checkpoint_monitor.register_checkpoint_save_count.remote(global_step, 1, 0))
+
         def finalize_save_fn():
             # Rank 0 uploads checkpoint to HDFS if hdfs_path is provided
             log_with_rank(
                 f"Dist checkpointing save completed for {dist_checkpoint_path}",
                 rank=self.rank,
                 logger=logger,
             )
-            self._notify_synchronizer_with_step_num(global_step)
+            ray.get(self.checkpoint_monitor.notify_finished.remote(global_step, True))
             if self.rank == 0:
                 if hdfs_path is not None:
                     log_with_rank(
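Compared with the FSDP path, the Megatron manager's accounting is minimal: each rank announces exactly one state-dict save and no other threads, and finalize_save_fn reports that single completion. Assuming the hypothetical CheckpointMonitor sketch given after the FSDP manager diff above (and placeholder values for the namespace and step, which are not from this commit), the per-rank handshake for one save reduces to the following sketch.

import ray

# Placeholders for illustration only; not values from this commit.
namespace = "trinity"
step = 100

# Uses the hypothetical CheckpointMonitor interface sketched earlier.
monitor = CheckpointMonitor.get_actor(namespace=namespace)

# Announce the work up front: one state-dict save thread, zero optimizer/extra/HF threads.
ray.get(monitor.register_checkpoint_save_count.remote(step, 1, 0))

# ... the asynchronous dist-checkpointing save runs here ...

# Report completion from finalize_save_fn; True flags the model state dict so the
# checkpoint can be picked up for a CHECKPOINT-method weight sync.
ray.get(monitor.notify_finished.remote(step, True))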
