
Commit 5424923

fix tests
1 parent: 4edd89c

5 files changed, +37 -8 lines

tests/trainer/trainer_test.py

Lines changed: 23 additions & 7 deletions
@@ -749,7 +749,7 @@ def setUp(self):
         if multiprocessing.get_start_method(allow_none=True) != "spawn":
             multiprocessing.set_start_method("spawn", force=True)
         self.config = get_template_config()
-        self.config.buffer.total_epochs = 1
+        self.config.buffer.total_steps = 6
         self.config.buffer.batch_size = 4
         self.config.model.model_path = get_model_path()
         self.config.explorer.rollout_model.engine_type = "vllm_async"
@@ -762,9 +762,10 @@ def setUp(self):
         self.config.synchronizer.sync_method = SyncMethod.CHECKPOINT
         self.config.explorer.eval_interval = 4
         self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("countdown")
-        self.config.trainer.save_interval = 4
+        self.config.trainer.save_interval = 2
         self.config.trainer.save_hf_checkpoint = "last"
         self.config.trainer.trainer_strategy = self.strategy
+        self.config.trainer.max_checkpoints_to_keep = 2
         self.config.check_and_update()
         self.process_list = []

@@ -775,8 +776,6 @@ def test_trainer(self): # noqa: C901
         _trainer_config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size = 2
         _trainer_config.actor_rollout_ref.ref.megatron.tensor_model_parallel_size = 2
         _trainer_config.critic.megatron.tensor_model_parallel_size = 2
-        _trainer_config.trainer.max_actor_ckpt_to_keep = 2
-        _trainer_config.trainer.max_critic_ckpt_to_keep = 2

         stop_event = multiprocessing.Event()
         trainer_process = multiprocessing.Process(target=run_both, args=(self.config, stop_event))
@@ -887,10 +886,27 @@ def test_trainer(self): # noqa: C901
         if not stop_event.is_set():
             self.fail("Training process failed to stop.")
         # check only full checkpoint dirs are kept
-        for sync_step in [0, 1, 2, 3]:
+        for sync_step in [1, 3, 5]:
             state_dict_dir = os.path.join(default_local_dir, f"global_step_{sync_step}")
-            self.assertFalse(os.path.exists(state_dict_dir))
-        self.assertTrue(os.path.exists(os.path.join(default_local_dir, "global_step_4")))
+            self.assertFalse(
+                os.path.exists(state_dict_dir),
+                f"Found unexpected state dict dir at step {sync_step}",
+            )
+        for checkpoint_step in [4, 6]:
+            checkpoint_dir = os.path.join(default_local_dir, f"global_step_{checkpoint_step}")
+            self.assertTrue(
+                os.path.exists(checkpoint_dir),
+                f"Missing expected checkpoint dir at step {checkpoint_step}",
+            )
+            actor_checkpoint_dir = os.path.join(checkpoint_dir, "actor")
+            self.assertTrue(os.path.exists(actor_checkpoint_dir))
+        # check that step 2 keeps its directory but no actor/critic checkpoint
+        checkpoint_dir = os.path.join(default_local_dir, "global_step_2")
+        self.assertTrue(os.path.exists(checkpoint_dir))
+        actor_checkpoint_dir = os.path.join(checkpoint_dir, "actor")
+        self.assertFalse(os.path.exists(actor_checkpoint_dir))
+        critic_checkpoint_dir = os.path.join(checkpoint_dir, "critic")
+        self.assertFalse(os.path.exists(critic_checkpoint_dir))
         trainer_process.join(timeout=10)
         self.assertIn("model.safetensors", huggingface_dir_files)
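
The new assertions pin down the retention arithmetic implied by the updated config: with total_steps = 6 and save_interval = 2, full checkpoints are written at steps 2, 4, and 6, and max_checkpoints_to_keep = 2 prunes all but the last two, so step 2 keeps its directory but loses its actor/critic subdirectories. A minimal sketch of that rule, with an illustrative helper name that is not part of the repo:

    # Which global_step_N directories should still hold a full checkpoint?
    def expected_full_checkpoints(total_steps, save_interval, max_to_keep):
        saved = [s for s in range(1, total_steps + 1) if s % save_interval == 0]
        return saved[-max_to_keep:]  # only the newest max_to_keep survive pruning

    assert expected_full_checkpoints(6, 2, 2) == [4, 6]  # step 2 is pruned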

trinity/manager/synchronizer.py

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ async def _remove_previous_state_dict(self, previous_model_version: int) -> None
         self.logger.info(
             f"Removing previous checkpoint for sync at step {previous_model_version}."
         )
-        shutil.rmtree(previous_state_dict_dir)
+        shutil.rmtree(previous_state_dict_dir, ignore_errors=True)

     async def _find_tinker_latest_state_dict(self) -> None:
         default_local_dir = self.config.checkpoint_job_dir
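
Passing ignore_errors=True makes the removal idempotent: if the checkpoint manager (or an earlier sync) already deleted the directory, rmtree becomes a no-op instead of raising. A standalone illustration using a throwaway temp directory:

    import shutil
    import tempfile

    tmp = tempfile.mkdtemp()
    shutil.rmtree(tmp)                      # first removal succeeds
    shutil.rmtree(tmp, ignore_errors=True)  # repeated removal is silently skipped
    try:
        shutil.rmtree(tmp)                  # without the flag, a missing dir raises
    except FileNotFoundError:
        pass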

trinity/trainer/verl/fsdp_checkpoint_manager.py

Lines changed: 6 additions & 0 deletions
@@ -48,6 +48,7 @@
 
 from trinity.manager.synchronizer import Synchronizer
 from trinity.trainer.verl_trainer import CheckpointMonitor
+from trinity.utils.log import get_logger
 
 
 class FSDPCheckpointManager(OldFSDPCheckpointManager):
@@ -62,6 +63,7 @@ class FSDPCheckpointManager(OldFSDPCheckpointManager):
 
     def __init__(self, *args, ray_namespace: str = "", **kwargs):
         super().__init__(*args, **kwargs)
+        self.logger = get_logger()
         self.synchronizer = Synchronizer.get_actor(namespace=ray_namespace)
         self.checkpoint_monitor = CheckpointMonitor.get_actor(
             namespace=ray_namespace,
@@ -439,6 +441,10 @@ def save_checkpoint(
             and local_path != self.previous_saved_paths[-1]  # type: ignore
         ):  # last step may save twice
             keep_start = len(self.previous_saved_paths) - max_ckpt_to_keep + 1  # type: ignore
+            self.logger.info(
+                "Checkpoint manager is removing previous checkpoints at "
+                + str(self.previous_saved_paths[:keep_start])  # type: ignore
+            )
             self.remove_previous_save_local_path(self.previous_saved_paths[:keep_start])  # type: ignore
             self.previous_saved_paths = self.previous_saved_paths[keep_start:]  # type: ignore
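
The logged slice is exactly what remove_previous_save_local_path deletes. A worked example of the keep_start arithmetic, using values that mirror the test configuration rather than anything read from the repo (the same logic appears in the Megatron manager below):

    # While the step-6 checkpoint is being written, two paths were saved earlier.
    previous_saved_paths = ["global_step_2", "global_step_4"]
    max_ckpt_to_keep = 2
    keep_start = len(previous_saved_paths) - max_ckpt_to_keep + 1  # 2 - 2 + 1 = 1
    to_remove = previous_saved_paths[:keep_start]                  # ["global_step_2"]
    previous_saved_paths = previous_saved_paths[keep_start:]       # ["global_step_4"]
    # global_step_4 plus the step-6 checkpoint being saved = 2 checkpoints kept,
    # which is why the test expects global_step_2 to lose its actor/critic dirs.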

trinity/trainer/verl/megatron_checkpoint_manager.py

Lines changed: 6 additions & 0 deletions
@@ -40,6 +40,7 @@
 
 from trinity.manager.synchronizer import Synchronizer
 from trinity.trainer.verl_trainer import CheckpointMonitor
+from trinity.utils.log import get_logger
 
 
 class MegatronCheckpointManager(OldMegatronCheckpointManager):
@@ -59,6 +60,7 @@ def __init__(
             *args,
             **kwargs,
         )
+        self.logger = get_logger()
         self.synchronizer = Synchronizer.get_actor(namespace=ray_namespace)
         self.checkpoint_monitor = CheckpointMonitor.get_actor(
             namespace=ray_namespace,
@@ -340,6 +342,10 @@ def save_checkpoint(
             and local_path != self.previous_saved_paths[-1]  # type: ignore
         ):  # last step may save twice
             keep_start = len(self.previous_saved_paths) - max_ckpt_to_keep + 1  # type: ignore
+            self.logger.info(
+                "Checkpoint manager is removing previous checkpoints at "
+                + str(self.previous_saved_paths[:keep_start])  # type: ignore
+            )
             self.remove_previous_save_local_path(self.previous_saved_paths[:keep_start])  # type: ignore
             self.previous_saved_paths = self.previous_saved_paths[keep_start:]  # type: ignore

trinity/trainer/verl_trainer.py

Lines changed: 1 addition & 0 deletions
@@ -498,6 +498,7 @@ def _save_checkpoint(self, save_as_hf: bool = False):
         # make sure this flag is created before notifying the synchronizer
         # to avoid the synchronizer recognizing it as a state_dict-only checkpoint
         # TODO: use a better way to indicate full checkpoint
+        os.makedirs(local_global_step_folder, exist_ok=True)
         flag_path = os.path.join(local_global_step_folder, ".full_checkpoint")
         with open(flag_path, "w") as f:
             f.write("")
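
Creating the directory with exist_ok=True before writing the marker means the open() below cannot fail with FileNotFoundError, even if no shard has created the step folder yet. A self-contained sketch of the flag-file pattern (the folder path here is hypothetical):

    import os

    local_global_step_folder = "/tmp/checkpoints/global_step_4"  # assumed path
    os.makedirs(local_global_step_folder, exist_ok=True)  # no-op if it already exists
    flag_path = os.path.join(local_global_step_folder, ".full_checkpoint")
    with open(flag_path, "w") as f:  # empty marker file
        f.write("")
    # the synchronizer treats directories carrying this flag as full checkpoints
    assert os.path.exists(flag_path)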
