
Commit f16b245

[ckpt] fix: prevent data loss when max_ckpt_to_keep=1 (#4873)
### What does this PR do?

Fixes a data loss bug when `max_ckpt_to_keep=1`: the old checkpoint was deleted **before** the new save completed. If the save fails (disk full, crash, etc.), all checkpoints are lost. The fix ensures the previous checkpoint is preserved until the new one is successfully saved. Also consolidates duplicated cleanup logic from the FSDP/Megatron managers into `BaseCheckpointManager`.

**Trade-off:** With `max_ckpt_to_keep=1`, there is now temporary additional storage overhead during saves: two checkpoints exist briefly until the old one is deleted after the new save completes. This is the expected behavior to guarantee data safety.

### Checklist Before Starting

- [x] Search for similar PRs. Paste at least one query link here: https://github.com/volcengine/verl/pulls?q=is%3Apr+max_ckpt_to_keep
- [x] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)

### Test

Added CPU unit tests in `tests/utils/ckpt/test_checkpoint_cleanup_on_cpu.py` covering:

- `max_ckpt_to_keep=1` preserves the checkpoint before save (regression test)
- `max_ckpt_to_keep=1` deletes the old checkpoint after a successful save
- `max_ckpt_to_keep=2` keeps a safety buffer
- `max_ckpt_to_keep=0` (unlimited) keeps all checkpoints
- Full save-cycle simulation

### API and Usage Example

No API changes. The existing `max_ckpt_to_keep` parameter now works correctly.

### Design & Code Changes

**New methods in `BaseCheckpointManager`:**

- `ensure_checkpoint_capacity(max_ckpt_to_keep)`: called before save; keeps `max - 1` checkpoints as a safety buffer (does nothing when `max=1`)
- `register_checkpoint(new_path, max_ckpt_to_keep)`: called after save; registers the path and enforces the retention limit

**Changes to subclasses:**

- `FSDPCheckpointManager`: replaced inline cleanup logic with calls to the base class methods
- `MegatronCheckpointManager`: same refactor for both the sync and async save paths

A minimal usage sketch of this call ordering follows the checklist below.

### Checklist Before Submitting

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs). *(N/A - no user-facing changes)*
- [x] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code.
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ).
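Below is a minimal sketch of the call ordering the two new methods are designed for. It is illustrative only: `save_to_disk` is a hypothetical stand-in for the actual FSDP/Megatron save logic, and rank checks and error handling are omitted.

```python
# Sketch of the safe save cycle (illustrative; `save_to_disk` is hypothetical).
def save_checkpoint_safely(manager, local_path: str, max_ckpt_to_keep: int):
    # Pre-save cleanup keeps (max_ckpt_to_keep - 1) checkpoints as a safety
    # buffer; with max_ckpt_to_keep=1 it is a no-op, so the previous
    # checkpoint survives if the save below fails.
    manager.ensure_checkpoint_capacity(max_ckpt_to_keep)

    save_to_disk(local_path)  # the potentially failing save

    # Only after a successful save is the new path registered and the
    # retention limit enforced, deleting the now-redundant old checkpoint.
    manager.register_checkpoint(local_path, max_ckpt_to_keep)
```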
1 parent bc73797 commit f16b245

File tree

4 files changed: +206 -64 lines changed

- tests/utils/ckpt/test_checkpoint_cleanup_on_cpu.py
- verl/utils/checkpoint/checkpoint_manager.py
- verl/utils/checkpoint/fsdp_checkpoint_manager.py
- verl/utils/checkpoint/megatron_checkpoint_manager.py
tests/utils/ckpt/test_checkpoint_cleanup_on_cpu.py

Lines changed: 139 additions & 0 deletions

@@ -0,0 +1,139 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import tempfile
+
+import pytest
+
+
+class TestCheckpointCleanupLogic:
+    """Tests for checkpoint cleanup methods in BaseCheckpointManager."""
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        """Set up test fixtures."""
+        self.test_dir = tempfile.mkdtemp()
+        yield
+        shutil.rmtree(self.test_dir, ignore_errors=True)
+
+    @pytest.fixture
+    def manager(self, monkeypatch):
+        """Create a minimal BaseCheckpointManager for testing."""
+        import torch.distributed
+
+        monkeypatch.setattr(torch.distributed, "get_rank", lambda: 0)
+        monkeypatch.setattr(torch.distributed, "get_world_size", lambda: 1)
+
+        from verl.utils.checkpoint.checkpoint_manager import BaseCheckpointManager
+
+        class MockModel:
+            pass
+
+        class MockOptimizer:
+            pass
+
+        return BaseCheckpointManager(
+            model=MockModel(),
+            optimizer=MockOptimizer(),
+            lr_scheduler=None,
+            processing_class=None,
+            checkpoint_config=None,
+        )
+
+    def _create_checkpoint_dir(self, step: int) -> str:
+        """Create a mock checkpoint directory."""
+        path = os.path.join(self.test_dir, f"global_step_{step}")
+        os.makedirs(path, exist_ok=True)
+        with open(os.path.join(path, "checkpoint.txt"), "w") as f:
+            f.write(f"step={step}")
+        return path
+
+    def test_max_ckpt_1_preserves_existing_before_save(self, manager):
+        """
+        Regression test: max_ckpt_to_keep=1 must NOT delete existing checkpoint before save.
+        """
+        ckpt_100 = self._create_checkpoint_dir(100)
+        manager.previous_saved_paths = [ckpt_100]
+
+        manager.ensure_checkpoint_capacity(max_ckpt_to_keep=1)
+
+        assert os.path.exists(ckpt_100), "Bug: checkpoint deleted before save!"
+        assert manager.previous_saved_paths == [ckpt_100]
+
+    def test_max_ckpt_1_deletes_old_after_save(self, manager):
+        """After save succeeds, old checkpoint should be deleted."""
+        ckpt_100 = self._create_checkpoint_dir(100)
+        manager.previous_saved_paths = [ckpt_100]
+
+        ckpt_200 = self._create_checkpoint_dir(200)
+        manager.register_checkpoint(ckpt_200, max_ckpt_to_keep=1)
+
+        assert not os.path.exists(ckpt_100)
+        assert os.path.exists(ckpt_200)
+        assert manager.previous_saved_paths == [ckpt_200]
+
+    def test_max_ckpt_2_keeps_one_before_save(self, manager):
+        """With max_ckpt_to_keep=2, pre-save cleanup keeps 1 checkpoint."""
+        ckpt_100 = self._create_checkpoint_dir(100)
+        ckpt_200 = self._create_checkpoint_dir(200)
+        manager.previous_saved_paths = [ckpt_100, ckpt_200]
+
+        manager.ensure_checkpoint_capacity(max_ckpt_to_keep=2)
+
+        assert not os.path.exists(ckpt_100)
+        assert os.path.exists(ckpt_200)
+        assert len(manager.previous_saved_paths) == 1
+
+    def test_max_ckpt_0_keeps_all(self, manager):
+        """max_ckpt_to_keep=0 means unlimited - no deletions."""
+        ckpt_100 = self._create_checkpoint_dir(100)
+        ckpt_200 = self._create_checkpoint_dir(200)
+        manager.previous_saved_paths = [ckpt_100, ckpt_200]

+        manager.ensure_checkpoint_capacity(max_ckpt_to_keep=0)
+        ckpt_300 = self._create_checkpoint_dir(300)
+        manager.register_checkpoint(ckpt_300, max_ckpt_to_keep=0)
+
+        assert os.path.exists(ckpt_100)
+        assert os.path.exists(ckpt_200)
+        assert os.path.exists(ckpt_300)
+        assert len(manager.previous_saved_paths) == 3
+
+    def test_full_save_cycle_max_ckpt_1(self, manager):
+        """Simulate multiple save cycles with max_ckpt_to_keep=1."""
+        # First save
+        manager.ensure_checkpoint_capacity(1)
+        ckpt_100 = self._create_checkpoint_dir(100)
+        manager.register_checkpoint(ckpt_100, 1)
+        assert manager.previous_saved_paths == [ckpt_100]
+
+        # Second save - existing checkpoint must survive pre-save
+        manager.ensure_checkpoint_capacity(1)
+        assert os.path.exists(ckpt_100), "Bug: checkpoint deleted before save!"
+
+        ckpt_200 = self._create_checkpoint_dir(200)
+        manager.register_checkpoint(ckpt_200, 1)
+        assert not os.path.exists(ckpt_100)
+        assert manager.previous_saved_paths == [ckpt_200]
+
+        # Third save
+        manager.ensure_checkpoint_capacity(1)
+        assert os.path.exists(ckpt_200), "Bug: checkpoint deleted before save!"
+
+        ckpt_300 = self._create_checkpoint_dir(300)
+        manager.register_checkpoint(ckpt_300, 1)
+        assert not os.path.exists(ckpt_200)
+        assert manager.previous_saved_paths == [ckpt_300]

verl/utils/checkpoint/checkpoint_manager.py

Lines changed: 30 additions & 0 deletions
@@ -141,6 +141,36 @@ def remove_previous_save_local_path(self, path):
                 continue
             shutil.rmtree(abs_path, ignore_errors=True)
 
+    def ensure_checkpoint_capacity(self, max_ckpt_to_keep: int):
+        """
+        Remove old checkpoints to make room for a new one, keeping a safety buffer.
+
+        With max_ckpt_to_keep=1, this does nothing - we keep the existing checkpoint
+        until the new save completes successfully (handled by register_checkpoint).
+        For max_ckpt_to_keep >= 2, we keep (max_ckpt_to_keep - 1) checkpoints before save.
+        """
+        if not (max_ckpt_to_keep and isinstance(max_ckpt_to_keep, int) and max_ckpt_to_keep > 1):
+            return
+        if len(self.previous_saved_paths) >= max_ckpt_to_keep:
+            keep_start = len(self.previous_saved_paths) - max_ckpt_to_keep + 1
+            self.remove_previous_save_local_path(self.previous_saved_paths[:keep_start])
+            self.previous_saved_paths = self.previous_saved_paths[keep_start:]
+
+    def register_checkpoint(self, new_path: str, max_ckpt_to_keep: int):
+        """
+        Register a successfully saved checkpoint and enforce retention limit.
+
+        Adds the new checkpoint path to tracking and removes excess old
+        checkpoints beyond max_ckpt_to_keep.
+        """
+        self.previous_saved_paths.append(new_path)
+        if not (max_ckpt_to_keep and isinstance(max_ckpt_to_keep, int) and max_ckpt_to_keep > 0):
+            return
+        if len(self.previous_saved_paths) > max_ckpt_to_keep:
+            keep_start = len(self.previous_saved_paths) - max_ckpt_to_keep
+            self.remove_previous_save_local_path(self.previous_saved_paths[:keep_start])
+            self.previous_saved_paths = self.previous_saved_paths[keep_start:]
+
     @staticmethod
     def get_rng_state():
         rng_state = {
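For reference, a small walk-through of the retention arithmetic these two methods use. The `global_step_*` paths are hypothetical and plain lists stand in for `previous_saved_paths`; nothing is deleted here, this only traces the slicing.

```python
# Illustrative arithmetic only (hypothetical paths, no files are touched).
previous_saved_paths = ["global_step_100", "global_step_200", "global_step_300"]
max_ckpt_to_keep = 3

# ensure_checkpoint_capacity: before the save, keep (max_ckpt_to_keep - 1) checkpoints.
keep_start = len(previous_saved_paths) - max_ckpt_to_keep + 1  # = 1
stale = previous_saved_paths[:keep_start]                      # ["global_step_100"] would be removed
previous_saved_paths = previous_saved_paths[keep_start:]       # two remain as the safety buffer

# register_checkpoint: after the save, append the new path and enforce the full limit.
previous_saved_paths.append("global_step_400")                 # three tracked, limit is three
assert len(previous_saved_paths) <= max_ckpt_to_keep           # so nothing further is deleted
```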

verl/utils/checkpoint/fsdp_checkpoint_manager.py

Lines changed: 4 additions & 12 deletions
@@ -201,17 +201,8 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i
         # record the previous global step
         self.previous_global_step = global_step
 
-        # remove previous local_path, only rank 0 should do this
-        if (
-            self.rank == 0
-            and max_ckpt_to_keep
-            and isinstance(max_ckpt_to_keep, int)
-            and max_ckpt_to_keep > 0
-            and len(self.previous_saved_paths) >= max_ckpt_to_keep
-        ):
-            keep_start = len(self.previous_saved_paths) - max_ckpt_to_keep + 1
-            self.remove_previous_save_local_path(self.previous_saved_paths[:keep_start])
-            self.previous_saved_paths = self.previous_saved_paths[keep_start:]
+        if self.rank == 0:
+            self.ensure_checkpoint_capacity(max_ckpt_to_keep)
 
         local_path = local_mkdir_safe(local_path)
         torch.distributed.barrier()
@@ -367,4 +358,5 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i
         # wait for rank0 to dump hf_model to local
         torch.distributed.barrier()
 
-        self.previous_saved_paths.append(local_path)
+        if self.rank == 0:
+            self.register_checkpoint(local_path, max_ckpt_to_keep)

verl/utils/checkpoint/megatron_checkpoint_manager.py

Lines changed: 33 additions & 52 deletions
@@ -414,17 +414,8 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i
         # record the previous global step
         self.previous_global_step = global_step
 
-        # remove previous local_path
-        if (
-            not self.checkpoint_config.async_save
-            and max_ckpt_to_keep
-            and isinstance(max_ckpt_to_keep, int)
-            and max_ckpt_to_keep > 0
-            and len(self.previous_saved_paths) >= max_ckpt_to_keep
-        ):
-            keep_start = len(self.previous_saved_paths) - max_ckpt_to_keep + 1
-            self.remove_previous_save_local_path(self.previous_saved_paths[:keep_start])
-            self.previous_saved_paths = self.previous_saved_paths[keep_start:]
+        if not self.checkpoint_config.async_save:
+            self.ensure_checkpoint_capacity(max_ckpt_to_keep)
 
         local_path = local_mkdir_safe(local_path)
         dist_checkpoint_path = get_dist_checkpoint_path(local_path)
@@ -646,46 +637,37 @@ def finalize_save_fn():
                 hdfs_io.copy(src=hf_config_tokenizer_path, dst=hdfs_path, dirs_exist_ok=True)
 
             # update latest_checkpointed_iteration.txt when async_save is True
-            if not self.checkpoint_config.async_save:
-                return
-
-            head_node = None
-            nodes = api.list_nodes()
-            for node in nodes:
-                if node.is_head_node:
-                    head_node = node
-                    break
-
-            current_node_id = ray.get_runtime_context().get_node_id()
-            ray_local_world_size = int(os.getenv("RAY_LOCAL_WORLD_SIZE", -1))
-            if ray_local_world_size == -1:
-                nnodes = int(os.getenv("NNODES", 1))
-                ray_local_world_size = torch.distributed.get_world_size() / nnodes
-
-            if head_node is not None and head_node.node_id == current_node_id and self.rank % ray_local_world_size == 0:
-                log_with_rank(
-                    f"Update latest_checkpointed_iteration.txt to step {global_step}",
-                    rank=self.rank,
-                    logger=logger,
-                )
-                local_latest_checkpointed_iteration = os.path.join(
-                    os.path.dirname(os.path.dirname(local_path)), "latest_checkpointed_iteration.txt"
-                )
-                with open(local_latest_checkpointed_iteration, "w") as f:
-                    f.write(str(global_step))
-
-            # remove previous local_path
-            self.previous_saved_paths.append(local_path)
-
-            if (
-                max_ckpt_to_keep
-                and isinstance(max_ckpt_to_keep, int)
-                and max_ckpt_to_keep > 0
-                and len(self.previous_saved_paths) > max_ckpt_to_keep
-            ):
-                keep_start = len(self.previous_saved_paths) - max_ckpt_to_keep
-                self.remove_previous_save_local_path(self.previous_saved_paths[:keep_start])
-                self.previous_saved_paths = self.previous_saved_paths[keep_start:]
+            if self.checkpoint_config.async_save:
+                head_node = None
+                nodes = api.list_nodes()
+                for node in nodes:
+                    if node.is_head_node:
+                        head_node = node
+                        break
+
+                current_node_id = ray.get_runtime_context().get_node_id()
+                ray_local_world_size = int(os.getenv("RAY_LOCAL_WORLD_SIZE", -1))
+                if ray_local_world_size == -1:
+                    nnodes = int(os.getenv("NNODES", 1))
+                    ray_local_world_size = torch.distributed.get_world_size() / nnodes
+
+                if (
+                    head_node is not None
+                    and head_node.node_id == current_node_id
+                    and self.rank % ray_local_world_size == 0
+                ):
+                    log_with_rank(
+                        f"Update latest_checkpointed_iteration.txt to step {global_step}",
+                        rank=self.rank,
+                        logger=logger,
+                    )
+                    local_latest_checkpointed_iteration = os.path.join(
+                        os.path.dirname(os.path.dirname(local_path)), "latest_checkpointed_iteration.txt"
+                    )
+                    with open(local_latest_checkpointed_iteration, "w") as f:
+                        f.write(str(global_step))
+
+            self.register_checkpoint(local_path, max_ckpt_to_keep)
 
         if self.checkpoint_config.async_save:
             assert async_save_request is not None, "Async save request should not be None when using async save."
@@ -695,4 +677,3 @@ def finalize_save_fn():
             async_calls.schedule_async_request(async_save_request)
         else:
             finalize_save_fn()
-            self.previous_saved_paths.append(local_path)
