Commit b09e96e

Fix ModelCheckpoint file_exists OOM in DDP (#21380)
* Fix ModelCheckpoint.file_exists OOM in DDP
* Document ModelCheckpoint.file_exists DDP memory fix
* Update src/lightning/pytorch/callbacks/model_checkpoint.py

---------

Co-authored-by: Justus Schock <[email protected]>
1 parent ef489f2 commit b09e96e

3 files changed (+33, -3 lines)


src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed FSDP mixed precision semantics and added user warning ([#21361](https://github.com/Lightning-AI/pytorch-lightning/pull/21361))
 
 
+- Fixed `ModelCheckpoint.file_exists` using broadcast in DDP, reducing memory usage when checking for existing checkpoints ([#19674](https://github.com/Lightning-AI/pytorch-lightning/issues/19674))
+
+
 ---
 
 ## [2.5.6] - 2025-11-05
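The entry above describes replacing a cross-rank broadcast of a Python object with a boolean reduction when checking whether a checkpoint file already exists. As a rough illustration of the cheaper pattern, the sketch below performs an "any"-style boolean reduction with raw torch.distributed. It is illustrative only, not Lightning's implementation, and it assumes a process group is already initialized with one CUDA device per rank.

    # Illustrative sketch (not Lightning's code): an "any"-style boolean reduction.
    import torch
    import torch.distributed as dist

    def file_exists_anywhere(local_exists: bool, device: torch.device) -> bool:
        # Encode the per-rank decision as a one-element tensor and sum it across
        # ranks; a positive total means at least one rank found the file.
        flag = torch.tensor(int(local_exists), device=device)
        dist.all_reduce(flag, op=dist.ReduceOp.SUM)
        return bool(flag.item() > 0)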

src/lightning/pytorch/callbacks/model_checkpoint.py

Lines changed: 5 additions & 3 deletions
@@ -997,10 +997,12 @@ def to_yaml(self, filepath: Optional[_PATH] = None) -> None:
             yaml.dump(best_k, fp)
 
     def file_exists(self, filepath: _PATH, trainer: "pl.Trainer") -> bool:
-        """Checks if a file exists on rank 0 and broadcasts the result to all other ranks, preventing the internal
+        """Checks if a file exists on rank 0 and synchronizes the result to all other ranks, preventing the internal
         state to diverge between ranks."""
-        exists = self._fs.exists(filepath)
-        return trainer.strategy.broadcast(exists)
+        # In distributed setups, only global rank 0 touches the filesystem
+        local_decision = self._fs.exists(filepath) if trainer.is_global_zero else False
+        # Reduce the decision across ranks using an "any"-style reduction to decide if the file exists anywhere
+        return trainer.strategy.reduce_boolean_decision(local_decision, all=False)
 
     def _should_remove_checkpoint(self, trainer: "pl.Trainer", previous: str, current: str) -> bool:
         """Checks if the previous checkpoint should be deleted.

tests/tests_pytorch/checkpointing/test_checkpoint_callback_frequency.py

Lines changed: 25 additions & 0 deletions
@@ -121,3 +121,28 @@ def on_train_epoch_end(self):
     trainer.fit(model)
     if os.getenv("LOCAL_RANK") == "0":
         assert save_mock.call_count == expected
+
+
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_model_checkpoint_ddp_monitor_none(tmp_path):
+    """Ensure that ModelCheckpoint with monitor=None works correctly under DDP and exercises the file_exists path."""
+
+    model = BoringModel()
+    checkpoint = callbacks.ModelCheckpoint(dirpath=tmp_path, monitor=None, save_top_k=1)
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        callbacks=[checkpoint],
+        enable_progress_bar=False,
+        enable_model_summary=False,
+        max_epochs=1,
+        strategy="ddp",
+        accelerator="gpu",
+        devices=2,
+        limit_train_batches=2,
+        limit_val_batches=0,
+    )
+
+    trainer.fit(model)
+    if os.getenv("LOCAL_RANK") == "0":
+        assert checkpoint.best_model_path
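The test runs ModelCheckpoint with monitor=None under a two-GPU DDP strategy so that the reworked file_exists path is exercised, as its docstring states. For readers less familiar with that configuration, here is a minimal single-process sketch (assumed defaults, not part of the test suite): with monitor=None the callback saves the most recent checkpoint rather than tracking a metric, and best_model_path points at that file.

    # Minimal single-process sketch (not part of the test suite).
    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import ModelCheckpoint
    from lightning.pytorch.demos.boring_classes import BoringModel

    ckpt = ModelCheckpoint(dirpath="checkpoints/", monitor=None, save_top_k=1)
    trainer = Trainer(max_epochs=1, callbacks=[ckpt], limit_train_batches=2, limit_val_batches=0)
    trainer.fit(BoringModel())
    print(ckpt.best_model_path)  # path of the checkpoint written at the end of training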
