Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions nemo/lightning/pytorch/strategies/megatron_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ class MegatronStrategy(DDPStrategy, io.IOMixin):
save_ckpt_format (str): Distributed checkpoint format to use for checkpoint saving. Should be one of
'torch_dist' or 'zarr'. Defaults to 'torch_dist'.
ckpt_async_save (bool): Whether to save checkpoints asynchronously to reduce checkpointing overhead.
Defaults to True.
Defaults to False.
ckpt_torch_dist_multiproc (int): Number of extra processes per rank used during ckpt save
with PyTorch distributed format. Defaults to None.
ckpt_assume_constant_structure (bool): Allows caching some computation across checkpoint saves.
Expand Down Expand Up @@ -266,7 +266,7 @@ def __init__(
use_te_rng_tracker: bool = False,
use_sharp: bool = False,
save_ckpt_format: str = "torch_dist",
ckpt_async_save: bool = True,
ckpt_async_save: bool = False,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why False by default?

Copy link
Copy Markdown
Collaborator Author

@ananthsub ananthsub Aug 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One concern now is that persistent workers consume more memory, risking OOMs if async save is the default. That being said, all the recipes explicitly set this to True, so until the memory regression is resolved, making sync save the default is the safer option.

ckpt_torch_dist_multiproc: int = None, ## TODO(ashors): put elsewhere?
ckpt_assume_constant_structure: bool = False,
ckpt_parallel_save: bool = True,
Expand Down
6 changes: 4 additions & 2 deletions nemo/lightning/pytorch/strategies/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,10 @@ def create_checkpoint_io(wrapping_ckpt_io=None, **kwargs):

if wrapping_ckpt_io:
checkpoint_io = wrapping_ckpt_io(checkpoint_io)
if kwargs.get("async_save", False):
checkpoint_io = AsyncFinalizableCheckpointIO(checkpoint_io)

async_save = kwargs.get("async_save", False)
if async_save:
checkpoint_io = AsyncFinalizableCheckpointIO(checkpoint_io, persistent_workers=True)

return checkpoint_io

Expand Down
5 changes: 3 additions & 2 deletions nemo/utils/callbacks/dist_ckpt_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,17 +98,18 @@ class AsyncFinalizableCheckpointIO(_WrappingCheckpointIO):
Args:
checkpoint_io (CheckpointIO): wrapped checkpoint_io object. Must be
of type AsyncCompatibleCheckpointIO.
persistent_workers (bool): whether to use persistent workers for checkpoint writing. Defaults to False.
Requires the underlying checkpoint_io.save_checkpoint to return save_fn, save_args, finalize_fn.
"""

def __init__(self, checkpoint_io: AsyncCompatibleCheckpointIO) -> None:
def __init__(self, checkpoint_io: AsyncCompatibleCheckpointIO, persistent_workers: bool = False) -> None:
Copy link
Copy Markdown
Contributor

@pramodk pramodk Oct 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to double check in a clean installation but I was trying to quickly apply this patch/changeset to my branch. When using persistent_workers=True with save_last=True in ModelCheckpoint, my test was stuck with the below stack trace:

Traceback (most recent call last):
  File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.12/contextlib.py", line 81, in inner
    return func(*args, **kwds)
           ^^^^^^^^^^^^^^^^^^^
  File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/async_utils.py", line 448, in async_loop
    item = queue.get()
           ^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/queues.py", line 103, in get
    res = self._recv_bytes()
          ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 430, in _recv_bytes
    buf = self._recv(4)
          ^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 395, in _recv
    chunk = read(handle, remaining)
            ^^^^^^^^^^^^^^^^^^^^^^^

It is possible that my environment has some issues, but I am mentioning it here so that you can double-check.

Copy link
Copy Markdown
Contributor

@pramodk pramodk Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to confirm here: I tested this branch/PR with the nemo 25.09 and it hangs at the end of training when using async checkpointing + persistent workers. cc: @maanug-nv

if not HAVE_MEGATRON_CORE:
raise ImportError(IMPORT_ERROR)
if not isinstance(checkpoint_io, AsyncCompatibleCheckpointIO):
raise ValueError(f'Incompatible wrapped checkpoint_io type: {type(checkpoint_io)}')

super().__init__(checkpoint_io)
self.async_calls_queue = AsyncCallsQueue()
self.async_calls_queue = AsyncCallsQueue(persistent=persistent_workers)

def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None:
"""Executes async request returned from the underlying checkpoint_io asynchronously.
Expand Down
Loading