Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
1efb851
Add --moe-use-device-initiated-grouped-gemm to allow token_per_expert…
QiZhangNV Nov 3, 2025
64041fc
Initial change for packed offloading
vasunvidia Nov 17, 2025
bbfcef2
Bug fix
Nov 17, 2025
169f9a5
Mem Opt
vasunvidia Nov 17, 2025
ac9dd93
Handle MXFP8Tensor offload
Nov 20, 2025
d6dbc99
Enable Packed offloading to CPU pinned memory with PACKED_OFFLOAD_CPU=1
Nov 20, 2025
35d3c06
Enable activation truncation for first step
Nov 21, 2025
8e5857c
Overflow check and assert
Nov 22, 2025
b2e77eb
Check in temporary solution for detecting overflow in receiving buffer
nanz-nv Nov 22, 2025
2947a02
Reconstruct the stash buffer into a 2D structure
nanz-nv Nov 23, 2025
a45b7fe
Refactor the code to check overflow in HybridEP receiving buffer
nanz-nv Nov 24, 2025
fe504bd
Use CPU offloading context manager as a WAR for now to WAR the proble…
nanz-nv Nov 24, 2025
93fb183
Add support for paged stashing
nanz-nv Nov 25, 2025
ed07de6
Add the feature of speculative CE stashing
nanz-nv Nov 26, 2025
7ab5f9b
Fix PP schedule
Nov 26, 2025
95bc80f
Use common buffer across VP for paged stashing
vasunvidia Nov 26, 2025
8a593b0
Disable Packed Offloading for validation
Nov 27, 2025
db2b1ac
Fix perf issue in packed stash/pop kernels
nanz-nv Nov 27, 2025
22beaa5
Minor fix for tensor allocation and padding requirement on budget
nanz-nv Dec 7, 2025
3fba433
Packed/paged offloading is current not stream-safe. Need to put stash…
nanz-nv Dec 7, 2025
63a7dca
add new hybrid ep
Autumn1998 Dec 9, 2025
f9f2c7b
Remove the overflow check in framework because it is now done by hybr…
nanz-nv Dec 10, 2025
34438d9
Fix one merge conflict
nanz-nv Dec 10, 2025
0d55288
Code cleanup
vasunvidia Dec 11, 2025
da20523
Add second autograd to avoid triple buffering
vasunvidia Dec 12, 2025
a219d7d
Avoid unnecessary wait_stream for reload in case of 1f1b
vasunvidia Dec 12, 2025
0bede5b
Check in dynamic-shape-aware SwiGLU triton kernel
nanz-nv Dec 18, 2025
837503d
Major cleanup and refactor
nanz-nv Dec 18, 2025
922689a
Check in paged_stash.py that was omitted in the previous commit
nanz-nv Dec 18, 2025
25e1f82
Remove d2d page feature for now
nanz-nv Dec 18, 2025
de34d7b
Update added arguments and add compatibility check
nanz-nv Dec 18, 2025
1bf3e43
refine overflow check
nanz-nv Dec 18, 2025
db0b5c9
Fixing lint issues
nanz-nv Dec 19, 2025
3186b20
Minor refactor
vasunvidia Jan 8, 2026
1ab150b
Add unit test for Paged Stashing
vasunvidia Jan 9, 2026
4ecacac
Initial check in of a) force load imbalance b) log overload factors
nanz-nv Jan 12, 2026
e650d60
make overload factor logging work for cuda graph
nanz-nv Jan 19, 2026
a792a43
1. allocate stashing buffer based on avg token count if STASH_BUFFER_…
nanz-nv Jan 22, 2026
57b9714
Reenable overlapping of stashing kernels
nanz-nv Jan 23, 2026
639509d
Remove a buggy/redundant reset
nanz-nv Feb 3, 2026
5a0267f
Cleanup moe-expert-rank-capacity-factor argument.
vasunvidia Feb 9, 2026
b8ee0e7
Update moe_use_device_initiated_grouped_gemm check for paged stashing…
vasunvidia Feb 21, 2026
2c143d4
support use-dynamic-comp-stream
Wohox Mar 12, 2026
0c2da52
Revert "remove encoder_and_decoder from enums (#3406)"
nanz-nv Mar 17, 2026
5e83aa0
Remove the WAR of running warmup on a side stream
nanz-nv Mar 17, 2026
d053991
Reapply "remove encoder_and_decoder from enums (#3406)"
nanz-nv Mar 17, 2026
c62a865
Fix for data_iterator type check in Paged Stashing fallback
vasunvidia Mar 18, 2026
5ee817c
Change to support eager-mode fallback for validation
vasunvidia Mar 18, 2026
485dd7e
Revert "Check in dynamic-shape-aware SwiGLU triton kernel"
nanz-nv Mar 18, 2026
4ed4853
Fixed some minor issues
nanz-nv Mar 18, 2026
a6875f1
Fix the unit test
nanz-nv Mar 18, 2026
acda6d1
Initial commit for spill to cpu feature
nanz-nv Mar 14, 2026
6ddc49b
Move paged stashing knobs from env vars to transformer_config knobs
nanz-nv Mar 18, 2026
7c7ab96
Refactor the knobs a bit so it is more intuitive
nanz-nv Mar 18, 2026
2dc0c53
Use get_attr_wrapped_model util to access moe and mtp layers
vasunvidia Mar 18, 2026
bfb9dd4
Refactor the unit test for paged stashing
nanz-nv Mar 20, 2026
cd89d4f
Clean up after rebase
nanz-nv Mar 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions megatron/core/full_cuda_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import logging

import gc
import torch

from megatron.core.tensor_parallel.random import get_all_rng_states
Expand Down Expand Up @@ -98,10 +99,18 @@ class FullCudaGraphWrapper:
cuda_graph = {'training': None, 'validation': None}
result = {'training': None, 'validation': None}

def __init__(self, forward_backward_func, cuda_graph_warmup_steps=1):
def __init__(
    self,
    forward_backward_func,
    cuda_graph_warmup_steps=1,
    moe_paged_stash=False,
    moe_expert_rank_capacity_factor=None,
):
    """Wrap a schedule function for full-iteration CUDA graph capture/replay.

    Args:
        forward_backward_func: schedule function whose execution is captured
            into a CUDA graph and subsequently replayed.
        cuda_graph_warmup_steps (int): number of eager iterations to run
            before attempting graph capture.
        moe_paged_stash (bool): enable MoE paged-stash support.
            # NOTE(review): only stored here; consumed elsewhere — confirm usage.
        moe_expert_rank_capacity_factor: when not None, enables the
            receiving-buffer overflow check in speculative_cuda_graph_check().
    """
    self.forward_backward_func = forward_backward_func
    # Static input buffers so replayed graphs see data at fixed addresses.
    self.static_loader = StaticBufferLoader()
    self.cuda_graph_warmup_steps = cuda_graph_warmup_steps
    self.moe_paged_stash = moe_paged_stash
    self.moe_expert_rank_capacity_factor = moe_expert_rank_capacity_factor

def data_read(self, data_iterator, model, training, num_microbatches):
"""Read all microbatch inputs from Dataloader and copy to static buffers."""
Expand Down Expand Up @@ -180,19 +189,48 @@ def __call__(self, *args, **kwargs):
torch.cuda.synchronize()
torch.distributed.barrier()
logger.info(f'CUDA graph capture done for {training_str}!!!')

if FullCudaGraphWrapper.cuda_graph[training_str] is None:
FullCudaGraphWrapper.result[training_str] = self.forward_backward_func(*args, **kwargs)
else:
FullCudaGraphWrapper.cuda_graph[training_str].replay()

self.next_iter(training_str)
return FullCudaGraphWrapper.result[training_str]

def speculative_cuda_graph_check(self, model):
    """Check speculatively-executed MoE modules for receiving-buffer overflow.

    Only active when a fixed expert-rank capacity factor is configured;
    raises if any token dispatcher reports that its receiving buffer went
    over budget on this rank.

    Args:
        model: iterable of model chunks (one per virtual pipeline stage).

    Raises:
        Exception: if any dispatcher's check_over_budget() reports overflow.
    """
    if self.moe_expert_rank_capacity_factor is not None:
        # Check if there is any overflow in the receiving buffer
        over_budget = torch.zeros(1, dtype=torch.bool, device='cuda')
        for model_chunk in model:
            # NOTE(review): assumes a fixed two-level wrapper (module.module)
            # and a GPT-style decoder.layers layout — confirm for other models.
            for layer in model_chunk.module.module.decoder.layers:
                mlp = layer.mlp
                # Not every layer is an MoE layer; only dispatchers exposing
                # check_over_budget() participate in the check.
                if hasattr(mlp, 'token_dispatcher') and hasattr(
                    mlp.token_dispatcher, 'check_over_budget'
                ):
                    over_budget |= mlp.token_dispatcher.check_over_budget()
        # .item() forces a device->host sync; this check runs outside capture.
        if over_budget.item():
            raise Exception(f"Rank {torch.distributed.get_rank()} overbudget")

def curr_iter(self, stage):
    """Return the current iteration counter for *stage* ('training'/'validation')."""
    counters = FullCudaGraphWrapper.curr_iteration
    return counters[stage]

def next_iter(self, stage):
    """Advance the iteration counter for *stage* ('training'/'validation') by one."""
    counters = FullCudaGraphWrapper.curr_iteration
    counters[stage] = counters[stage] + 1

def reset_cuda_graph(self, stage=None):
    """Reset captured CUDA graph(s) and per-stage bookkeeping.

    Drops the captured graph, cached result, and iteration counter for the
    selected stage(s), then forces a garbage-collection pass so the graph's
    memory pool can actually be released.

    Args:
        stage: 'training', 'validation', or None to reset both.
    """
    # The two stages previously had duplicated branch bodies; a single loop
    # keeps them in sync. Rebinding the dict entry to None is sufficient to
    # drop the last reference to the graph (an explicit `del` of the key
    # followed by reassignment is redundant).
    for key in ('training', 'validation'):
        if stage is None or stage == key:
            FullCudaGraphWrapper.cuda_graph[key] = None
            FullCudaGraphWrapper.result[key] = None
            FullCudaGraphWrapper.curr_iteration[key] = 0
    # Graph objects hold device memory until collected; reclaim it now.
    gc.collect()
1 change: 1 addition & 0 deletions megatron/core/fusions/fused_bias_swiglu.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,4 @@ def weighted_bias_swiglu_impl(input, bias, weights, fp8_input_store=False):

# bias_swiglu_impl = BiasSwiGLUFunction.apply
# swiglu_impl = SwiGLUFunction.apply

8 changes: 8 additions & 0 deletions megatron/core/model_parallel_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,14 @@ class ModelParallelConfig:
in 1f1b phase of pipelining or non-pipelining schedule.
"""

use_dynamic_comp_stream: bool = False
"""Use dynamic computation stream selection instead of binding to the default stream.
When enabled, get_comp_stream() returns torch.cuda.current_stream() at call time,
allowing CUDA graph capture and replay on non-default streams. This is required for
full-iteration CUDA graph with 1f1b EP overlap where the capture stream differs
from the default stream.
"""

delay_wgrad_compute: bool = False
"""Delay the weight gradient computation to improve batch-level communication overlapping"""

Expand Down
24 changes: 15 additions & 9 deletions megatron/core/models/common/model_chunk_schedule_plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
get_comp_stream,
)
from megatron.core.transformer.enums import CudaGraphScope
from megatron.core.transformer.moe.paged_stash import paged_stash_set_last_layer


class ModelChunkState:
Expand Down Expand Up @@ -63,8 +64,8 @@ def __init__(self, layer, event, chunk_state, comp_stream, comm_stream, extra_ar
event (torch.cuda.Event):
record CUDA event across multiple nodes on different streams for synchronization.
chunk_state (ModelChunkState): model state shared in the model chunk.
comp_stream (torch.cuda.Stream): CUDA stream for computation.
comm_stream (torch.cuda.Stream): CUDA stream for communication.
comp_stream (Callable): Func that returns CUDA stream for computation.
comm_stream (Callable): Func that returns CUDA stream for communication.
extra_args (dict): extra arguments for the layer.

The event and chunk_state are binded to the TransformerModelChunkSchedulePlan
Expand Down Expand Up @@ -317,9 +318,6 @@ def __init__(
self.post_process = None
self.vp_stage = model.vp_stage

comp_stream = get_comp_stream()
comm_stream = get_comm_stream()

# save the inputs of model.forward() to ModelChunkState
self._model_chunk_state.input_ids = input_ids
self._model_chunk_state.position_ids = position_ids
Expand All @@ -338,18 +336,22 @@ def __init__(
self._model_chunk_state.attention_bias = None

# build preprocess
self.pre_process = PreProcessNode(model, self._model_chunk_state, self._event, comp_stream)
self.pre_process = PreProcessNode(
model, self._model_chunk_state, self._event, get_comp_stream
)

# build layer schedule plan for each layer.
# The methods to obtain layers are different for MTP so we need the other build plan for
# MTP. Also, this can help annotate MTP layer so that it can know where MTP is.
self._build_layer_schedule_plan(model.decoder, comp_stream, comm_stream)
self._build_layer_schedule_plan(getattr(model, "mtp", None), comp_stream, comm_stream)
self._build_layer_schedule_plan(model.decoder, get_comp_stream, get_comm_stream)
self._build_layer_schedule_plan(
getattr(model, "mtp", None), get_comp_stream, get_comm_stream
)

# build post process
if model.post_process:
self.post_process = PostProcessNode(
model, self._model_chunk_state, self._event, comp_stream
model, self._model_chunk_state, self._event, get_comp_stream
)

def _build_layer_schedule_plan(self, module, comp_stream, comm_stream):
Expand Down Expand Up @@ -479,6 +481,8 @@ def run(
f_layer = f_schedule_plan.get_layer(i)
b_layer = b_schedule_plan.pop_layer()
torch.cuda.nvtx.range_push(f"layer_{i}f-layer_{b_schedule_plan.num_layers()}b")
if f_layer.layer.config.moe_paged_stash:
paged_stash_set_last_layer(i == f_num_layers - 1)
f_input, b_grad = TransformerLayerSchedulePlan.run(
f_layer,
b_layer,
Expand All @@ -505,6 +509,8 @@ def run(
for i in range(overlapped_layers, f_num_layers):
f_layer = f_schedule_plan.get_layer(i)
torch.cuda.nvtx.range_push(f"layer_{i}f")
if f_layer.layer.config.moe_paged_stash:
paged_stash_set_last_layer(i == f_num_layers - 1)
f_input, _ = TransformerLayerSchedulePlan.run(f_layer, None, f_input=f_input)
torch.cuda.nvtx.range_pop()

Expand Down
4 changes: 3 additions & 1 deletion megatron/core/models/gpt/fine_grained_callables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import weakref
from contextlib import nullcontext
from functools import partial
from typing import Optional
from typing import Callable, Optional

import torch
from torch import Tensor
Expand Down Expand Up @@ -330,6 +330,8 @@ def backward_dw(self):
"""Computes the weight gradients for the transformer layer node."""
if not self.delay_wgrad_compute:
return
if isinstance(self.stream, Callable):
self.stream = self.stream()
with torch.cuda.stream(self.stream):
torch.cuda.nvtx.range_push(f"{self.name} wgrad")
for module in self.bwd_dw_callables:
Expand Down
12 changes: 12 additions & 0 deletions megatron/core/models/gpt/gpt_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from megatron.core.tensor_parallel import gather_from_sequence_parallel_region
from megatron.core.transformer.enums import CudaGraphScope, ModelType
from megatron.core.transformer.linear_cross_entropy import LinearCrossEntropyModule
from megatron.core.transformer.moe.paged_stash import paged_stash_init_chunk_handler
from megatron.core.transformer.multi_token_prediction import (
MultiTokenPredictionBlock,
mtp_on_this_rank,
Expand Down Expand Up @@ -473,6 +474,12 @@ def preprocess_for_fine_grained_offloading(self):
off_interface.mark_not_offloadable(param)
self.disable_param_offloading = False

def preprocess_for_paged_stash(self):
    """Preprocess for paged stash."""
    cfg = self.config
    return paged_stash_init_chunk_handler(
        vp_size=cfg.virtual_pipeline_model_parallel_size, vp_stage=self.vp_stage
    )

def forward(
self,
input_ids: Tensor,
Expand Down Expand Up @@ -505,6 +512,9 @@ def forward(
if self.config.fine_grained_activation_offloading:
self.preprocess_for_fine_grained_offloading()

if self.config.moe_paged_stash:
self.preprocess_for_paged_stash()

inference_context = deprecate_inference_params(inference_context, inference_params)

preproc_output = self._preprocess(
Expand Down Expand Up @@ -745,6 +755,8 @@ def build_schedule_plan(

if self.config.fine_grained_activation_offloading:
self.preprocess_for_fine_grained_offloading()
if self.config.moe_paged_stash:
self.preprocess_for_paged_stash()

from ..common.model_chunk_schedule_plan import TransformerModelChunkSchedulePlan

Expand Down
13 changes: 9 additions & 4 deletions megatron/core/pipeline_parallel/combined_1f1b.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@

from megatron.core.enums import Fp8Recipe
from megatron.core.fp8_utils import get_fp8_context
from megatron.core.pipeline_parallel.utils import AbstractSchedulePlan, ScheduleNode, set_streams
from megatron.core.pipeline_parallel.utils import (
AbstractSchedulePlan,
ScheduleNode,
get_comp_stream,
set_streams,
)
from megatron.core.utils import get_attr_wrapped_model

# Types
Expand Down Expand Up @@ -47,7 +52,7 @@ def combined_1f1b_schedule_for_no_pipelining(
Phases 4: 4th microbatch backward
"""

set_streams()
set_streams(use_dynamic_comp_stream=config.use_dynamic_comp_stream)
# The forward step for the first microbatch is executed alone, no a2a overlapping
output_tensor, num_tokens, _ = combined_forward_backward_step(
forward_step_func,
Expand Down Expand Up @@ -173,7 +178,7 @@ def combined_1f1b_schedule_for_interleaved_pipelining():
# backward_step_helper_postprocess()
"""

set_streams()
set_streams(use_dynamic_comp_stream=config.use_dynamic_comp_stream)
# forward prepare
f_model_chunk_id = None
f_microbatch_id = None
Expand Down Expand Up @@ -405,7 +410,7 @@ def forward_backward_step():
from megatron.core.pipeline_parallel.schedules import forward_step_calc_loss

loss_node = ScheduleNode(
loss_func, torch.cuda.current_stream(), f_schedule_plan.event, name="loss_func"
loss_func, get_comp_stream, f_schedule_plan.event, name="loss_func"
)
loss_func = loss_node.forward
output_tensor, num_tokens = forward_step_calc_loss(
Expand Down
7 changes: 7 additions & 0 deletions megatron/core/pipeline_parallel/schedules.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from megatron.core.process_groups_config import ProcessGroupCollection
from megatron.core.transformer.cuda_graphs import create_cudagraphs
from megatron.core.transformer.enums import CudaGraphScope
from megatron.core.transformer.moe.paged_stash import paged_stash_reset
from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler
from megatron.core.utils import (
drain_embedding_wgrad_compute,
Expand Down Expand Up @@ -590,6 +591,8 @@ def forward_backward_no_pipelining(
if config.timers is not None:
config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time)

paged_stash_reset(enabled=config.moe_paged_stash and not forward_only, config=config)

no_sync_func = config.no_sync_func
if no_sync_func is None:
no_sync_func = contextlib.nullcontext
Expand Down Expand Up @@ -1049,6 +1052,8 @@ def forward_backward_pipelining_with_interleaving(
adjust_tensor_shapes_fn is None
), "adjust_tensor_shapes_fn is not supported for interleaved pipeline parallelism"

paged_stash_reset(enabled=config.moe_paged_stash and not forward_only, config=config)

if config.overlap_p2p_comm and config.batch_p2p_comm:
raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm")

Expand Down Expand Up @@ -2232,6 +2237,8 @@ def forward_backward_pipelining_without_interleaving(
if config.timers is not None:
config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time)

paged_stash_reset(enabled=config.moe_paged_stash and not forward_only, config=config)

# Disable async grad reductions
no_sync_func = config.no_sync_func
if no_sync_func is None:
Expand Down
Loading