NVIDIA
diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py
Lines changed: 1 addition & 1 deletion
diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py
Lines changed: 2 additions & 0 deletions
diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py
Lines changed: 14 additions & 7 deletions
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
Lines changed: 33 additions & 4 deletions
diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py
Lines changed: 2 additions & 0 deletions
diff --git a/megatron/core/ssm/mamba_block.py b/megatron/core/ssm/mamba_block.py
Lines changed: 2 additions & 0 deletions
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
Lines changed: 1 addition & 1 deletion
diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py
Lines changed: 14 additions & 7 deletions
@@ -2161,7 +2161,7 @@ def forward_post_hook(module, *_) -> None:
                     "TEFusedMLP module does not support submodules with post-backward hooks"
                 )
 
-        def forward(self, hidden_states: torch.Tensor) -> Tuple[Tensor, Optional[Tensor]]:
+        def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tensor, Optional[Tensor]]:
             """Forward."""
 
             # Construct fused impl if needed
 
@@ -281,6 +281,7 @@ def __init__(
         extra_block_kwargs=None,
         runtime_gather_output: Optional[bool] = None,
         loss_mask: Optional[Tensor] = None,
+        padding_mask=None,
     ):
         """Initialize the schedule plan of all Transformer layers' sub-modules.
 
@@ -323,6 +324,7 @@ def __init__(
         self._model_chunk_state.mtp_hidden_states = None
         self._model_chunk_state.loss_mask = loss_mask
         self._model_chunk_state.packed_seq_params = packed_seq_params
+        self._model_chunk_state.padding_mask = padding_mask
         self._model_chunk_state.extra_block_kwargs = extra_block_kwargs
         self._model_chunk_state.runtime_gather_output = runtime_gather_output
         self._model_chunk_state.model = model
 
@@ -131,13 +131,19 @@ def forward_impl(self):
         if not self.gpt_model.pre_process:
             self.chunk_state.decoder_input = self.gpt_model.decoder.input_tensor
         # Run GPTModel._preprocess
-        decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset = (
-            self.gpt_model._preprocess(
-                input_ids=self.chunk_state.input_ids,
-                position_ids=self.chunk_state.position_ids,
-                decoder_input=self.chunk_state.decoder_input,
-                packed_seq_params=self.chunk_state.packed_seq_params,
-            )
+        (
+            decoder_input,
+            rotary_pos_emb,
+            rotary_pos_cos,
+            rotary_pos_sin,
+            sequence_len_offset,
+            padding_mask,
+        ) = self.gpt_model._preprocess(
+            input_ids=self.chunk_state.input_ids,
+            position_ids=self.chunk_state.position_ids,
+            decoder_input=self.chunk_state.decoder_input,
+            packed_seq_params=self.chunk_state.packed_seq_params,
+            padding_mask=self.chunk_state.padding_mask,
         )
 
         # Saved for later use
@@ -146,6 +152,7 @@ def forward_impl(self):
         self.chunk_state.rotary_pos_cos = rotary_pos_cos
         self.chunk_state.rotary_pos_sin = rotary_pos_sin
         self.chunk_state.sequence_len_offset = sequence_len_offset
+        self.chunk_state.padding_mask = padding_mask
         return decoder_input
 
 
 
@@ -288,6 +288,7 @@ def _preprocess(
         decoder_input: Tensor = None,
         inference_context: BaseInferenceContext = None,
         packed_seq_params: PackedSeqParams = None,
+        padding_mask: Optional[Tensor] = None,
     ):
         """Preprocesses inputs for the transformer decoder.
 
@@ -304,7 +305,20 @@ def _preprocess(
         if decoder_input is not None:
             pass
         elif self.pre_process:
+            if padding_mask is not None:
+                assert padding_mask.shape == input_ids.shape, (
+                    f"padding_mask shape {padding_mask.shape} does not match "
+                    f"input_ids shape {input_ids.shape}"
+                )
             decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
+            if padding_mask is not None and self.config.sequence_parallel:
+                padding_mask = (
+                    tensor_parallel.scatter_to_sequence_parallel_region(
+                        padding_mask.transpose(0, 1).contiguous()
+                    )
+                    .transpose(0, 1)
+                    .contiguous()
+                )
         else:
             # intermediate stage of pipeline
             # decoder will get hidden_states from encoder.input_tensor
@@ -423,6 +437,7 @@ def _preprocess(
             rotary_pos_cos,
             rotary_pos_sin,
             sequence_len_offset,
+            padding_mask,
         )
         if rotary_pos_cos_sin is not None:
             # only in the case of flashinfer fused rope will we
@@ -466,6 +481,7 @@ def forward(
         *,
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
+        padding_mask: Optional[Tensor] = None,
     ) -> Tensor:
         """Forward function of the GPT Model This function passes the input tensors
         through the embedding layer, and then the decoder and finally into the post
@@ -476,6 +492,9 @@ def forward(
         Args:
             runtime_gather_output (bool): Gather output at runtime. Default None means
                 `parallel_output` arg in the constructor will be used.
+            padding_mask (Tensor, optional): Padding mask for MoE routing.
+                Shape [bsz, seq_length]. True = padding (exclude), False = valid (include).
+                Only used for MoE layers to exclude padding tokens from routing computations.
         """
         if self.config.fine_grained_activation_offloading:
             self.preprocess_for_fine_grained_offloading()
@@ -488,13 +507,19 @@ def forward(
             decoder_input=decoder_input,
             inference_context=inference_context,
             packed_seq_params=packed_seq_params,
+            padding_mask=padding_mask,
         )
 
-        (decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset) = (
-            preproc_output[:5]
-        )
+        (
+            decoder_input,
+            rotary_pos_emb,
+            rotary_pos_cos,
+            rotary_pos_sin,
+            sequence_len_offset,
+            padding_mask,
+        ) = preproc_output[:6]
 
-        rotary_pos_cos_sin = preproc_output[5] if len(preproc_output) == 6 else None
+        rotary_pos_cos_sin = preproc_output[6] if len(preproc_output) == 7 else None
 
         # Run decoder.
         hidden_states = self.decoder(
@@ -507,6 +532,7 @@ def forward(
             rotary_pos_cos_sin=rotary_pos_cos_sin,
             packed_seq_params=packed_seq_params,
             sequence_len_offset=sequence_len_offset,
+            padding_mask=padding_mask,
             **(extra_block_kwargs or {}),
         )
 
@@ -723,6 +749,7 @@ def build_schedule_plan(
         runtime_gather_output: Optional[bool] = None,
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
+        padding_mask: Optional[Tensor] = None,
     ):
         """Builds a computation schedule plan for the model.
 
@@ -748,6 +775,7 @@ def build_schedule_plan(
             inference_params (InferenceParams, optional):
                 Parameters for inference. Defaults to None.
             loss_mask (Optional[Tensor], optional): Loss mask. Defaults to None.
+            padding_mask (Optional[Tensor], optional): Padding mask. Defaults to None.
 
         Returns:
             TransformerModelChunkSchedulePlan: The model chunk schedule plan.
@@ -769,6 +797,7 @@ def build_schedule_plan(
             extra_block_kwargs,
             runtime_gather_output,
             loss_mask,
+            padding_mask,
         )
 
     def sharded_state_dict(
 
@@ -185,6 +185,7 @@ def forward(
         *,
         inference_params: Optional[BaseInferenceContext] = None,
         packed_seq_params: Optional[PackedSeqParams] = None,
+        padding_mask: Optional[Tensor] = None,
     ) -> Tensor:
         """Forward function of the Mamba model. This function passes the input tensors
         through the embedding layer, and then the decoder and finally into the post
@@ -254,6 +255,7 @@ def forward(
             inference_context=inference_context,
             rotary_pos_emb=rotary_pos_emb,
             packed_seq_params=packed_seq_params,
+            padding_mask=padding_mask,
         )
 
         if not self.post_process:
 
@@ -211,6 +211,7 @@ def forward(
         *,
         inference_params: Optional[BaseInferenceContext] = None,
         packed_seq_params: Optional[PackedSeqParams] = None,
+        padding_mask=None,
     ):
         """
         Forward function of the MambaStack class.
@@ -293,6 +294,7 @@ def forward(
                             rotary_pos_emb=rotary_pos_emb,
                             sequence_len_offset=sequence_len_offset,
                             packed_seq_params=packed_seq_params,
+                            padding_mask=padding_mask,
                         )
                     else:  # MambaLayer
                         hidden_states = layer(
 
@@ -148,7 +148,7 @@ def __init__(
             tp_group=tp_group,
         )
 
-    def forward(self, hidden_states, per_token_scale=None):
+    def forward(self, hidden_states, per_token_scale=None, **kwargs):
         """Perform the forward pass through the MLP block."""
         # [s, b, 4 * h/p]
         nvtx_range_push(suffix="linear_fc1")
 
@@ -239,13 +239,13 @@ def __init__(
         self.cudagraph_tensor_store = MoECudaGraphTensorStore()
 
     @maybe_skip_or_early_return_by_cudagraph("route")
-    def route(self, hidden_states: torch.Tensor):
+    def route(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
         """Compute token routing for preprocessing.
 
         This method uses the router to determine which experts to send each token to,
         producing routing probabilities and a mapping.
         """
-        probs, routing_map = apply_module(self.router)(hidden_states)
+        probs, routing_map = apply_module(self.router)(hidden_states, padding_mask)
         return probs, routing_map
 
     @maybe_skip_or_early_return_by_cudagraph("preprocess")
@@ -346,7 +346,7 @@ def router_and_preprocess(self, hidden_states: torch.Tensor):
         hidden_states, probs, residual = self.preprocess(hidden_states, probs, routing_map)
         return hidden_states, probs, residual
 
-    def forward(self, hidden_states: torch.Tensor):
+    def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
         """Forward pass for the MoE layer.
 
         The forward pass comprises four main steps:
@@ -356,8 +356,10 @@ def forward(self, hidden_states: torch.Tensor):
         4. Combine: The outputs from the experts are combined and returned.
 
         Args:
-            hidden_states (torch.Tensor): The input tensor to the MoE layer.
-
+            hidden_states (torch.Tensor): The input tensor shape [seq_length, bsz, hidden_size].
+            padding_mask (torch.Tensor, optional): Boolean mask marking padding tokens.
+                                                   Shape [bsz, seq_length]. True marks padding,
+                                                   False marks valid tokens. Defaults to None.
         Returns:
             A tuple containing the output tensor and the MLP bias, if any.
         """
@@ -366,12 +368,15 @@ def forward(self, hidden_states: torch.Tensor):
                 "During training, performance may degrade if MoE and tensor parallelism"
                 "are enabled without also enabling sequence parallelism."
             )
+        # Transpose from [bsz, seq_length] to [seq_length, bsz] to align with hidden_states
+        if padding_mask is not None:
+            padding_mask = padding_mask.transpose(0, 1).bool()
 
         # MoE forward: route -> dispatch -> compute -> combine
         def custom_forward(hidden_states):
             try:
                 shared_expert_output = self.shared_experts_compute(hidden_states)
-                probs, routing_map = self.route(hidden_states)
+                probs, routing_map = self.route(hidden_states, padding_mask)
                 hidden_states, probs = self.preprocess(hidden_states, probs, routing_map)
             except MoECudaGraphPartialCaptureSignal as e:
                 # This signal is raised from the maybe_skip_or_early_return_by_cudagraph decorator.
@@ -398,7 +403,9 @@ def custom_forward(hidden_states):
                     hidden_states,
                 )
             else:
-                outputs = tensor_parallel.checkpoint(custom_forward, False, hidden_states)
+                outputs = tensor_parallel.checkpoint(
+                    custom_forward, False, hidden_states, padding_mask
+                )
         else:
             outputs = custom_forward(hidden_states)
Original file line number	Diff line number	Diff line change
`@@ -2161,7 +2161,7 @@ def forward_post_hook(module, *_) -> None:`
`2161`	`2161`	`"TEFusedMLP module does not support submodules with post-backward hooks"`
`2162`	`2162`	`)`
`2163`	`2163`
`2164`		`- def forward(self, hidden_states: torch.Tensor) -> Tuple[Tensor, Optional[Tensor]]:`
	`2164`	`+ def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tensor, Optional[Tensor]]:`
`2165`	`2165`	`"""Forward."""`
`2166`	`2166`
`2167`	`2167`	`# Construct fused impl if needed`
Original file line number	Diff line number	Diff line change
`@@ -148,7 +148,7 @@ def __init__(`
`148`	`148`	`tp_group=tp_group,`
`149`	`149`	`)`
`150`	`150`
`151`		`- def forward(self, hidden_states, per_token_scale=None):`
	`151`	`+ def forward(self, hidden_states, per_token_scale=None, **kwargs):`
`152`	`152`	`"""Perform the forward pass through the MLP block."""`
`153`	`153`	`# [s, b, 4 * h/p]`
`154`	`154`	`nvtx_range_push(suffix="linear_fc1")`