@@ -144,8 +144,15 @@ def _triton_cached_ssm(
     num_seq = num_prefill + num_decode
     num_total_tokens = num_prefill_tokens + num_decode

-    y_prefill = None
-    y_decode = None
+    # Preallocate output tensor to avoid memcpy cost for merging prefill
+    # and decode outputs
+    preallocated_ssm_out = torch.empty(
+        [bs, num_heads, head_dim],
+        dtype=hidden_states.dtype,
+        device=hidden_states.device,
+    )
+    preallocated_ssm_out_p = preallocated_ssm_out[:num_prefill_tokens]
+    preallocated_ssm_out_d = preallocated_ssm_out[num_prefill_tokens:num_total_tokens]

     # Prefill: concatenate tokens at the front and run combined scan
     if num_prefill > 0:
@@ -165,7 +172,7 @@ def _triton_cached_ssm(
         chunk_indices = None
         chunk_offsets = None

-        y_prefill, varlen_states = mamba_chunk_scan_combined(
+        varlen_states = mamba_chunk_scan_combined(
             hs_prefill,
             dt_prefill,
             A,
@@ -184,11 +191,12 @@ def _triton_cached_ssm(
             dt_limit=(time_step_limit[0], time_step_limit[1]),
             return_final_states=False,
             return_varlen_states=True,
-            mamba_ssm_cache_dtype=ssm_state_cache.dtype,
+            out=preallocated_ssm_out_p.unsqueeze(0),
+            state_dtype=ssm_state_cache.dtype,
         )

         ssm_state_cache.index_copy_(
-            0, slot_idx[:num_prefill], varlen_states.to(ssm_state_cache.dtype)
+            0, slot_idx[:num_prefill].long(), varlen_states.to(ssm_state_cache.dtype)
         )

     # Decode: batch single-token updates via selective_state_update
@@ -205,7 +213,7 @@ def _triton_cached_ssm(
         A_full = A[..., None, None].expand(num_heads, head_dim, ssm_state_size)
         D_full = D[..., None].expand(num_heads, head_dim)

-        y_decode = selective_state_update(
+        selective_state_update(
             ssm_state_cache,
             x_decode,
             dt_hp,
@@ -217,19 +225,16 @@ def _triton_cached_ssm(
             dt_bias=dt_bias_hp,
             dt_softplus=True,
             state_batch_indices=slot_idx_decode,
-        )  # [nd, H, D]
-
-    # Dispatch return logic
-    if num_prefill > 0 and num_decode > 0:
-        y = torch.empty_like(hidden_states, memory_format=torch.contiguous_format)
-        y_flat = y.view(bs, *y.shape[2:])
-        y_flat[:num_prefill_tokens].copy_(y_prefill[0])
-        y_flat[num_prefill_tokens:num_total_tokens].copy_(y_decode)
-        return y
-    elif num_prefill > 0:
-        return y_prefill[0].view(b, s, num_heads, head_dim).to(hidden_states.dtype)
-    elif num_decode > 0:
-        return y_decode.view(b, s, num_heads, head_dim).to(hidden_states.dtype)
+            out=preallocated_ssm_out_d,
+        )
+
+    # Return the preallocated output reshaped to original dimensions
+    if num_total_tokens > 0:
+        return (
+            preallocated_ssm_out[:num_total_tokens]
+            .view(b, s, num_heads, head_dim)
+            .to(hidden_states.dtype)
+        )
     else:
         return torch.empty_like(hidden_states)

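For context, a minimal, self-contained sketch of the pattern this change adopts: allocate one flat output buffer up front, hand the prefill and decode kernels non-overlapping views of it via their `out=` parameters, and reshape once at the end. The `run_prefill`/`run_decode` helpers below are hypothetical stand-ins for `mamba_chunk_scan_combined` and `selective_state_update`; the point is only that writes through the views land in adjacent storage, so the two `copy_` calls in the old dispatch logic become unnecessary.

```python
import torch

def run_prefill(x: torch.Tensor, out: torch.Tensor) -> None:
    # Hypothetical stand-in for mamba_chunk_scan_combined(..., out=...):
    # the real kernel writes its result directly into `out`.
    out.copy_(x * 2.0)

def run_decode(x: torch.Tensor, out: torch.Tensor) -> None:
    # Hypothetical stand-in for selective_state_update(..., out=...).
    out.copy_(x + 1.0)

num_prefill_tokens, num_decode, num_heads, head_dim = 5, 3, 2, 4
num_total_tokens = num_prefill_tokens + num_decode
hidden = torch.randn(num_total_tokens, num_heads, head_dim)

# One allocation; the two slices below are views into the same storage.
out = torch.empty(
    num_total_tokens, num_heads, head_dim,
    dtype=hidden.dtype, device=hidden.device,
)
out_p = out[:num_prefill_tokens]
out_d = out[num_prefill_tokens:num_total_tokens]

run_prefill(hidden[:num_prefill_tokens], out_p)
run_decode(hidden[num_prefill_tokens:], out_d)

# The merged result is just a reshape of the preallocated buffer; no
# per-branch copy_ into a fresh tensor, as the old dispatch logic did.
y = out.view(1, num_total_tokens, num_heads, head_dim)
assert y.data_ptr() == out.data_ptr()  # same storage, zero extra copies
```

The `.unsqueeze(0)` on the prefill slice in the diff mirrors the kernel's leading batch dimension of 1; since `unsqueeze` also returns a view, the write still targets the shared buffer.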
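Separately, the added `.long()` cast is worth a note: `Tensor.index_copy_` expects its index argument as an int64 (Long) tensor, so an int32 `slot_idx` needs the cast. A tiny illustration, assuming (as the diff suggests) that `slot_idx` arrives as int32:

```python
import torch

cache = torch.zeros(4, 2)
new_states = torch.ones(2, 2)
slot_idx = torch.tensor([1, 3], dtype=torch.int32)  # assumed int32, as in the diff

# index_copy_ requires a LongTensor index, so the int32 tensor must be
# cast first; hence slot_idx[:num_prefill].long() in the change above.
cache.index_copy_(0, slot_idx.long(), new_states)
```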