"""
Fused Triton kernel for Mamba state scatter operations.

This kernel replaces the expensive advanced indexing operations in
`update_mamba_state_after_mtp_verify` with a single fused gather-scatter kernel,
avoiding multiple `index_elementwise_kernel` launches.
"""

import torch
import triton
import triton.language as tl

@triton.jit
def _fused_mamba_state_scatter_with_mask_kernel(
    src_ptr,
    dst_ptr,
    # Raw index arrays (before index_select)
    dst_indices_raw_ptr,  # [total_requests] - state_indices_tensor
    step_indices_raw_ptr,  # [total_requests] - accepted_steps or mamba_steps_to_track
    # Total number of requests
    total_requests,
    elem_per_entry: tl.constexpr,
    src_layer_stride,
    src_req_stride,
    src_step_stride,
    dst_layer_stride,
    dst_req_stride,
    src_req_size,
    src_step_size,
    dst_req_size,
    BLOCK_SIZE: tl.constexpr,
):
    """
    Fused gather-scatter kernel with built-in masking.

    This kernel fuses the index_select operations by:
    1. Iterating over all requests (pid_req from 0 to total_requests-1)
    2. Checking if step_indices_raw[pid_req] >= 0 (valid mask)
    3. If valid, performing the scatter:
       dst[l, dst_indices_raw[pid_req], :] = src[l, pid_req, step_indices_raw[pid_req], :]

    Grid: (total_requests, num_layers, ceil(elem_per_entry / BLOCK_SIZE))
    """
    pid_req = tl.program_id(0)
    pid_layer = tl.program_id(1).to(tl.int64)
    pid_block = tl.program_id(2).to(tl.int64)

    # Load step index to check validity (step >= 0 means valid)
    step_idx = tl.load(step_indices_raw_ptr + pid_req).to(tl.int64)

    # Early exit if this request is not valid (step < 0)
    if step_idx < 0:
        return

    # Load destination index
    dst_idx = tl.load(dst_indices_raw_ptr + pid_req).to(tl.int64)

    # Source index is just the request index itself
    src_idx = pid_req

    # Bounds check to avoid illegal memory access
    if not (
        (dst_idx >= 0)
        & (dst_idx < dst_req_size)
        & (src_idx >= 0)
        & (src_idx < src_req_size)
        & (step_idx < src_step_size)
    ):
        return

    # Compute base offsets
    src_offset = (
        pid_layer * src_layer_stride
        + src_idx * src_req_stride
        + step_idx * src_step_stride
    )
    dst_offset = pid_layer * dst_layer_stride + dst_idx * dst_req_stride

    # Compute element range for this block
    start = pid_block * BLOCK_SIZE
    offsets = start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < elem_per_entry

    # Load from source and store to destination
    data = tl.load(src_ptr + src_offset + offsets, mask=mask)
    tl.store(dst_ptr + dst_offset + offsets, data, mask=mask)


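# ---------------------------------------------------------------------------
# Unfused reference (illustrative sketch, not part of the fused path).
# It mirrors the mask / index_select / scatter sequence described in the kernel
# docstring using plain PyTorch advanced indexing, and is mainly useful as a
# known-good baseline when testing the fused kernel. The helper name
# `_reference_mamba_state_scatter` is introduced here for illustration only.
# ---------------------------------------------------------------------------
def _reference_mamba_state_scatter(
    dst: torch.Tensor,  # [num_layers, cache_size, *state_shape]
    src: torch.Tensor,  # [num_layers, spec_size, draft_tokens, *state_shape]
    dst_indices_raw: torch.Tensor,  # [total_requests]
    step_indices_raw: torch.Tensor,  # [total_requests]
) -> None:
    """Plain-PyTorch equivalent of the fused gather-scatter, for verification."""
    valid_mask = step_indices_raw >= 0
    valid_indices = valid_mask.nonzero(as_tuple=True)[0]
    if valid_indices.numel() == 0:
        return
    dst_indices = dst_indices_raw[valid_indices].long()
    step_indices = step_indices_raw[valid_indices].long()
    # For each valid request i:
    #   dst[:, dst_indices[i]] = src[:, valid_indices[i], step_indices[i]]
    dst[:, dst_indices] = src[:, valid_indices, step_indices]

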
def fused_mamba_state_scatter_with_mask(
    dst: torch.Tensor,  # [num_layers, cache_size, *state_shape]
    src: torch.Tensor,  # [num_layers, spec_size, draft_tokens, *state_shape]
    dst_indices_raw: torch.Tensor,  # [total_requests] - raw indices (e.g., state_indices_tensor)
    step_indices_raw: torch.Tensor,  # [total_requests] - raw step indices (step >= 0 means valid)
):
    """
    Fully fused gather-scatter with built-in masking for Mamba state updates.

    This function fuses the following operations into a single kernel:
    1. valid_mask = step_indices_raw >= 0
    2. valid_indices = valid_mask.nonzero()
    3. dst_indices = dst_indices_raw[valid_indices]  (index_select)
    4. step_indices = step_indices_raw[valid_indices]  (index_select)
    5. for each valid i: dst[:, dst_indices[i], :] = src[:, valid_indices[i], step_indices[i], :]

    Args:
        dst: Destination tensor [num_layers, cache_size, *state_shape]
        src: Source tensor [num_layers, spec_size, draft_tokens, *state_shape]
        dst_indices_raw: Raw destination indices for all requests [total_requests]
        step_indices_raw: Raw step indices; entry >= 0 means valid [total_requests]
    """
    total_requests = step_indices_raw.shape[0]
    if total_requests == 0:
        return

    if dst.device != src.device:
        raise ValueError(
            f"dst and src must be on the same device. {dst.device=} {src.device=}"
        )
    if not dst.is_cuda or not src.is_cuda:
        raise ValueError(
            "fused_mamba_state_scatter_with_mask only supports CUDA tensors."
        )
    if dst.ndim < 2 or src.ndim < 3:
        raise ValueError(f"Unexpected tensor ranks: {dst.ndim=} {src.ndim=}")
    if dst.shape[0] != src.shape[0]:
        raise ValueError(
            f"Layer dimension mismatch: {dst.shape[0]=} vs {src.shape[0]=}"
        )
    if dst.shape[2:] != src.shape[3:]:
        raise ValueError(
            f"Trailing dims mismatch: {dst.shape[2:]=} vs {src.shape[3:]=}"
        )
    if dst_indices_raw.ndim != 1 or step_indices_raw.ndim != 1:
        raise ValueError(
            f"indices must be 1D: {dst_indices_raw.shape=} {step_indices_raw.shape=}"
        )
    if dst_indices_raw.shape[0] != step_indices_raw.shape[0]:
        raise ValueError(
            f"indices length mismatch: {dst_indices_raw.shape[0]=} vs {step_indices_raw.shape[0]=}"
        )

    num_layers = dst.shape[0]
    src_req_size = src.shape[1]
    src_step_size = src.shape[2]
    dst_req_size = dst.shape[1]

    # Flatten trailing dimensions: number of elements per (layer, cache_line) entry.
    elem_per_entry = dst.numel() // (dst.shape[0] * dst.shape[1])

    # Get strides (in elements, not bytes)
    src_layer_stride = src.stride(0)
    src_req_stride = src.stride(1)
    src_step_stride = src.stride(2)
    dst_layer_stride = dst.stride(0)
    dst_req_stride = dst.stride(1)

    # Ensure indices are int32 and contiguous
    dst_indices_raw = dst_indices_raw.to(torch.int32).contiguous()
    step_indices_raw = step_indices_raw.to(torch.int32).contiguous()

    # Ensure tensors are contiguous
    if not dst.is_contiguous():
        raise ValueError("dst tensor must be contiguous")
    if not src.is_contiguous():
        raise ValueError("src tensor must be contiguous")

    # Block size for copying elements
    BLOCK_SIZE = 1024

    # Grid over all requests - invalid ones will early-exit in the kernel
    grid = (total_requests, num_layers, triton.cdiv(elem_per_entry, BLOCK_SIZE))

    _fused_mamba_state_scatter_with_mask_kernel[grid](
        src,
        dst,
        dst_indices_raw,
        step_indices_raw,
        total_requests,
        elem_per_entry,
        src_layer_stride,
        src_req_stride,
        src_step_stride,
        dst_layer_stride,
        dst_req_stride,
        src_req_size,
        src_step_size,
        dst_req_size,
        BLOCK_SIZE=BLOCK_SIZE,
    )
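

# ---------------------------------------------------------------------------
# Minimal usage sketch. The shapes, sizes, and index values below are made-up
# examples rather than values from any real model config; the block only shows
# how the fused kernel is launched and checks it against the unfused reference
# above on a CUDA device.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    if torch.cuda.is_available():
        torch.manual_seed(0)
        num_layers, cache_size, spec_size, draft_tokens = 4, 64, 8, 3
        state_shape = (16, 32)  # hypothetical per-entry state shape

        dst = torch.randn(num_layers, cache_size, *state_shape, device="cuda")
        src = torch.randn(
            num_layers, spec_size, draft_tokens, *state_shape, device="cuda"
        )
        # One destination cache line per request; a negative step marks an
        # invalid request that the kernel must skip.
        dst_indices = torch.randperm(cache_size, device="cuda")[:spec_size].to(
            torch.int32
        )
        step_indices = torch.tensor(
            [0, 2, -1, 1, -1, 2, 0, 1], device="cuda", dtype=torch.int32
        )

        dst_ref = dst.clone()
        fused_mamba_state_scatter_with_mask(dst, src, dst_indices, step_indices)
        _reference_mamba_state_scatter(dst_ref, src, dst_indices, step_indices)
        torch.testing.assert_close(dst, dst_ref)
        print("fused kernel matches the unfused reference")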