-# This file is adapted from sgl-project/sglang:
-# https://github.com/sgl-project/sglang/blob/main/sgl-kernel/python/sgl_kernel/flash_attn.py
-# The original code and this file are licensed under the Apache License, Version 2.0.
-#
-# Copyright (c) sgl-project and other contributors.
-# Modifications Copyright (c) LightLLM contributors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import torch
 from typing import List, Optional, Tuple, Union
 from lightllm.utils.log_utils import init_logger

 logger = init_logger(__name__)


-def maybe_contiguous(x):
+def get_contiguous(x):
     return x.contiguous() if x is not None and x.stride(-1) != 1 else x


@@ -34,152 +16,61 @@ def maybe_contiguous(x):

     def flash_attn_with_kvcache_mtp(
         q,
-        k_cache,
-        v_cache,
-        k=None,
-        v=None,
-        qv=None,
-        rotary_cos=None,
-        rotary_sin=None,
-        cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
-        cache_batch_idx: Optional[torch.Tensor] = None,
-        cache_leftpad: Optional[torch.Tensor] = None,
-        page_table: Optional[torch.Tensor] = None,
+        k,
+        v,
+        k_new: Optional[torch.Tensor] = None,
+        v_new: Optional[torch.Tensor] = None,
+        q_v: Optional[torch.Tensor] = None,
         cu_seqlens_q: Optional[torch.Tensor] = None,
+        cu_seqlens_k: Optional[torch.Tensor] = None,
         cu_seqlens_k_new: Optional[torch.Tensor] = None,
+        seqused_q: Optional[torch.Tensor] = None,
+        seqused_k: Optional[torch.Tensor] = None,
         max_seqlen_q: Optional[int] = None,
+        max_seqlen_k: Optional[int] = None,
+        page_table: Optional[torch.Tensor] = None,
+        cache_batch_idx: Optional[torch.Tensor] = None,
+        cache_leftpad: Optional[torch.Tensor] = None,
+        rotary_cos: Optional[torch.Tensor] = None,
+        rotary_sin: Optional[torch.Tensor] = None,
         rotary_seqlens: Optional[torch.Tensor] = None,
         q_descale: Optional[torch.Tensor] = None,
         k_descale: Optional[torch.Tensor] = None,
         v_descale: Optional[torch.Tensor] = None,
         softmax_scale=None,
-        causal=False,
-        window_size=(-1, -1),  # -1 means infinite context window
+        is_causal=False,
+        window_size=(-1, -1),
         softcap=0.0,  # 0.0 means deactivated
-        rotary_interleaved=True,
+        is_rotary_interleaved=True,
         scheduler_metadata=None,
-        num_splits=0,  # Can be tuned for speed
-        pack_gqa=None,  # Can be tuned for speed
-        sm_margin=0,  # Can be tuned if some SMs are used for communication
-        return_softmax_lse=False,
+        num_splits=0,
+        pack_gqa=None,
+        sm_margin=0,
         mtp_step=0,
     ):
67- """
68- If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
69- k and v. This is useful for incremental decoding: you can pass in the cached keys/values from
70- the previous step, and update them with the new keys/values from the current step, and do
71- attention with the updated cache, all in 1 kernel.
72-
73- If you pass in k / v, you must make sure that the cache is large enough to hold the new values.
74- For example, the KV cache could be pre-allocated with the max sequence length, and you can use
75- cache_seqlens to keep track of the current sequence lengths of each sequence in the batch.
76-
77- Also apply rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be
78- rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
79- If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos
80- and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
81- If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at
82- indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens).
83-
84- See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function.
85-
86- Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
87- than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
88- For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
89- 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.
90-
91- If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
92- For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
93- 1 1 1 1 0
94- 1 1 1 1 1
95- If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
96- 0 0
97- 0 0
98- 0 0
99- 1 0
100- 1 1
101- If the row of the mask is all zero, the output will be zero.
102-
103- If window_size != (-1, -1), implements sliding window local attention. Query at position i
104- will only attend to keys between
105- [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.
106-
107- Note: Does not support backward pass.
108-
109- Arguments:
110- q: (batch_size, seqlen, nheads, headdim)
111- k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no page_table,
112- or (num_blocks, page_block_size, nheads_k, headdim) if there's a page_table (i.e. paged KV cache)
113- page_block_size must be a multiple of 256.
114- v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim_v) if there's no page_table,
115- or (num_blocks, page_block_size, nheads_k, headdim_v) if there's a page_table (i.e. paged KV cache)
116- k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
117- k with k_cache, starting at the indices specified by cache_seqlens.
118- v [optional]: (batch_size, seqlen_new, nheads_k, headdim_v). Similar to k.
119- qv [optional]: (batch_size, seqlen, nheads, headdim_v)
120- rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding
121- to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16.
122- rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos.
123- cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the
124- KV cache.
125- cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache.
126- If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1].
127- If the indices are not distinct, and k and v are provided, the values updated in the cache
128- might come from any of the duplicate indices.
129- cache_leftpad: (batch_size,), dtype torch.int32. The index that the KV cache starts. If None, assume 0.
130- page_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32.
131- softmax_scale: float. The scaling of QK^T before applying softmax.
132- Default to 1 / sqrt(headdim).
133- causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
134- window_size: (left, right). If not (-1, -1), implements sliding window local attention.
135- softcap: float. Anything > 0 activates softcapping attention.
136- rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in.
137- If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False,
138- rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1
139- (i.e. GPT-NeoX style).
140- num_splits: int. If > 1, split the key/value into this many chunks along the sequence.
141- If num_splits == 1, we don't split the key/value. If num_splits == 0, we use a heuristic
142- to automatically determine the number of splits.
143- Don't change this unless you know what you are doing.
144- return_softmax_lse: bool. Whether to return the logsumexp of the attention scores.
145-
146- Return:
147- out: (batch_size, seqlen, nheads, headdim).
148- softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
149- logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
150- normalization factor).
151- """
-        assert k_cache.stride(-1) == 1, "k_cache must have contiguous last dimension"
-        assert v_cache.stride(-1) == 1, "v_cache must have contiguous last dimension"
+        assert k.stride(-1) == 1, "k must have contiguous last dimension"
+        assert v.stride(-1) == 1, "v must have contiguous last dimension"
         if softmax_scale is None:
-            softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
-        if cache_seqlens is not None and isinstance(cache_seqlens, int):
-            cache_seqlens = torch.full((k_cache.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device)
-        cache_seqlens = maybe_contiguous(cache_seqlens)
-
-        q, k_cache, k, v = [maybe_contiguous(x) for x in (q, k_cache, k, v)]
-        v_cache = v_cache.contiguous() if v_cache.stride(-1) != 1 and v_cache.stride(-3) != 1 else v_cache
-        cu_seqlens_q, cu_seqlens_k_new = [maybe_contiguous(x) for x in (cu_seqlens_q, cu_seqlens_k_new)]
-        page_table, cache_batch_idx, cache_leftpad = [
-            maybe_contiguous(x) for x in (page_table, cache_batch_idx, cache_leftpad)
-        ]
-        rotary_cos, rotary_sin = [maybe_contiguous(x) for x in (rotary_cos, rotary_sin)]
-        rotary_seqlens = maybe_contiguous(rotary_seqlens)
+            softmax_scale = (q.shape[-1] + (q_v.shape[-1] if q_v is not None else 0)) ** (-0.5)
+        seqused_k = get_contiguous(seqused_k)

-        # out, softmax_lse, *rest = torch.ops.sgl_kernel.fwd.default(
+        q, k, k_new, v_new = [get_contiguous(x) for x in (q, k, k_new, v_new)]
+        v = v.contiguous() if v.stride(-1) != 1 and v.stride(-3) != 1 else v
+        cu_seqlens_q, cu_seqlens_k_new = [get_contiguous(x) for x in (cu_seqlens_q, cu_seqlens_k_new)]
+        page_table = get_contiguous(page_table)
         out, softmax_lse, *rest = flash_attn_3_mtp.fwd(
             q,
-            k_cache,
-            v_cache,
             k,
             v,
-            qv,
+            k_new,
+            v_new,
+            q_v,
             None,  # out
             cu_seqlens_q,
             None,  # cu_seqlens_k
             cu_seqlens_k_new,
             None,  # seqused_q
-            cache_seqlens,
+            seqused_k,
             max_seqlen_q,
             None,  # max_seqlen_k
             page_table,
@@ -192,19 +83,19 @@ def flash_attn_with_kvcache_mtp(
             k_descale,
             v_descale,
             softmax_scale,
-            causal,
+            is_causal,
             window_size[0],
             window_size[1],
             0,
             softcap,
-            rotary_interleaved,
+            is_rotary_interleaved,
             scheduler_metadata,
             num_splits,
             pack_gqa,
             sm_margin,
             mtp_step,
         )
-        return (out, softmax_lse, *rest) if return_softmax_lse else out
+        return out

 except:
     flash_attn_3_mtp = None
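
A note on the causal masking described in the docstring this commit removes: with is_causal=True the mask is aligned to the bottom-right corner of the attention matrix, so the last query token always sees the full key sequence. The snippet below is a small illustration of that rule only (it is not part of the commit, and the helper name bottom_right_causal_mask is ours):

import torch

def bottom_right_causal_mask(seqlen_q: int, seqlen_k: int) -> torch.Tensor:
    # Query i may attend to key j iff j <= i + seqlen_k - seqlen_q,
    # i.e. the causal diagonal is anchored at the bottom-right corner.
    i = torch.arange(seqlen_q).unsqueeze(1)
    j = torch.arange(seqlen_k).unsqueeze(0)
    return (j <= i + seqlen_k - seqlen_q).int()

print(bottom_right_causal_mask(2, 5))
# tensor([[1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])
print(bottom_right_causal_mask(5, 2))
# tensor([[0, 0],
#         [0, 0],
#         [0, 0],
#         [1, 0],
#         [1, 1]])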
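For reference, here is a minimal usage sketch of the renamed entry point after this change. It assumes the flash_attn_3_mtp extension is importable (otherwise the module sets it to None) and a paged KV cache; the import path is left hypothetical, and the shapes, block counts, and sequence lengths are illustrative only, based on the docstring that this commit removes:

import torch
# from <lightllm module containing this file> import flash_attn_with_kvcache_mtp  # hypothetical import path

batch, seqlen_q, nheads, nheads_k, headdim = 2, 1, 32, 4, 128
num_blocks, page_block_size = 64, 256  # page_block_size must be a multiple of 256

q = torch.randn(batch, seqlen_q, nheads, headdim, device="cuda", dtype=torch.bfloat16)
k_cache = torch.randn(num_blocks, page_block_size, nheads_k, headdim, device="cuda", dtype=torch.bfloat16)
v_cache = torch.randn_like(k_cache)

# page_table maps each request to its cache blocks; seqused_k (the old cache_seqlens)
# holds the current KV length of each request.
page_table = torch.arange(batch * 2, dtype=torch.int32, device="cuda").reshape(batch, 2)
seqused_k = torch.tensor([300, 450], dtype=torch.int32, device="cuda")

out = flash_attn_with_kvcache_mtp(
    q,
    k_cache,  # the `k` argument after the rename
    v_cache,  # the `v` argument after the rename
    page_table=page_table,
    seqused_k=seqused_k,
    max_seqlen_q=seqlen_q,
    is_causal=True,
    mtp_step=0,  # multi-token-prediction step count; left at the default here
)
# out: (batch, seqlen_q, nheads, headdim). Only `out` is returned now that the
# return_softmax_lse path has been dropped.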