@@ -13,26 +13,20 @@ def _fwd_kernel(
     K,
     V,
     sm_scale,
-    seq_len,
     Out,
-    q_stride_b,
     q_stride_s,
     q_stride_h,
     q_stride_d,
-    k_stride_b,
     k_stride_s,
     k_stride_h,
     k_stride_d,
-    v_stride_b,
     v_stride_s,
     v_stride_h,
     v_stride_d,
-    o_stride_b,
     o_stride_s,
     o_stride_h,
     o_stride_d,
     head_dim_act,
-    is_varlen: tl.constexpr,
     cu_seqlens,
     BLOCK_M: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
@@ -42,29 +36,17 @@ def _fwd_kernel(
     cur_head = tl.program_id(1)
     start_m = tl.program_id(0)

-    if is_varlen == 1:
-        seq_start = tl.load(cu_seqlens + cur_batch).to(tl.int32)
-        seq_end = tl.load(cu_seqlens + cur_batch + 1).to(tl.int32)
-        seq_len = seq_end - seq_start
-        q_stride_b = 0
-        k_stride_b = 0
-        v_stride_b = 0
-        o_stride_b = 0
-    else:
-        seq_start = 0
+    seq_start = tl.load(cu_seqlens + cur_batch).to(tl.int32)
+    seq_end = tl.load(cu_seqlens + cur_batch + 1).to(tl.int32)
+    seq_len = seq_end - seq_start

     # initialize offsets
     offs_n = tl.arange(0, BLOCK_N)
     offs_d = tl.arange(0, BLOCK_DMODEL)
     offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)

     mask_d = offs_d < head_dim_act
-    off_q = (
-        cur_batch * q_stride_b
-        + cur_head * q_stride_h
-        + (seq_start + offs_m[:, None]) * q_stride_s
-        + offs_d[None, :] * q_stride_d
-    )
+    off_q = cur_head * q_stride_h + (seq_start + offs_m[:, None]) * q_stride_s + offs_d[None, :] * q_stride_d
     q = tl.load(Q + off_q, mask=(offs_m[:, None] < seq_len) & mask_d[None, :], other=0.0)
     # initialize pointer to m and l
     m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -75,15 +57,14 @@ def _fwd_kernel(
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
         off_k = (
-            cur_batch * k_stride_b
-            + (seq_start + start_n + offs_n[None, :]) * k_stride_s
+            (seq_start + start_n + offs_n[None, :]) * k_stride_s
             + cur_head * k_stride_h
             + offs_d[:, None] * k_stride_d
         )
         k = tl.load(K + off_k, mask=((start_n + offs_n[None, :]) < seq_len) & mask_d[:, None], other=0.0)

         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k, out_dtype=tl.float32, allow_tf32=False)
+        qk += tl.dot(q, k)
         qk *= sm_scale
         qk += tl.where((start_n + offs_n[None, :]) < seq_len, 0, float("-inf"))

@@ -97,8 +78,7 @@ def _fwd_kernel(

         # update acc
         off_v = (
-            cur_batch * v_stride_b
-            + (seq_start + start_n + offs_n[:, None]) * v_stride_s
+            (seq_start + start_n + offs_n[:, None]) * v_stride_s
             + cur_head * v_stride_h
             + offs_d[None, :] * v_stride_d
         )
@@ -115,12 +95,7 @@ def _fwd_kernel(
     o_scale = tl.exp(m_i - l_i)
     acc = acc * o_scale[:, None]
     # initialize pointers to output
-    off_o = (
-        cur_batch * o_stride_b
-        + (seq_start + offs_m[:, None]) * o_stride_s
-        + cur_head * o_stride_h
-        + offs_d[None, :] * o_stride_d
-    )
+    off_o = (seq_start + offs_m[:, None]) * o_stride_s + cur_head * o_stride_h + offs_d[None, :] * o_stride_d
     out_ptrs = Out + off_o
     tl.store(out_ptrs, acc, mask=(offs_m[:, None] < seq_len) & mask_d[None, :])
     return
@@ -132,49 +107,57 @@ def _flash_attention_triton_fwd(
     v,
     o,
     cu_seqlens=None,  # q k v cu_seqlens,
-    max_seqlens=None,
+    max_seqlen=None,
 ):
     BLOCK = 64
     # shape constraints
+    assert q.shape == k.shape == v.shape == o.shape, "q, k, v, o must have the same shape"

-    batch_size, seq_len, head_num, head_dim = q.shape
-    if cu_seqlens is not None and max_seqlens is not None:
-        assert q.shape[0] == 1
+    if q.ndim == 4:
+        bs, seq_len, head_num, head_dim = q.shape
+        total_len = bs * seq_len
+        reshape_fn = lambda t: t.view(total_len, head_num, head_dim)
+        q, k, v, o = [reshape_fn(x) for x in (q, k, v, o)]
+    elif q.ndim == 3:
+        total_len, head_num, head_dim = q.shape
+    else:
+        raise ValueError("q,k,v,o must be 3d or 4d")
+
+    if cu_seqlens is None:  # fixed-length input: build cumulative sequence lengths ourselves
+        cu_seqlens = torch.arange(bs + 1, dtype=torch.int32, device=q.device) * seq_len
+    else:
         cu_seqlens = cu_seqlens.to(q.device, torch.int32)
-        seq_len = max_seqlens
-        batch_size = cu_seqlens.numel() - 1
+
+    if max_seqlen is None:
+        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+
+    batch_size = cu_seqlens.numel() - 1

     d_pad = triton.next_power_of_2(head_dim)
     sm_scale = 1.0 / (head_dim**0.5)  # softmax scale factor
-    # grid = (batch_size, head_num, triton.cdiv(seq_len, BLOCK))  # batch, head,
-    grid = (triton.cdiv(seq_len, BLOCK), head_num, batch_size)  # batch, head,
+
+    grid = (triton.cdiv(max_seqlen, BLOCK), head_num, batch_size)  # (seq blocks, heads, batch)
     num_warps = 4
     _fwd_kernel[grid](
         q,
         k,
         v,
         sm_scale,
-        seq_len,
         o,
         q.stride(0),
         q.stride(1),
         q.stride(2),
-        q.stride(3),
         k.stride(0),
         k.stride(1),
         k.stride(2),
-        k.stride(3),
         v.stride(0),
         v.stride(1),
         v.stride(2),
-        v.stride(3),
         o.stride(0),
         o.stride(1),
         o.stride(2),
-        o.stride(3),
         head_dim,
-        is_varlen=1 if cu_seqlens is not None else 0,
-        cu_seqlens=0 if cu_seqlens is None else cu_seqlens,
+        cu_seqlens,
         BLOCK_M=BLOCK,
         BLOCK_DMODEL=d_pad,
         BLOCK_N=BLOCK,
@@ -198,10 +181,17 @@ def flash_attention_v3_fwd(
     v,
     o,
     cu_seqlens=None,
-    max_seqlens=None,
+    max_seqlen=None,
 ):
     head_dim = q.shape[-1]
     softmax_scale = head_dim**-0.5
+    if cu_seqlens is not None:
+        cu_seqlens = cu_seqlens.to(q.device, torch.int32)
+    if q.ndim == 4:
+        bs, seq_len, head_num, head_dim = q.shape
+        total_len = bs * seq_len
+        reshape_fn = lambda t: t.view(total_len, head_num, head_dim)
+        q, k, v, o = [reshape_fn(x) for x in (q, k, v, o)]
     _flash_attn_forward(
         q,
         k,
@@ -214,8 +204,8 @@ def flash_attention_v3_fwd(
         None,  # cu_seqlens_q/k/k_new
         None,
         None,  # seqused_q/k
-        max_seqlens,
-        max_seqlens,  # max_seqlen_q/k
+        max_seqlen,
+        max_seqlen,  # max_seqlen_q/k
         None,
         None,
         None,  # page_table, kv_batch_idx, leftpad_k,
@@ -239,15 +229,15 @@ def flash_attention_v3_fwd(
     _flash_attn_v3_available = False


-def flash_attention_fwd(q, k, v, o, cu_seqlens=None, max_seqlens=None):
+def flash_attention_fwd(q, k, v, o, cu_seqlens=None, max_seqlen=None):
     """
     Unified Flash Attention interface. If _flash_attn_forward is available,
     use flash_attention_v3_fwd; otherwise fall back to the Triton version.
     """
     if _flash_attn_v3_available and is_hopper():
-        flash_attention_v3_fwd(q, k, v, o, cu_seqlens, max_seqlens)
+        flash_attention_v3_fwd(q, k, v, o, cu_seqlens, max_seqlen)
     else:
-        _flash_attention_triton_fwd(q, k, v, o, cu_seqlens, max_seqlens)
+        _flash_attention_triton_fwd(q, k, v, o, cu_seqlens, max_seqlen)


 def torch_att(q, k, v):
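Usage sketch (not part of the patch): how the unified entry point can be driven after this change, either with a regular padded batch or with a packed varlen batch plus cu_seqlens. The shapes, dtypes, and device below are illustrative assumptions; it assumes flash_attention_fwd from the module above is in scope.

import torch

# Fixed-length path: 4D input, the Triton wrapper builds cu_seqlens internally.
bs, seq_len, head_num, head_dim = 2, 128, 8, 64
q = torch.randn(bs, seq_len, head_num, head_dim, device="cuda", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)
o = torch.empty_like(q)
flash_attention_fwd(q, k, v, o)

# Varlen path: two sequences of lengths 100 and 50 packed into one (total_len, H, D) tensor.
cu_seqlens = torch.tensor([0, 100, 150], dtype=torch.int32, device="cuda")
total_len = int(cu_seqlens[-1])
q_p = torch.randn(total_len, head_num, head_dim, device="cuda", dtype=torch.float16)
k_p, v_p = torch.randn_like(q_p), torch.randn_like(q_p)
o_p = torch.empty_like(q_p)
flash_attention_fwd(q_p, k_p, v_p, o_p, cu_seqlens=cu_seqlens, max_seqlen=100)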