
Commit b2183e6

Author: sangchengmeng (committed)
Commit message: [feat]fa support cu_seqlens
1 parent 5e62f98 commit b2183e6

File tree

5 files changed: +95 additions, -88 deletions


lightllm/models/qwen2_5_vl/qwen2_5_visual.py

Lines changed: 3 additions & 5 deletions

@@ -23,6 +23,7 @@
 from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.models.qwen2_vl.qwen2_visual import PatchEmbed, VisionRotaryEmbedding
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
+from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 
 # adapted from
 # https://github.com/huggingface/transformers/blob/
@@ -149,12 +150,9 @@ def forward(
         cos, sin = position_embeddings
         q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
 
-        q = q.unsqueeze(0)
-        k = k.unsqueeze(0)
-        v = v.unsqueeze(0)
-
+        cu_seqlens = cu_seqlens.to(q.device, torch.int32)
         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        attn_output = torch.empty_like(q)
+        attn_output = g_cache_manager.alloc_tensor(q.shape, q.dtype, device=q.device)
         flash_attention_fwd(q, k, v, attn_output, cu_seqlens, max_seqlen)
         attn_output = attn_output.reshape(seq_length, -1)
         attn_output = self.proj(attn_output)
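
The call pattern above packs all image tokens into one (total_len, num_heads, head_dim) tensor and marks the per-image boundaries with cu_seqlens. Below is a minimal sketch of driving flash_attention_fwd this way, assuming a CUDA device and invented sizes; the output buffer here is a plain torch.empty_like stand-in for g_cache_manager.alloc_tensor.

import torch
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd

# Hypothetical example: three images contributing 64, 100 and 36 vision tokens.
seq_lens = torch.tensor([64, 100, 36], dtype=torch.int32, device="cuda")
cu_seqlens = torch.zeros(seq_lens.numel() + 1, dtype=torch.int32, device="cuda")
cu_seqlens[1:] = torch.cumsum(seq_lens, dim=0)    # [0, 64, 164, 200]
max_seqlen = int(seq_lens.max().item())           # 100

num_heads, head_dim = 16, 80                      # illustrative sizes
total_len = int(cu_seqlens[-1].item())            # 200 packed tokens
q = torch.randn(total_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)
attn_output = torch.empty_like(q)                 # stand-in for g_cache_manager.alloc_tensor

flash_attention_fwd(q, k, v, attn_output, cu_seqlens, max_seqlen)
print(attn_output.shape)                          # torch.Size([200, 16, 80])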

lightllm/models/qwen2_vl/qwen2_visual.py

Lines changed: 6 additions & 2 deletions

@@ -44,6 +44,7 @@
 from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.models.qwen2_vl.vision_process import Qwen2VLImageProcessor
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
+from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 
 from transformers.utils import is_flash_attn_2_available
 
@@ -224,10 +225,13 @@ def forward(
         q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
         q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb)
         k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb)
-        v = v.unsqueeze(0)
+        q = q.squeeze(0)
+        k = k.squeeze(0)
 
+        cu_seqlens = cu_seqlens.to(q.device, torch.int32)
         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        attn_output = torch.empty_like(q, dtype=q.dtype, device=q.device)
+        attn_output = g_cache_manager.alloc_tensor(q.shape, q.dtype, device=q.device)
+
         flash_attention_fwd(q, k, v, attn_output, cu_seqlens, max_seqlen)
         attn_output = attn_output.reshape(seq_length, -1)
         attn_output = self.proj(attn_output)
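
The shape bookkeeping is the main change in this file: apply_rotary_pos_emb_vision still expects a leading batch dimension, so q and k take an unsqueeze(0)/squeeze(0) round trip, while v stays in the packed 3-D layout that flash_attention_fwd now consumes. A shape-only sketch with a stand-in rotary helper and invented sizes:

import torch

def rotary_stub(x, rotary_pos_emb):
    # Stand-in for apply_rotary_pos_emb_vision (illustration only): it consumes a
    # tensor with a leading batch dimension and returns one of the same shape.
    return x

seq_length, num_heads, head_dim = 200, 16, 80          # illustrative sizes
q = torch.randn(seq_length, num_heads, head_dim)
k = torch.randn(seq_length, num_heads, head_dim)
v = torch.randn(seq_length, num_heads, head_dim)

q = rotary_stub(q.unsqueeze(0), None).squeeze(0)       # (1, seq, heads, dim) -> back to packed 3-D
k = rotary_stub(k.unsqueeze(0), None).squeeze(0)
# v is no longer unsqueezed: flash_attention_fwd consumes packed (seq, heads, dim) directly.
assert q.shape == k.shape == v.shape == (seq_length, num_heads, head_dim)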

lightllm/models/vit/layer_infer/transformer_layer_infer.py

Lines changed: 7 additions & 3 deletions

@@ -103,9 +103,13 @@ def _get_qkv(self, input, layer_weight: ViTTransformerLayerWeight) -> torch.Tensor:
 
     def _context_attention_kernel(self, q, k, v) -> torch.Tensor:
         out = g_cache_manager.alloc_tensor(q.shape, q.dtype, device=q.device)
-        batch_size = q.shape[0]
-        seq_len = q.shape[1]
-        flash_attention_fwd(q, k, v, out)
+        batch_size, seq_len, head_num, head_dim = q.shape
+        total_len = batch_size * seq_len
+        reshape = lambda t: t.view(total_len, head_num, head_dim)
+        q, k, v, out = map(reshape, (q, k, v, out))
+        cu_seqlens = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device) * seq_len
+        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+        flash_attention_fwd(q, k, v, out, cu_seqlens, max_seqlen)
         return out.reshape(batch_size, seq_len, -1)
 
     def _get_o(self, input, layer_weight: ViTTransformerLayerWeight) -> torch.Tensor:
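
The ViT path is fixed-length, so the prefix sums reduce to multiples of seq_len and the 4-D activations can be viewed as packed 3-D tensors without copying. A small sketch with invented sizes showing the cu_seqlens this code builds:

import torch

# Hypothetical equal-length ViT batch: every image contributes seq_len tokens.
batch_size, seq_len, head_num, head_dim = 3, 4, 2, 8

cu_seqlens = torch.arange(batch_size + 1, dtype=torch.int32) * seq_len
print(cu_seqlens.tolist())   # [0, 4, 8, 12]: sequence i occupies packed rows [cu[i], cu[i+1])
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()   # 4

# Flattening (batch, seq, head, dim) to the packed (total, head, dim) layout is a pure view:
q = torch.randn(batch_size, seq_len, head_num, head_dim)
q_packed = q.view(batch_size * seq_len, head_num, head_dim)
s, e = cu_seqlens[1].item(), cu_seqlens[2].item()
assert torch.equal(q_packed[s:e], q[1])   # rows 4..8 are exactly image 1's tokens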

lightllm/models/vit/triton_kernel/flashattention_nopad.py

Lines changed: 11 additions & 78 deletions

@@ -2,7 +2,9 @@
 import triton
 import triton.language as tl
 import math
+import time
 import torch.nn.functional as F
+from typing import Optional, Tuple
 from lightllm.utils.device_utils import is_hopper
 
 if triton.__version__ >= "2.1.0":
@@ -82,9 +84,7 @@ def _fwd_kernel(
             + cur_head * v_stride_h
             + offs_d[None, :] * v_stride_d
         )
-        v = tl.load(V + off_v, mask=((start_n + offs_n[:, None]) < seq_len) & mask_d[None, :], other=0.0).to(
-            tl.float32
-        )
+        v = tl.load(V + off_v, mask=((start_n + offs_n[:, None]) < seq_len) & mask_d[None, :], other=0.0)
         p = p.to(v.dtype)
         acc += tl.dot(p, v)
         # update m_i and l_i
@@ -106,36 +106,17 @@ def _flash_attention_triton_fwd(
     k,
     v,
     o,
-    cu_seqlens=None,  # q k v cu_seqlens,
-    max_seqlen=None,
+    cu_seqlens,  # q k v cu_seqlens,
+    max_seqlen,
 ):
     BLOCK = 64
     # shape constraints
-    assert q.shape == k.shape == v.shape == o.shape, "q, k, v, o must have the same shape"
-
-    if q.ndim == 4:
-        bs, seq_len, head_num, head_dim = q.shape
-        total_len = bs * seq_len
-        reshape_fn = lambda t: t.view(total_len, head_num, head_dim)
-        q, k, v, o = [reshape_fn(x) for x in (q, k, v, o)]
-    elif q.ndim == 3:
-        total_len, head_num, head_dim = q.shape
-    else:
-        raise ValueError("q,k,v,o must be 3d or 4d")
-
-    if cu_seqlens is None:  # fixed-length case
-        cu_seqlens = torch.arange(bs + 1, dtype=torch.int32, device=q.device) * seq_len
-    else:
-        cu_seqlens = cu_seqlens.to(q.device, torch.int32)
-
-    if max_seqlen is None:
-        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-
+    assert q.ndim == k.ndim == v.ndim == o.ndim == 3, "q, k, v, o must be 3D tensors"
+    _, head_num, head_dim = q.shape
     batch_size = cu_seqlens.numel() - 1
 
-    d_pad = triton.next_power_of_2(head_dim)
     sm_scale = 1.0 / (head_dim ** 0.5)  # compute the scale factor
-
+    d_pad = triton.next_power_of_2(head_dim)
     grid = (triton.cdiv(max_seqlen, BLOCK), head_num, batch_size)  # batch, head,
     num_warps = 4
     _fwd_kernel[grid](
@@ -180,18 +161,11 @@ def flash_attention_v3_fwd(
     k,
     v,
     o,
-    cu_seqlens=None,
-    max_seqlen=None,
+    cu_seqlens,
+    max_seqlen,
 ):
     head_dim = q.shape[-1]
     softmax_scale = head_dim ** -0.5
-    if cu_seqlens is not None:
-        cu_seqlens = cu_seqlens.to(q.device, torch.int32)
-    if q.ndim == 4:
-        bs, seq_len, head_num, head_dim = q.shape
-        total_len = bs * seq_len
-        reshape_fn = lambda t: t.view(total_len, head_num, head_dim)
-        q, k, v, o = [reshape_fn(x) for x in (q, k, v, o)]
     _flash_attn_forward(
         q,
         k,
@@ -229,7 +203,7 @@ def flash_attention_v3_fwd(
     _flash_attn_v3_available = False
 
 
-def flash_attention_fwd(q, k, v, o, cu_seqlens=None, max_seqlen=None):
+def flash_attention_fwd(q, k, v, o, cu_seqlens, max_seqlen):
     """
     Unified Flash Attention interface: if _flash_attn_forward is available,
     use flash_attention_v3_fwd; otherwise fall back to the Triton version.
@@ -238,44 +212,3 @@ def flash_attention_fwd(q, k, v, o, cu_seqlens=None, max_seqlen=None):
         flash_attention_v3_fwd(q, k, v, o, cu_seqlens, max_seqlen)
     else:
         _flash_attention_triton_fwd(q, k, v, o, cu_seqlens, max_seqlen)
-
-
-def torch_att(q, k, v):
-    head_dim = q.shape[-1]
-    q = q.transpose(1, 2)
-    k = k.transpose(1, 2)
-    v = v.transpose(1, 2)
-    scale = head_dim ** -0.5
-    attn = (q * scale) @ k.transpose(-2, -1)
-    attn = attn.softmax(dim=-1)
-    out = attn @ v
-    out = out.transpose(1, 2).contiguous()
-    return out
-
-
-def test():
-    import torch
-    import numpy as np
-
-    B, L, H, D = 4, 1025, 7, 128
-    dtype = torch.float16
-    q = torch.empty((B, L, H, D), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
-    k = torch.empty((B, L, H, D), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
-    v = torch.empty((B, L, H, D), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
-    o = torch.empty((B, L, H, D), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
-    torch_out = torch_att(q, k, v)
-    import time
-
-    torch.cuda.synchronize()
-    a = time.time()
-    for i in range(100):
-        flash_attention_fwd(q, k, v, o)
-        # o = torch_att(q, k, v)
-    torch.cuda.synchronize()
-    b = time.time()
-    # print(o.shape, torch_out.shape)
-    print((b - a) / 100 * 1000)
-
-    print("max ", torch.max(torch.abs(torch_out - o)))
-    print("mean ", torch.mean(torch.abs(torch_out - o)))
-    assert torch.allclose(torch_out, o, atol=1e-2, rtol=0)
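
With this change the low-level entry points no longer accept 4-D padded inputs or optional cu_seqlens/max_seqlen; that handling now lives at the call sites (see the ViT layer above). A caller that still has equal-length 4-D tensors can reproduce the removed behavior with a small wrapper. A hypothetical sketch, not part of the commit:

import torch
from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd

def flash_attention_fwd_padded(q, k, v, o):
    """Hypothetical convenience wrapper for equal-length 4-D inputs.

    It reproduces, at the call site, the reshaping and cu_seqlens construction
    that this commit removed from _flash_attention_triton_fwd: contiguous
    (bs, seq, head, dim) tensors are viewed as packed (bs * seq, head, dim)
    and a uniform cu_seqlens is derived from the fixed sequence length.
    """
    assert q.ndim == 4 and q.shape == k.shape == v.shape == o.shape
    bs, seq_len, head_num, head_dim = q.shape
    total_len = bs * seq_len
    q, k, v, o3 = (t.view(total_len, head_num, head_dim) for t in (q, k, v, o))
    cu_seqlens = torch.arange(bs + 1, dtype=torch.int32, device=q.device) * seq_len
    max_seqlen = seq_len
    flash_attention_fwd(q, k, v, o3, cu_seqlens, max_seqlen)
    # o3 is a view of o, so the results are also visible in the caller's 4-D buffer.
    return o
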
Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
+import torch
+import triton
+import triton.language as tl
+import math
+import time
+import torch.nn.functional as F
+from typing import Optional, Tuple
+from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
+
+
+def reference_attention_varlen(q, k, v, cu):
+    """
+    q, k, v : (total_len, n_head, D)
+    cu_seqlen : prefix sums (batch+1,)
+    """
+    total, n_head, d = q.shape
+    out = torch.empty_like(q)
+    scale = 1.0 / math.sqrt(d)
+
+    for b in range(cu.numel() - 1):
+        s, e = cu[b].item(), cu[b + 1].item()
+        q_b, k_b, v_b = q[s:e], k[s:e], v[s:e]  # (seq, head, D)
+
+        q_hsd = q_b.permute(1, 0, 2)  # (head, seq, D)
+        k_hds = k_b.permute(1, 2, 0)  # (head, D, seq)
+        v_hsd = v_b.permute(1, 0, 2)  # (head, seq, D)
+
+        scores = torch.matmul(q_hsd, k_hds) * scale  # (head, seq, seq)
+        probs = torch.softmax(scores.float(), dim=-1)
+
+        out_hsd = torch.matmul(probs, v_hsd.float())  # (head, seq, D)
+        out[s:e] = out_hsd.permute(1, 0, 2).to(q.dtype)  # back to (seq, head, D)
+
+    return out
+
+
+def test_varlen(batch=4, heads=8, d=80, dtype=torch.bfloat16, atol=1e-2, device="cuda:0"):
+    torch.manual_seed(0)
+    lengths = torch.randint(1, 257, (batch,))
+    max_len = int(lengths.max().item())
+
+    cu = torch.zeros(batch + 1, dtype=torch.int32, device=device)
+    cu[1:] = torch.cumsum(lengths, 0)
+    tot = int(cu[-1])
+
+    q = torch.randn(tot, heads, d, dtype=dtype, device=device)
+    k = torch.randn_like(q)
+    v = torch.randn_like(q)
+    out_tri = torch.randn_like(q)
+    flash_attention_fwd(q, k, v, out_tri, cu, max_len)
+    a = time.time()
+    for _ in range(1000):
+        flash_attention_fwd(q, k, v, out_tri, cu, max_len)
+    b = time.time()
+    print(f"flash_attention_fwd time: {(b - a) / 1000 * 1000:.2f} ms")
+    out_ref = reference_attention_varlen(q, k, v, cu)
+
+    max_err = (out_ref - out_tri).abs().max().item()
+    mean_err = (out_ref - out_tri).abs().mean().item()
+    print(f"{dtype}: max {max_err:.6f}, mean {mean_err:.6f}")
+    torch.testing.assert_close(out_tri, out_ref, atol=atol, rtol=0)
+
+
+if __name__ == "__main__":
+    tests = [(torch.float16, 1e-2), (torch.bfloat16, 2e-2)]
+    for dt, tol in tests:
+        test_varlen(dtype=dt, atol=tol)
+    print("✓ variable-length Flash-Attention all dtypes pass")
