
Commit 696be85

fix
1 parent 4356d2e commit 696be85


3 files changed: +9 -19 lines changed


lightllm/common/basemodel/layer_infer/cache_tensor_manager.py

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ def alloc_tensor(
         # shape type conversion
         if isinstance(shape, list):
             shape = torch.Size(shape)
-
+
         # when the cache manager is not in normal use
         if not self.cache_env_ok:
             return torch.empty(shape, dtype=data_type, device=device, requires_grad=False)
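Note: apart from the change on line 138 (a blank line rewritten, most likely whitespace only), the surrounding logic normalizes list shapes to torch.Size and falls back to a plain torch.empty when the cache environment is inactive. A minimal standalone sketch of that fallback path (the function name, defaults, and cache_env_ok flag here are illustrative stand-ins, not the manager's real interface):

import torch

# Illustrative stand-in for the fallback path shown in the hunk above: normalize a
# list shape to torch.Size, then allocate a plain tensor when the cache manager is
# not in normal use.
def alloc_tensor_fallback(shape, data_type=torch.float16, device="cpu", cache_env_ok=False):
    if isinstance(shape, list):
        shape = torch.Size(shape)  # shape type conversion, as in the original code
    if not cache_env_ok:
        # cache manager not in normal use: allocate directly, without gradients
        return torch.empty(shape, dtype=data_type, device=device, requires_grad=False)
    raise NotImplementedError("cache-backed allocation is outside this sketch")

buf = alloc_tensor_fallback([4, 1024], data_type=torch.bfloat16)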

lightllm/models/vit/layer_infer/transformer_layer_infer.py

Lines changed: 2 additions & 7 deletions
@@ -1,13 +1,8 @@
 import torch
-import torch.functional as F
 import torch.distributed as dist
-import numpy as np
-from typing import Tuple
-from functools import partial
-import triton
+
 
 from lightllm.models.vit.layer_weights.transformer_layer_weight import ViTTransformerLayerWeight
-from lightllm.models.llama.triton_kernel.rmsnorm import rmsnorm_forward, torch_rms_norm
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
 from lightllm.utils.dist_utils import get_current_rank_in_dp, get_dp_world_size
 from lightllm.models.vit.triton_kernel.gelu_vit import gelu_fwd
@@ -108,7 +103,7 @@ def _context_attention_kernel(self, q, k, v) -> torch.Tensor:
         reshape = lambda t: t.view(total_len, head_num, head_dim)
         q, k, v, out = map(reshape, (q, k, v, out))
         cu_seqlens = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device) * seq_len
-        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+        max_seqlen = seq_len
         flash_attention_fwd(q, k, v, out, cu_seqlens, max_seqlen)
         return out.reshape(batch_size, seq_len, -1)
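Note on the second hunk: the ViT batch is built from equal-length sequences, so cu_seqlens is an arithmetic progression with step seq_len and the maximum of its consecutive differences is seq_len by construction. Using the constant skips the .max().item() reduction, which would force a device-to-host synchronization when cu_seqlens lives on the GPU. A small sketch of the equivalence (CPU tensors, illustrative sizes):

import torch

batch_size, seq_len = 4, 1025  # illustrative values

# cu_seqlens as built in _context_attention_kernel: 0, seq_len, 2*seq_len, ...
cu_seqlens = torch.arange(batch_size + 1, dtype=torch.int32) * seq_len

# Old: reduce over per-sequence lengths, then sync to the host with .item()
max_seqlen_old = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()

# New: every sequence has length seq_len, so the maximum is seq_len itself
max_seqlen_new = seq_len

assert max_seqlen_old == max_seqlen_new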

unit_tests/models/vit/test_flash_attention_forward.py

Lines changed: 6 additions & 11 deletions
@@ -1,10 +1,7 @@
 import torch
-import triton
-import triton.language as tl
 import math
 import time
-import torch.nn.functional as F
-from typing import Optional, Tuple
+import pytest
 from lightllm.models.vit.triton_kernel.flashattention_nopad import flash_attention_fwd
 
 
@@ -34,7 +31,8 @@ def reference_attention_varlen(q, k, v, cu):
     return out
 
 
-def test_varlen(batch=4, heads=8, d=80, dtype=torch.bfloat16, atol=1e-2, device="cuda:0"):
+@pytest.mark.parametrize("dtype,atol", [(torch.float16, 1e-2), (torch.bfloat16, 2e-2)])
+def test_varlen(dtype, atol, batch=4, heads=8, d=80, device="cuda:0"):
     torch.manual_seed(0)
     lengths = torch.randint(1, 257, (batch,))
     max_len = int(lengths.max().item())
@@ -49,10 +47,10 @@ def test_varlen(batch=4, heads=8, d=80, dtype=torch.bfloat16, atol=1e-2, device="cuda:0"):
     out_tri = torch.randn_like(q)
     flash_attention_fwd(q, k, v, out_tri, cu, max_len)
     a = time.time()
-    for _ in range(1000):
+    for _ in range(100):
         flash_attention_fwd(q, k, v, out_tri, cu, max_len)
     b = time.time()
-    print(f"flash_attention_fwd time: {(b - a) / 1000 * 1000:.2f} ms")
+    print(f"flash_attention_fwd time: {(b - a) / 100 * 1000:.2f} ms")
     out_ref = reference_attention_varlen(q, k, v, cu)
 
     max_err = (out_ref - out_tri).abs().max().item()
@@ -62,7 +60,4 @@ def test_varlen(batch=4, heads=8, d=80, dtype=torch.bfloat16, atol=1e-2, device="cuda:0"):
 
 
 if __name__ == "__main__":
-    tests = [(torch.float16, 1e-2), (torch.bfloat16, 2e-2)]
-    for dt, tol in tests:
-        test_varlen(dtype=dt, atol=tol)
-    print("✓ variable-length Flash-Attention all dtypes pass")
+    pytest.main()
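Note: with the parametrize decorator, the float16 and bfloat16 cases run as separate pytest test cases, and the __main__ block now defers to pytest. One way to run just this file (the arguments here are examples; a CUDA device is required since the test allocates on "cuda:0"):

import pytest

# -s keeps the timing print visible; the path is the test file shown in this diff.
pytest.main(["-s", "unit_tests/models/vit/test_flash_attention_forward.py"])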
