fix pos_emb

sangchengmeng · sangchengmeng · commit bc7e97f03423 · 2025-11-25T10:07:46.000Z
diff --git a/lightllm/models/qwen2_vl/triton_kernel/rotary_pos_emb.py b/lightllm/models/qwen2_vl/triton_kernel/rotary_pos_emb.py
@@ -17,37 +17,43 @@ def rotary_kernel(
     stride_cos_d,
     stride_sin_l,
     stride_sin_d,
+    max_L,
     D: tl.constexpr,
     HALF_D: tl.constexpr,
     BLOCK_D: tl.constexpr,
+    BLOCK_L: tl.constexpr,
 ):
     pid_h = tl.program_id(0).to(tl.int64)
-    pid_l = tl.program_id(1).to(tl.int64)
-    pid_blk = tl.program_id(2).to(tl.int64)
+    pid_l_blk = tl.program_id(1).to(tl.int64)
+    pid_blk_d = tl.program_id(2).to(tl.int64)
 
     offs_d = tl.arange(0, BLOCK_D)
-    d = pid_blk * BLOCK_D + offs_d
+    d = pid_blk_d * BLOCK_D + offs_d
     mask = d < D
 
-    base = pid_l * stride_l + pid_h * stride_h
+    seq_start = pid_l_blk * BLOCK_L
+    seq_end = (pid_l_blk + 1) * BLOCK_L
+    seq_end = tl.where(seq_end < max_L, seq_end, max_L)
 
-    in_ptr = inp_ptr + base + d * stride_d
-    cos_ptr_ = cos_ptr + pid_l * stride_cos_l + d
-    sin_ptr_ = sin_ptr + pid_l * stride_sin_l + d
+    for seq in tl.range(seq_start, seq_end):
+        base = seq * stride_l + pid_h * stride_h
 
-    x = tl.load(in_ptr, mask=mask)
-    cos = tl.load(cos_ptr_, mask=mask)
-    sin = tl.load(sin_ptr_, mask=mask)
+        in_ptr = inp_ptr + base + d * stride_d
+        cos_ptr_ = cos_ptr + seq * stride_cos_l + d * stride_cos_d
+        sin_ptr_ = sin_ptr + seq * stride_sin_l + d * stride_sin_d
 
-    partner_d = tl.where(d < HALF_D, d + HALF_D, d - HALF_D)
-    partner_ptr = inp_ptr + base + partner_d * stride_d
-    partner_val = tl.load(partner_ptr, mask=mask)
-    rotated = tl.where(d < HALF_D, -partner_val, partner_val)
+        x = tl.load(in_ptr, mask=mask, other=0.0)
+        cos = tl.load(cos_ptr_, mask=mask, other=0.0)
+        sin = tl.load(sin_ptr_, mask=mask, other=0.0)
 
-    y = x * cos + rotated * sin
+        partner_d = tl.where(d < HALF_D, d + HALF_D, d - HALF_D)
+        partner_ptr = inp_ptr + base + partner_d * stride_d
+        partner_val = tl.load(partner_ptr, mask=mask, other=0.0)
+        rotated = tl.where(d < HALF_D, -partner_val, partner_val)
 
-    out_ptr_ = out_ptr + base + d
-    tl.store(out_ptr_, y, mask=mask)
+        y = x * cos + rotated * sin
+        out_ptr_ = out_ptr + base + d * stride_d
+        tl.store(out_ptr_, y, mask=mask)
 
 
 def apply_rotary_pos_emb_triton(
@@ -67,7 +73,8 @@ def apply_rotary_pos_emb_triton(
     HALF_D = D // 2
     y = torch.empty_like(x)
 
-    grid = (H, L, triton.cdiv(D, BLOCK_D))
+    BLOCK_L = 16
+    grid = (H, triton.cdiv(L, BLOCK_L), triton.cdiv(D, BLOCK_D))
 
     rotary_kernel[grid](
         inp_ptr=x,
@@ -81,9 +88,11 @@ def apply_rotary_pos_emb_triton(
         stride_cos_d=cos.stride(1),
         stride_sin_l=sin.stride(0),
         stride_sin_d=sin.stride(1),
+        max_L=L,
         D=D,
         HALF_D=HALF_D,
         BLOCK_D=BLOCK_D,
+        BLOCK_L=BLOCK_L,
     )
 
     return y.to(orig_dtype)