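Refactor mrope: gather per-token cos/sin by position_ids in the infer state and pass them to the fused Triton kernel instead of position_ids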

shihaobai · shihaobai · commit 08a3484e7098 · 2025-12-18T05:47:05.000Z
diff --git a/lightllm/models/qwen2_vl/infer_struct.py b/lightllm/models/qwen2_vl/infer_struct.py
@@ -33,8 +33,8 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
             self.position_ids = position_ids.unsqueeze(0).expand(3, -1)
 
         self.position_ids = self.position_ids.contiguous()
-        self._cos_cached = model._cos_cached
-        self._sin_cached = model._sin_cached
+        self.position_cos = model._cos_cached[self.position_ids]
+        self.position_sin = model._sin_cached[self.position_ids]
         if get_env_start_args().enable_fa3:
             self.max_seq_len = self.max_kv_seq_len
             self.q_max_seq_len = self.max_q_seq_len
diff --git a/lightllm/models/qwen2_vl/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen2_vl/layer_infer/transformer_layer_infer.py
@@ -21,9 +21,8 @@ def _get_qkv(self, input, infer_state, layer_weight):
         mrope_triton_fused(
             q.view(-1, self.tp_q_head_num_, self.head_dim_),
             cache_kv[:, : self.tp_k_head_num_, :],
-            infer_state._cos_cached,
-            infer_state._sin_cached,
-            infer_state.position_ids,
+            infer_state.position_cos,
+            infer_state.position_sin,
             self.mrope_section,
             is_interleaved=False,
         )
diff --git a/lightllm/models/qwen2_vl/triton_kernel/mrope.py b/lightllm/models/qwen2_vl/triton_kernel/mrope.py
@@ -74,40 +74,33 @@ def _mrope_triton_fused_kernel(
     Cos,
     Sin,
     mrope_section,
-    position_ids,
-    stride_positions,
+    stride_cosld,
+    stride_cosd,
+    stride_sinld,
+    stride_sind,
     stride_qbs,
     stride_qh,
     stride_qd,
     stride_kbs,
     stride_kh,
     stride_kd,
-    stride_cosbs,
-    stride_cosd,
-    stride_sinbs,
-    stride_sind,
     is_interleaved: tl.constexpr,
     HEAD_Q: tl.constexpr,
     HEAD_K: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
-    NUM_STAGE: tl.constexpr,
 ):
     head_index = tl.program_id(0)
     seq_index = tl.program_id(1)
 
     dim_range0 = tl.arange(0, BLOCK_DMODEL // 2)
     dim_range1 = dim_range0 + BLOCK_DMODEL // 2
 
-    t = tl.load(position_ids + 0 * stride_positions + seq_index)
-    h = tl.load(position_ids + 1 * stride_positions + seq_index)
-    w = tl.load(position_ids + 2 * stride_positions + seq_index)
-
-    t_cos = Cos + t * stride_cosbs
-    h_cos = Cos + h * stride_cosbs
-    w_cos = Cos + w * stride_cosbs
-    t_sin = Sin + t * stride_sinbs
-    h_sin = Sin + h * stride_sinbs
-    w_sin = Sin + w * stride_sinbs
+    t_cos = Cos + seq_index * stride_cosd
+    h_cos = Cos + stride_cosld + seq_index * stride_cosd
+    w_cos = Cos + 2 * stride_cosld + seq_index * stride_cosd
+    t_sin = Sin + seq_index * stride_sind
+    h_sin = Sin + stride_sinld + seq_index * stride_sind
+    w_sin = Sin + 2 * stride_sinld + seq_index * stride_sind
 
     mrope_section_t = tl.load(mrope_section + 0)
     mrope_section_h = tl.load(mrope_section + 1)
@@ -198,7 +191,6 @@ def mrope_triton_fused(
     k: torch.Tensor,
     cos: torch.Tensor,
     sin: torch.Tensor,
-    position_ids: torch.Tensor,
     mrope_section: torch.Tensor,
     is_interleaved: bool,
     run_config: Optional[dict] = None,
@@ -224,24 +216,21 @@ def mrope_triton_fused(
         k=k,
         Cos=cos,
         Sin=sin,
-        position_ids=position_ids,
         mrope_section=mrope_section,
-        stride_positions=position_ids.stride(0),
+        stride_cosld=cos.stride(0),
+        stride_cosd=cos.stride(1),
+        stride_sinld=sin.stride(0),
+        stride_sind=sin.stride(1),
         stride_qbs=q.stride(0),
         stride_qh=q.stride(1),
         stride_qd=q.stride(2),
         stride_kbs=k.stride(0),
         stride_kh=k.stride(1),
         stride_kd=k.stride(2),
-        stride_cosbs=cos.stride(0),
-        stride_cosd=cos.stride(1),
-        stride_sinbs=sin.stride(0),
-        stride_sind=sin.stride(1),
         is_interleaved=is_interleaved,
         HEAD_Q=head_num_q,
         HEAD_K=head_num_k,
         BLOCK_DMODEL=head_dim,
-        NUM_STAGE=num_stages,
         num_warps=num_warps,
         num_stages=num_stages,
     )
diff --git a/lightllm/models/qwen3_vl/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_vl/layer_infer/transformer_layer_infer.py
@@ -51,9 +51,8 @@ def _get_qkv(
         mrope_triton_fused(
             q.view(-1, self.tp_q_head_num_, self.head_dim_),
             cache_kv[:, : self.tp_k_head_num_, :],
-            infer_state._cos_cached,
-            infer_state._sin_cached,
-            infer_state.position_ids,
+            infer_state.position_cos,
+            infer_state.position_sin,
             self.mrope_section,
             is_interleaved=True,
         )
diff --git a/unit_tests/models/qwen2_vl/test_mrope.py b/unit_tests/models/qwen2_vl/test_mrope.py
@@ -54,9 +54,8 @@ def test_mrope_triton_correctness(B, H_q, H_k, L, D, mrope_section):
 
     q = q.transpose(1, 2).contiguous().view(L, H_q, D)
     k = k.transpose(1, 2).contiguous().view(L, H_k, D)
-    position_ids = torch.arange(L, dtype=torch.int32, device="cuda").unsqueeze(0).expand(3, L).contiguous()
     mrope_section = torch.tensor(mrope_section, dtype=torch.int32, device="cuda")
-    mrope_triton_fused(q, k, cos_half[0], sin_half[0], position_ids, mrope_section, is_interleaved=False)
+    mrope_triton_fused(q, k, cos_half, sin_half, mrope_section, is_interleaved=False)
     q = q.transpose(0, 1).contiguous().view(B, H_q, L, D)
     k = k.transpose(0, 1).contiguous().view(B, H_k, L, D)
     assert torch.allclose(q, ref_q, rtol=1e-3, atol=1e-3)