Merge branch 'qwen1-vl-mrope-fix' of https://github.com/ModelTC/lightllm into qwen2-vl-mrope-fix

shihaobai · shihaobai · commit 9bf9d5d935ab · 2025-12-15T07:33:13.000Z
diff --git a/lightllm/models/llama/model.py b/lightllm/models/llama/model.py
@@ -118,7 +118,7 @@ def _init_custom(self):
             scaling_type = rope_scaling["type"]
         else:
             raise ValueError(f"Unknown RoPE scaling format {rope_scaling}")
-        if scaling_type == "default":
+        if scaling_type == "default" or "mrope_section" in rope_scaling:
             self._init_to_get_rotary()
         elif scaling_type == "yarn":
             self._init_to_get_yarn_rotary()
@@ -129,7 +129,7 @@ def _init_custom(self):
         elif scaling_type == "llama3":
             self._init_to_get_llama3_rotary()
         elif scaling_type == "mrope":
-            self._init_to_get_mrope_rotary()
+            self._init_to_get_rotary()
         else:
             raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
         return
@@ -373,47 +373,3 @@ def _init_to_get_llama3_rotary(self, default_base=10000):
         self._cos_cached = torch.cos(freqs).to(self.data_type).cuda()
         self._sin_cached = torch.sin(freqs).to(self.data_type).cuda()
         return
-
-    def _init_to_get_mrope_rotary(self, default_base=10000):
-        partial_head_dim = int(self.config.get("partial_rotary_factor", 1) * self.head_dim_)
-        if self.config.get("rope_scaling", {}) is None:
-            rope_scaling_factor = 1.0
-        else:
-            rope_scaling_factor = self.config.get("rope_scaling", {}).get("factor", 1.0)
-
-        base = self.config.get("rope_theta", float(default_base))
-
-        if "max_sequence_length" in self.config:
-            max_seq_len = self.config["max_sequence_length"]
-        else:
-            max_position_embeddings = self.config.get(
-                "max_position_embeddings", 2048 if base <= 10000.0 + 1e-5 else 16384
-            )
-            max_seq_len = max_position_embeddings * rope_scaling_factor
-
-        # NTK
-        try:
-            ntk_alpha = float(os.environ.get("LIGHTLLM_NTK_ALPHA", 1))
-            assert ntk_alpha >= 1
-            if ntk_alpha > 1:
-                logger.info(f"Note: NTK enabled, alpha set to {ntk_alpha}")
-            max_seq_len *= ntk_alpha
-            base = base * (ntk_alpha ** (partial_head_dim / (partial_head_dim - 2)))  # Base change formula
-        except:
-            pass
-
-        self.inv_freq = 1.0 / (
-            base ** (torch.arange(0, partial_head_dim, 2, device="cpu", dtype=torch.float32) / partial_head_dim)
-        )
-
-        t = (
-            torch.arange(max(max_seq_len + 1024 * 128, self.max_seq_length), device="cpu", dtype=torch.float32)
-            / rope_scaling_factor
-        )
-        freqs = torch.outer(t, self.inv_freq)  # (T, D/2)
-        freqs = torch.cat((freqs, freqs), dim=-1)  # (T, D)
-
-        self._cos_cached = torch.cos(freqs).to(self.data_type).cuda()
-        self._sin_cached = torch.sin(freqs).to(self.data_type).cuda()
-
-        return
diff --git a/lightllm/models/qwen2_vl/infer_struct.py b/lightllm/models/qwen2_vl/infer_struct.py
@@ -31,8 +31,8 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
                 b_position_delta[batch_idx] = position_delta
             position_ids = self.position_ids + torch.tensor(b_position_delta, device=self.position_ids.device)
             position_ids = position_ids.unsqueeze(0).expand(3, -1)
-        self.position_cos = model._cos_cached[position_ids.unsqueeze(1)]  # (3, 1, L, D)
-        self.position_sin = model._sin_cached[position_ids.unsqueeze(1)]  # (3, 1, L, D)
+        self.position_cos = model._cos_cached[position_ids]  # (3, L, D)
+        self.position_sin = model._sin_cached[position_ids]  # (3, L, D)
         if get_env_start_args().enable_fa3:
             self.max_seq_len = self.max_kv_seq_len
             self.q_max_seq_len = self.max_q_seq_len
diff --git a/lightllm/models/qwen2_vl/triton_kernel/mrope.py b/lightllm/models/qwen2_vl/triton_kernel/mrope.py
@@ -21,6 +21,7 @@ def mrope_kernel(
     HALF: tl.constexpr,
     s_tok: tl.int32,
     s_ax: tl.int32,
+    s_d: tl.int32,
     q_sb: tl.int32,
     q_sh: tl.int32,
     q_sl: tl.int32,
@@ -77,7 +78,8 @@ def mrope_kernel(
     rot_vals = tl.where(offs < HALF, -rot_vals, rot_vals)
 
     axis_id = tl.load(AXIS_ptr + offs, mask=mask, other=0)  # 0,1,2
-    cos_idx = pid_l * s_tok + axis_id * s_ax + offs
+    idx_d = tl.where(offs < HALF, offs, offs - HALF)
+    cos_idx = pid_l * s_tok + axis_id * s_ax + idx_d * s_d
     c = tl.load(COS_ptr + cos_idx, mask=mask, other=0.0)
     s = tl.load(SIN_ptr + cos_idx, mask=mask, other=0.0)
 
@@ -101,12 +103,11 @@ def mrope_triton(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch
     qo_sb, qo_sh, qo_sl, qo_sd = map(int, q_out.stride())
     ko_sb, ko_sh, ko_sl, ko_sd = map(int, k_out.stride())
 
-    assert len(cos.shape) == 4
-    token_dim = 2
-    axis_dim = 0
+    assert len(cos.shape) == 3
 
-    s_token = int(cos.stride(token_dim))
-    s_axis = int(cos.stride(axis_dim))
+    s_axis = int(cos.stride(0))
+    s_token = int(cos.stride(1))
+    s_d = int(cos.stride(2))
 
     grid = (B * (H_q + H_k), L)
 
@@ -126,6 +127,7 @@ def mrope_triton(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch
         HALF,
         s_token,
         s_axis,
+        s_d,
         q_sb,
         q_sh,
         q_sl,
diff --git a/lightllm/models/qwen2_vl/triton_kernel/rotary_pos_emb.py b/lightllm/models/qwen2_vl/triton_kernel/rotary_pos_emb.py
@@ -34,8 +34,9 @@ def rotary_kernel(
     partner_d = tl.where(d < HALF_D, d + HALF_D, d - HALF_D)
 
     for pid_l in tl.range(pid_l_start, total_len, step=tl.num_programs(axis=1)):
-        cos_ptr_ = cos_ptr + pid_l * stride_cos_l + d
-        sin_ptr_ = sin_ptr + pid_l * stride_sin_l + d
+        idx_d = tl.where(d < HALF_D, d, d - HALF_D)
+        cos_ptr_ = cos_ptr + pid_l * stride_cos_l + idx_d * stride_cos_d
+        sin_ptr_ = sin_ptr + pid_l * stride_sin_l + idx_d * stride_sin_d
         cos = tl.load(cos_ptr_, mask=mask)
         sin = tl.load(sin_ptr_, mask=mask)
 
@@ -52,7 +53,7 @@ def rotary_kernel(
 
             y = x * cos + rotated * sin
 
-            out_ptr_ = out_ptr + base + d
+            out_ptr_ = out_ptr + base + d * stride_d
             tl.store(out_ptr_, y, mask=mask)
 
 
@@ -66,8 +67,8 @@ def apply_rotary_pos_emb_triton(
     orig_dtype = tensor.dtype
     x = tensor.float()
 
-    cos = cos.repeat(1, 2).view(cos.size(0), -1).contiguous().float()
-    sin = sin.repeat(1, 2).view(sin.size(0), -1).contiguous().float()
+    cos = cos.contiguous().float()
+    sin = sin.contiguous().float()
 
     L, H, D = x.shape
     HALF_D = D // 2
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
@@ -270,6 +270,7 @@ async def generate(
         start_time = time.time()
         request_headers = request.headers if request is not None else {}
         group_request_id = self.alloc_req_id(sampling_params, is_health_req)
+
         try:
             original_multimodal_params = None
             if self.is_multinode_tp_master: