qwen2vl-fix

sangchengmeng · sangchengmeng · commit bea6d68cf6ca · 2025-04-15T19:15:04.000+08:00
diff --git a/lightllm/models/llama/model.py b/lightllm/models/llama/model.py
@@ -88,6 +88,11 @@ def _init_custom(self):
             and self.config.get("rope_scaling", {}).get("rope_type", "base") == "llama3"
         ):
             self._init_to_get_llama3_rotary()
+        elif (
+            self.config.get("rope_scaling", None) is not None
+            and self.config.get("rope_scaling", {}).get("type", "base") == "mrope"
+        ):
+            self._init_to_get_mrope_rotary()
         else:
             self._init_to_get_rotary()
         return
@@ -332,3 +337,47 @@ def _init_to_get_llama3_rotary(self, default_base=10000):
         self._cos_cached = torch.cos(freqs).to(self.data_type).cuda()
         self._sin_cached = torch.sin(freqs).to(self.data_type).cuda()
         return
+
+    def _init_to_get_mrope_rotary(self, default_base=10000):
+        """Build the mrope cos/sin rotary tables, stored in _cos_cached / _sin_cached.
+
+        One 1-D position table is expanded into three identical planes, giving shape (3, len, dim).
+        LIGHTLLM_NTK_ALPHA (>= 1) enables NTK scaling; an invalid value is logged and NTK is skipped.
+        """
+        partial_head_dim = int(self.config.get("partial_rotary_factor", 1) * self.head_dim_)
+        # "rope_scaling" may be missing or explicitly null in the config.
+        rope_scaling = self.config.get("rope_scaling") or {}
+        rope_scaling_factor = rope_scaling.get("factor", 1.0)
+        base = self.config.get("rope_theta", float(default_base))
+
+        if "max_sequence_length" in self.config:
+            max_seq_len = self.config["max_sequence_length"]
+        else:
+            default_len = 2048 if base <= 10000.0 + 1e-5 else 16384
+            max_seq_len = self.config.get("max_position_embeddings", default_len) * rope_scaling_factor
+
+        # NTK: still best effort, but only expected failures are caught and both values change together.
+        try:
+            ntk_alpha = float(os.environ.get("LIGHTLLM_NTK_ALPHA", 1))
+            if ntk_alpha < 1:
+                raise ValueError(f"LIGHTLLM_NTK_ALPHA must be >= 1, got {ntk_alpha}")
+            # Base change formula; the division fails when partial_head_dim == 2.
+            ntk_base = base * (ntk_alpha ** (partial_head_dim / (partial_head_dim - 2)))
+        except (ValueError, ArithmeticError) as err:
+            logger.warning(f"NTK scaling skipped: {err}")
+        else:
+            if ntk_alpha > 1:
+                logger.info(f"Note: NTK enabled, alpha set to {ntk_alpha}")
+            max_seq_len, base = max_seq_len * ntk_alpha, ntk_base
+
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, partial_head_dim, 2, device="cpu", dtype=torch.float32) / partial_head_dim)
+        )
+        table_len = max(max_seq_len + 1024 * 128, self.max_seq_length)
+        t = torch.arange(table_len, device="cpu", dtype=torch.float32) / rope_scaling_factor
+        # (len, dim/2) -> three identical planes (3, len, dim/2) -> duplicated along the last dim.
+        freqs = torch.outer(t, inv_freq).unsqueeze(0).expand(3, -1, -1)
+        freqs = torch.cat((freqs, freqs), dim=-1)
+        self._cos_cached = torch.cos(freqs).to(self.data_type).cuda()
+        self._sin_cached = torch.sin(freqs).to(self.data_type).cuda()
+        return
diff --git a/lightllm/models/qwen2_vl/infer_struct.py b/lightllm/models/qwen2_vl/infer_struct.py
@@ -0,0 +1,26 @@
+import torch
+import numpy as np
+from lightllm.models.llama.infer_struct import LlamaInferStateInfo
+
+
+class Qwen2VLInferStateInfo(LlamaInferStateInfo):
+    """Infer state that gathers per-position slices of the model's cached cos/sin tables."""
+
+    def __init__(self):
+        super().__init__()
+        self.position_cos = self.position_sin = None
+
+    def init_some_extra_state(self, model, input_ids: torch.Tensor):
+        """Fill position_sin / position_cos from model._sin_cached / model._cos_cached.
+
+        Prefill uses positions 0..len-1 of every request, concatenated; decode uses b_seq_len - 1.
+        """
+        if not self.is_prefill:
+            position_ids = self.b_seq_len - 1
+        else:
+            seq_lens = self.b_seq_len.cpu().numpy()
+            self.max_seq_len = seq_lens.max()
+            position_ids = torch.from_numpy(np.concatenate([np.arange(0, n) for n in seq_lens])).cuda()
+        # Index dim 1 of the tables, then insert a singleton dim at position 1.
+        self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1)
+        self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1)
diff --git a/lightllm/models/qwen2_vl/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen2_vl/layer_infer/transformer_layer_infer.py
@@ -0,0 +1,71 @@
+import torch
+import torch.functional as F
+import torch.distributed as dist
+import numpy as np
+
+from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer
+
+import torch.nn as nn
+from functools import partial
+
+
+def rotate_half(x):
+    """Swap the two halves of the last dim, negating the one moved to the front: (a, b) -> (-b, a)."""
+    half = x.shape[-1] // 2
+    return torch.cat((-x[..., half:], x[..., :half]), dim=-1)
+
+
+def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    """Rotate q and k; chunk i of the last dim takes its cos/sin from index i % 3 of dim 0."""
+    sections = mrope_section * 2  # for a list this repeats the section sizes, it does not scale them
+    def _mix(table):
+        picked = [part[idx % 3] for idx, part in enumerate(table.split(sections, dim=-1))]
+        return torch.cat(picked, dim=-1).unsqueeze(unsqueeze_dim)
+
+    cos, sin = _mix(cos), _mix(sin)
+    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
+
+
+class Qwen2RMSNorm(nn.Module):
+    """RMSNorm that holds no weight of its own; the scale is passed to forward()."""
+
+    def __init__(self, hidden_size, device, eps=1e-6):
+        super().__init__()  # hidden_size and device are accepted but not used here
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states, weight):
+        """Normalize over the last dim in float32, scale by weight, cast back to the input dtype."""
+        x = hidden_states.to(torch.float32)
+        mean_sq = x.pow(2).mean(-1, keepdim=True)
+        return (weight * (x * torch.rsqrt(mean_sq + self.variance_epsilon))).to(hidden_states.dtype)
+
+
+class Qwen2VLTransformerLayerInfer(LlamaTransformerLayerInfer):
+    """Llama transformer layer variant that applies multimodal rotary embedding (M-RoPE) to q and k."""
+
+    def __init__(self, layer_num, network_config, mode=None):  # None instead of a shared mutable [] default
+        super().__init__(layer_num, network_config, [] if mode is None else mode)
+        self.mrope_section = network_config["rope_scaling"]["mrope_section"]
+        eps = network_config.get("rms_norm_eps", 1e-06)
+        self.norm_fwd = Qwen2RMSNorm(network_config["hidden_size"], device="cuda", eps=eps)
+
+    def _bind_norm(self):
+        # Only the FFN norm is bound to the Llama implementation; _att_norm is the method defined below.
+        self._ffn_norm = partial(LlamaTransformerLayerInfer._ffn_norm, self)
+
+    def _att_norm(self, input_embedding, infer_state, layer_weight) -> torch.Tensor:
+        return self.norm_fwd(input_embedding, weight=layer_weight.att_norm_weight_.weight)
+
+    def _get_qkv(self, input, cache_kv, infer_state, layer_weight):
+        """Project input to q and kv, apply M-RoPE to q and the k heads, return (q, cache_kv)."""
+        kv_heads = self.tp_k_head_num_ + self.tp_v_head_num_
+        q = layer_weight.q_proj.mm(input)
+        kv_flat = layer_weight.kv_proj.mm(input, out=cache_kv.view(-1, kv_heads * self.head_dim_))
+        cache_kv = kv_flat.view(-1, kv_heads, self.head_dim_)
+        seq_len, _ = q.shape
+        q = q.view(1, seq_len, -1, self.head_dim_).transpose(1, 2)
+        k = cache_kv[:, : self.tp_k_head_num_, :].view(1, seq_len, -1, self.head_dim_).transpose(1, 2)
+        cos, sin = infer_state.position_cos, infer_state.position_sin
+        new_q, new_k = apply_multimodal_rotary_pos_emb(q, k, cos, sin, self.mrope_section)
+        cache_kv[:, : self.tp_k_head_num_, :] = new_k.squeeze(0).permute(1, 0, 2)  # write rotated k back
+        return new_q.transpose(1, 2).reshape(1, seq_len, -1), cache_kv
diff --git a/lightllm/models/qwen2_vl/model.py b/lightllm/models/qwen2_vl/model.py
@@ -12,17 +12,16 @@
 from typing import List, Optional, Union
 from transformers.utils import TensorType, logging
 from lightllm.common.build_utils import repair_config
+from lightllm.models.qwen2_vl.infer_struct import Qwen2VLInferStateInfo
+from lightllm.models.qwen2_vl.layer_infer.transformer_layer_infer import Qwen2VLTransformerLayerInfer
 
-# from lightllm.models.qwen2_vl.vision_process import Qwen2VLImageProcessor
 import torch
 from PIL import Image
 from .vision_process import smart_resize
 from lightllm.models.qwen2.layer_weights import transformer_layer_weight, pre_and_post_layer_weight
 from lightllm.models.qwen2.model import Qwen2TpPartModel
 import os
 
-# from lightllm.models.qwen2_vl.layer_weight.pre_and_post_layer_weight import Qwen2VLPreAndPostLayerWeight
-
 # Warp of the origal tokenizer
 class QWen2VLTokenizer:
     def __init__(self, tokenizer=None, image_processor=None, **kwargs):
@@ -89,10 +88,10 @@ def __getattr__(self, name):
 
 class Qwen2VLTpPartModel(Qwen2TpPartModel):
 
-    # weight class
-    # pre_and_post_weight_class = Qwen2VLPreAndPostLayerWeight
-    # infer class
     pre_layer_infer_class = LlamaMultimodalPreLayerInfer
+    transformer_layer_infer_class = Qwen2VLTransformerLayerInfer
+
+    infer_state_class = Qwen2VLInferStateInfo
 
     def __init__(self, kvargs):
         super().__init__(kvargs)