
Commit 974d775

shihaobai, sangchengmeng, and wangzaijun authored

mrope improved (#1147)

Co-authored-by: sangchengmeng <[email protected]>
Co-authored-by: wangzaijun <[email protected]>

1 parent 8ddcadc, commit 974d775

14 files changed: +387 additions, -125 deletions

lightllm/models/llama/model.py

Lines changed: 2 additions & 46 deletions
@@ -118,7 +118,7 @@ def _init_custom(self):
             scaling_type = rope_scaling["type"]
         else:
             raise ValueError(f"Unknown RoPE scaling format {rope_scaling}")
-        if scaling_type == "default":
+        if scaling_type == "default" or "mrope_section" in rope_scaling:
             self._init_to_get_rotary()
         elif scaling_type == "yarn":
             self._init_to_get_yarn_rotary()
@@ -129,7 +129,7 @@ def _init_custom(self):
         elif scaling_type == "llama3":
             self._init_to_get_llama3_rotary()
         elif scaling_type == "mrope":
-            self._init_to_get_mrope_rotary()
+            self._init_to_get_rotary()
         else:
             raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
         return
@@ -373,47 +373,3 @@ def _init_to_get_llama3_rotary(self, default_base=10000):
         self._cos_cached = torch.cos(freqs).to(self.data_type).cuda()
         self._sin_cached = torch.sin(freqs).to(self.data_type).cuda()
         return
-
-    def _init_to_get_mrope_rotary(self, default_base=10000):
-        partial_head_dim = int(self.config.get("partial_rotary_factor", 1) * self.head_dim_)
-        if self.config.get("rope_scaling", {}) is None:
-            rope_scaling_factor = 1.0
-        else:
-            rope_scaling_factor = self.config.get("rope_scaling", {}).get("factor", 1.0)
-
-        base = self.config.get("rope_theta", float(default_base))
-
-        if "max_sequence_length" in self.config:
-            max_seq_len = self.config["max_sequence_length"]
-        else:
-            max_position_embeddings = self.config.get(
-                "max_position_embeddings", 2048 if base <= 10000.0 + 1e-5 else 16384
-            )
-            max_seq_len = max_position_embeddings * rope_scaling_factor
-
-        # NTK
-        try:
-            ntk_alpha = float(os.environ.get("LIGHTLLM_NTK_ALPHA", 1))
-            assert ntk_alpha >= 1
-            if ntk_alpha > 1:
-                logger.info(f"Note: NTK enabled, alpha set to {ntk_alpha}")
-            max_seq_len *= ntk_alpha
-            base = base * (ntk_alpha ** (partial_head_dim / (partial_head_dim - 2)))  # Base change formula
-        except:
-            pass
-
-        inv_freq = 1.0 / (
-            base ** (torch.arange(0, partial_head_dim, 2, device="cpu", dtype=torch.float32) / partial_head_dim)
-        )
-
-        t = (
-            torch.arange(max(max_seq_len + 1024 * 128, self.max_seq_length), device="cpu", dtype=torch.float32)
-            / rope_scaling_factor
-        )
-        freqs = torch.outer(t, inv_freq).unsqueeze(0).expand(3, -1, -1)
-        freqs = torch.cat((freqs, freqs), dim=-1)
-
-        self._cos_cached = torch.cos(freqs).to(self.data_type).cuda()
-        self._sin_cached = torch.sin(freqs).to(self.data_type).cuda()
-
-        return
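Note on the change above: with _init_to_get_mrope_rotary removed, the mrope path reuses the plain _init_to_get_rotary cache, and only the indexing differs. A minimal sketch of the idea, using illustrative shapes and values rather than lightllm's actual cache-building code:

import torch

# Sketch only: a single 2-D rotary cache serves mrope because the three
# position axes (temporal / height / width) all index the same table.
max_len, rot_dim, base = 8192, 128, 10000.0          # illustrative values
inv_freq = 1.0 / (base ** (torch.arange(0, rot_dim, 2, dtype=torch.float32) / rot_dim))
t = torch.arange(max_len, dtype=torch.float32)
freqs = torch.outer(t, inv_freq)                     # (max_len, rot_dim // 2)
cos_cached, sin_cached = torch.cos(freqs), torch.sin(freqs)

position_ids = torch.zeros(3, 10, dtype=torch.long)  # (3, L) mrope position ids
position_cos = cos_cached[position_ids]              # (3, L, rot_dim // 2), as used in Qwen2VLInferStateInfo below
position_sin = sin_cached[position_ids]

This is why the dedicated 3-axis cache, which expanded freqs to (3, L, D) up front, is no longer needed.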

lightllm/models/qwen2_vl/flashattention_infer_struct.py

Lines changed: 0 additions & 30 deletions
This file was deleted.
lightllm/models/qwen2_vl/infer_struct.py

Lines changed: 63 additions & 10 deletions
@@ -1,10 +1,16 @@
+from typing import Optional, List
 import torch
 import numpy as np
 from lightllm.models.llama.infer_struct import LlamaInferStateInfo
 from lightllm.common.basemodel.infer_struct import InferStateInfo
+from lightllm.models.qwen2_vl.triton_kernel.get_mrope_position_ids import get_mrope_position_triton
+from lightllm.models.llama.flashattention_infer_struct import FlashAttentionStateInfo
+from lightllm.utils.envs_utils import get_env_start_args


 class Qwen2VLInferStateInfo(LlamaInferStateInfo):
+    init_flash_attention_state_func = FlashAttentionStateInfo._init_flash_attention_state
+
     def __init__(self):
         super().__init__()
         self.position_cos = None
@@ -13,17 +19,64 @@ def __init__(self):
     def init_some_extra_state(self, model, input_ids: torch.Tensor):
         rope_scaling = model.config.get("rope_scaling", {})
         self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))
-        if self.rope_type != "mrope":
-            super().init_some_extra_state(model, input_ids)
-            return
         InferStateInfo.init_some_extra_state(self, model, input_ids)
         if self.is_prefill:
-            position_ids = self.position_ids
-            self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1)
-            self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1)
-            position_ids = None
+            self.position_ids = self.get_mrope_position(self.multimodal_params)
         else:
-            position_ids = self.position_ids
-            self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1)
-            self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1)
+            b_position_delta = [0 for _ in range(self.b_seq_len.shape[0])]
+            for batch_idx, p in enumerate(self.multimodal_params):
+                position_delta = 0
+                for image in p["images"]:
+                    position_delta += image["grid_thwd"][3]
+                b_position_delta[batch_idx] = position_delta
+            position_ids = self.position_ids + torch.tensor(b_position_delta, device=self.position_ids.device)
+            self.position_ids = position_ids.unsqueeze(0).expand(3, -1)
+
+        self.position_ids = self.position_ids.contiguous()
+        self.position_cos = model._cos_cached[self.position_ids]  # (3, L, D)
+        self.position_sin = model._sin_cached[self.position_ids]  # (3, L, D)
+        if get_env_start_args().enable_fa3:
+            self.max_seq_len = self.max_kv_seq_len
+            self.q_max_seq_len = self.max_q_seq_len
+            self.init_flash_attention_state_func(model, input_ids)
         return
+
+    def get_mrope_position(self, multimodal_params: List[dict]) -> torch.Tensor:
+        if len(multimodal_params) == 0:
+            return self.position_ids.unsqueeze(0).expand(3, -1)
+        b_image_start_idx = []
+        b_image_nums = []
+        b_image_start_num = []
+        b_image_len = []
+        image_start_num = 0
+        b_image_thwd = []
+        for _, p in enumerate(multimodal_params):
+            images = p.get("images", [])
+            for img in images:
+                b_image_start_idx.append(img["start_idx"])
+                b_image_len.append(img["token_num"])
+                b_image_thwd.append(img["grid_thwd"])
+            b_image_nums.append(len(images))
+            b_image_start_num.append(image_start_num)
+            image_start_num += len(images)
+        # no images at all
+        if image_start_num == 0:
+            return self.position_ids.unsqueeze(0).expand(3, -1).contiguous()
+        b_image_start_idx = torch.tensor(b_image_start_idx, device="cpu").cuda(non_blocking=True)
+        b_image_thwd = torch.tensor(b_image_thwd, device="cpu").cuda(non_blocking=True)  # image_num x 4
+        b_image_nums = torch.tensor(b_image_nums, device="cpu").cuda(non_blocking=True)
+        b_image_start_num = torch.tensor(b_image_start_num, device="cpu").cuda(non_blocking=True)
+        b_image_len = torch.tensor(b_image_len, device=self.position_ids.device)
+        position_ids = self.position_ids.unsqueeze(0).expand(3, -1).contiguous()
+        get_mrope_position_triton(
+            b_image_start_idx=b_image_start_idx,
+            b_image_thwd=b_image_thwd,
+            b_image_nums=b_image_nums,
+            b_image_start_num=b_image_start_num,
+            b_image_len=b_image_len,
+            position_ids=position_ids,
+            b_ready_cache_len=self.b_ready_cache_len,
+            b_q_seq_len=self.b_q_seq_len,
+            b_start_loc=self.b_start_loc,
+        )
+        return position_ids
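For reference, the layout that get_mrope_position_triton is expected to fill follows the Qwen2-VL mrope scheme: text tokens advance all three axes together, image tokens take temporal/height/width indices from the image grid, and text after an image resumes from the largest grid extent, which is where the negative grid_thwd[3] delta used in the decode branch comes from. A CPU-side sketch of that scheme (a hedged reference, not the Triton kernel itself; it only relies on the start_idx and grid_thwd fields set by the tokenizer):

import torch

def mrope_positions(seq_len, images, start=0):
    # Reference sketch: returns a (3, seq_len) tensor of (temporal, height, width) ids.
    # Each entry of `images` mirrors the fields used above: "start_idx" (index of the
    # first image token in the prompt) and "grid_thwd", whose first three values are
    # the merged (t, h, w) grid of the image.
    pos = torch.empty(3, seq_len, dtype=torch.long)
    cur, idx = start, 0                                      # next text position / current token index
    for img in sorted(images, key=lambda im: im["start_idx"]):
        s = img["start_idx"]
        t, h, w = img["grid_thwd"][:3]
        pos[:, idx:s] = torch.arange(cur, cur + (s - idx))   # text: all axes move together
        cur += s - idx
        ti, hi, wi = torch.meshgrid(
            torch.arange(t), torch.arange(h), torch.arange(w), indexing="ij"
        )
        n_img = t * h * w
        pos[0, s:s + n_img] = cur + ti.reshape(-1)           # temporal axis
        pos[1, s:s + n_img] = cur + hi.reshape(-1)           # height axis
        pos[2, s:s + n_img] = cur + wi.reshape(-1)           # width axis
        cur += max(t, h, w)                                  # text resumes after the widest axis
        idx = s + n_img
    pos[:, idx:] = torch.arange(cur, cur + (seq_len - idx))  # trailing text
    return pos

Under this scheme every image shifts all later positions by max(t, h, w) - token_num relative to plain 0..L-1 ids, which is exactly the per-image grid_thwd[3] delta that init_some_extra_state accumulates during decode.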

lightllm/models/qwen2_vl/layer_infer/transformer_layer_infer.py

Lines changed: 0 additions & 2 deletions
@@ -19,8 +19,6 @@ def __init__(self, layer_num, network_config, mode=[]):
         self.axis_map = torch.tensor(axis_map, dtype=torch.int32, device="cuda")

     def _get_qkv(self, input, infer_state, layer_weight):
-        if infer_state.rope_type != "mrope":
-            return super()._get_qkv(input, infer_state, layer_weight)
         q = layer_weight.q_proj.mm(input)
         cache_kv = layer_weight.kv_proj.mm(input).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
         seq_len, _ = q.shape
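With the non-mrope fallback removed, _get_qkv always works with the three-axis cos/sin tables. Conceptually they are collapsed from (3, L, D) to (L, D) by picking one axis per rotary dimension, which is what the axis_map derived from mrope_section encodes. A hedged sketch of that selection step, under assumed axis_map semantics (this is not lightllm's actual rotary kernel):

import torch

def select_mrope_cos_sin(cos3, sin3, axis_map):
    # cos3 / sin3: (3, L, D) tables gathered with the (3, L) mrope position ids.
    # axis_map:    (D,) int tensor assigning every rotary dimension to axis 0/1/2
    #              (temporal / height / width), e.g. built from mrope_section.
    axis_map = axis_map.long()
    dim_idx = torch.arange(cos3.shape[-1], device=cos3.device)
    cos = cos3[axis_map, :, dim_idx].T   # choose an axis per dimension -> (L, D)
    sin = sin3[axis_map, :, dim_idx].T
    return cos, sin

The resulting (L, D) tables can then be applied by an ordinary rotary embedding routine.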

lightllm/models/qwen2_vl/model.py

Lines changed: 7 additions & 19 deletions
@@ -1,28 +1,15 @@
 import json
 import numpy as np
-import unicodedata
 from lightllm.common.basemodel.multimodal_tokenizer import BaseMultiModalTokenizer
-from lightllm.models.qwen.model import QWenTpPartModel
 from lightllm.models.qwen_vl.layer_infer.pre_layer_infer import LlamaMultimodalPreLayerInfer
 from lightllm.server.multimodal_params import AudioItem, MultimodalParams, ImageItem
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import ImageInput
-from transformers.processing_utils import ProcessorMixin
 from lightllm.server.core.objs import SamplingParams
-from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
-from typing import List, Optional, Union
-from transformers.utils import TensorType, logging
-from lightllm.models.qwen2_vl.flashattention_infer_struct import Qwen2VLFlashAttentionStateInfo
 from lightllm.common.build_utils import repair_config
 from lightllm.models.registry import ModelRegistry
 from lightllm.models.qwen2_vl.infer_struct import Qwen2VLInferStateInfo
 from lightllm.models.qwen2_vl.layer_infer.transformer_layer_infer import Qwen2VLTransformerLayerInfer

-import torch
-from PIL import Image
 from .vision_process import smart_resize
-from lightllm.utils.envs_utils import enable_env_vars, get_env_start_args
-from lightllm.models.qwen2.layer_weights import transformer_layer_weight, pre_and_post_layer_weight
 from lightllm.models.qwen2.model import Qwen2TpPartModel
 import os

@@ -57,6 +44,9 @@ def get_image_token_length(self, img: ImageItem):
         )
         grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
         token_num = (grid_h * grid_w) // (self.merge_size ** 2)
+        position_delta = max(grid_h // self.merge_size, grid_w // self.merge_size) - token_num
+        # the delta is prepared for mrope: it records the position_id offset introduced by the image tokens
+        img.grid_thwd = (1, grid_h // self.merge_size, grid_w // self.merge_size, position_delta)
         return token_num

     def get_audio_token_length(self, audio: AudioItem):
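A quick worked example of the new grid_thwd bookkeeping; the 448x448 resolution, patch_size of 14 and merge_size of 2 below are illustrative values, not read from any config:

grid_h = grid_w = 448 // 14                                  # 32 patches per side
token_num = (grid_h * grid_w) // (2 ** 2)                    # 256 merged image tokens
position_delta = max(grid_h // 2, grid_w // 2) - token_num   # 16 - 256 = -240
grid_thwd = (1, grid_h // 2, grid_w // 2, position_delta)    # (1, 16, 16, -240)

The negative delta is what the decode path in Qwen2VLInferStateInfo adds back to the flat position ids, so text after the image continues 16 positions past the image start instead of 256.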
@@ -71,26 +61,25 @@ def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
         # <img></img> --> <img>id,id+1...id+num</img>
         input_ids = []
         image_id = 0
-        start_idx = 0
         while True:
             try:
-                start_idx = origin_ids.index(self.image_start_id, start_idx)
+                start_idx = origin_ids.index(self.image_start_id)
                 if start_idx + 1 >= len(origin_ids):
                     break
                 if origin_ids[start_idx + 1] == self.image_end_id:
                     input_ids.extend(origin_ids[: start_idx + 1])
                     token_id = multimodal_params.images[image_id].token_id
                     token_num = multimodal_params.images[image_id].token_num
+                    multimodal_params.images[image_id].start_idx = len(input_ids)
                     input_ids.extend(range(token_id, token_id + token_num))
                     input_ids.append(self.image_end_id)
                     origin_ids = origin_ids[start_idx + 2 :]
-                    start_idx = 0
                     image_id += 1
                 else:
                     raise ValueError("image token error")
             except ValueError:
                 break
-        input_ids.extend(origin_ids[start_idx:])
+        input_ids.extend(origin_ids)
         return input_ids


@@ -107,8 +96,7 @@ def __init__(self, kvargs):
         return

     def _init_inferstate_cls(self):
-        if get_env_start_args().enable_fa3:
-            self.infer_state_class = Qwen2VLFlashAttentionStateInfo
+        pass

     def _init_config(self):
         with open(os.path.join(self.weight_dir_, "config.json"), "r") as json_file:
