[fix] vit0331

sangchengmeng · shihaobai · commit 339d98e34434 · 2025-03-31T10:00:07.000Z
diff --git a/lightllm/models/internvl/internvl_visual.py b/lightllm/models/internvl/internvl_visual.py
@@ -8,6 +8,7 @@
 from torchvision import transforms as T
 from torchvision.transforms.functional import InterpolationMode
 from transformers import AutoModel, AutoTokenizer
+from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from io import BytesIO
 from lightllm.models.internvl.img_process import load_image
@@ -43,21 +44,21 @@ def load_model(self, weight_dir):
     def cuda(self):
         return self
 
-    def encode(self, image_uuids: List):
+    def encode(self, images: List[ImageItem]):
         img_tensors = []
         valid_ids = []
         valid_id = 0
         uuids = []
 
-        for i, url in enumerate(image_uuids):
-            if isinstance(url, int):
-                uuids.append(url)
-                image_data = read_shm(get_shm_name_data(url))
+        for i, img in enumerate(images):
+            if isinstance(img, ImageItem):
+                uuids.append(img.uuid)
+                image_data = read_shm(get_shm_name_data(img.uuid))
                 image_data = Image.open(BytesIO(image_data))
-                t = self.load_image_func(image_data)
+                t = self.load_image_func(image_data, max_num=img.extra_params["image_patch_max_num"])
                 img_tensors.append(t)
             else:
-                raise Exception("Unsupport input types: {} for {}".format(type(url), url))
+                raise Exception("Unsupport input types: {} for {}".format(type(img), img))
 
             cur_num = img_tensors[-1].shape[0]
             valid_ids.append([valid_id, valid_id + cur_num])
diff --git a/lightllm/models/internvl/model.py b/lightllm/models/internvl/model.py
@@ -40,9 +40,30 @@ def __init__(self, tokenizer, model_cfg, **kwargs):
         self.image_end_id = tokenizer.convert_tokens_to_ids(self.image_end_tag)
         self.get_image_patch_func = get_image_patch_func(kwargs["weight_dir"])
 
-    def get_image_token_length(self, img: ImageItem, max_num):
+    def init_imageItem_extral_params(self, img: ImageItem, num_images):
+        """Fill in img.extra_params["image_patch_max_num"] if it is unset.
+
+        A positive existing value is kept. Otherwise the value is picked from
+        num_images: 12 for one image, 6 for two to six images, 0 for more.
+        """
+        # A missing or None value counts as unset, like a non-positive one,
+        # instead of raising on the comparison with 0.
+        if (img.extra_params.get("image_patch_max_num") or 0) > 0:
+            return
+        if num_images == 1:
+            img.extra_params["image_patch_max_num"] = 12
+        elif 1 < num_images <= 6:
+            img.extra_params["image_patch_max_num"] = 6
+        elif num_images > 6:
+            img.extra_params["image_patch_max_num"] = 0
+
+    def get_image_token_length(self, img: ImageItem):
+        """Return the patch count for img multiplied by self.image_length."""
         return (
-            self.get_image_patch_func(img.image_w, img.image_h, max_num=max_num, use_thumbnail=True) * self.image_length
+            self.get_image_patch_func(
+                img.image_w, img.image_h, max_num=img.extra_params["image_patch_max_num"], use_thumbnail=True
+            )
+            * self.image_length
         )
 
     # only change the impl of the encode func:
diff --git a/lightllm/models/llava/llava_visual.py b/lightllm/models/llava/llava_visual.py
@@ -6,6 +6,7 @@
 from typing import List, Union
 from safetensors import safe_open
 from io import BytesIO
+from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.utils.log_utils import init_logger
 
@@ -123,16 +124,16 @@ def forward(self, x):
         x = x.view(B, L, -1)
         return x
 
-    def encode(self, image_uuids: List):
+    def encode(self, images: List[ImageItem]):
         img_tensors = []
         uuids = []
         valid_id = 0
         valid_ids = []
 
-        for i, item in enumerate(image_uuids):
-            if isinstance(item, int):
-                uuids.append(item)
-                image_data = read_shm(get_shm_name_data(item))
+        for i, img in enumerate(images):
+            if isinstance(img, ImageItem):
+                uuids.append(img.uuid)
+                image_data = read_shm(get_shm_name_data(img.uuid))
                 image_data = Image.open(BytesIO(image_data)).convert("RGB")
                 t = self.image_processor.preprocess(image_data, return_tensors="pt")["pixel_values"]
                 img_tensors.append(t)
diff --git a/lightllm/models/llava/model.py b/lightllm/models/llava/model.py
@@ -33,6 +33,9 @@ def __init__(self, tokenizer, model_cfg):
         self.image_length = (image_size // patch_size) ** 2
         self.skip_start = model_cfg.get("skip_start", True)
 
+    def init_imageItem_extral_params(self, img: ImageItem, num_images):
+        """No-op: no per-image extra params are set here."""
+
     def get_image_token_length(self, img: ImageItem):
         return self.image_length
 
diff --git a/lightllm/models/qwen2_vl/model.py b/lightllm/models/qwen2_vl/model.py
@@ -31,6 +31,9 @@ def __init__(self, tokenizer=None, image_processor=None, **kwargs):
         self.image_end_id = kwargs["model_cfg"]["vision_end_token_id"]
         self.image_token_id = kwargs["model_cfg"]["image_token_id"]
 
+    def init_imageItem_extral_params(self, img: ImageItem, num_images):
+        """No-op: no per-image extra params are set here."""
+
     def get_image_token_length(self, img: ImageItem):
         width = img.image_w
         height = img.image_h
diff --git a/lightllm/models/qwen2_vl/qwen2_visual.py b/lightllm/models/qwen2_vl/qwen2_visual.py
@@ -41,6 +41,7 @@
 from transformers import AutoProcessor
 from safetensors import safe_open
 from transformers.utils import TensorType
+from lightllm.server.multimodal_params import MultimodalParams,ImageItem
 from lightllm.models.qwen2_vl.vision_process import Qwen2VLImageProcessor
 
 
@@ -425,17 +426,17 @@ def load_model(self, weight_dir):
 
         self.load_state_dict(weight_dict)
 
-    def encode(self, image_uuids: List):
+    def encode(self, images: List[ImageItem]):
         img_tensors = []
         valid_ids = []
         valid_id = 0
         img_grids = []
         uuids = []
 
-        for i, url in enumerate(image_uuids):
-            if isinstance(url, int):
-                uuids.append(url)
-                image_data = read_shm(get_shm_name_data(url))
+        for i, img in enumerate(images):
+            if isinstance(img, ImageItem):
+                uuids.append(img.uuid)
+                image_data = read_shm(get_shm_name_data(img.uuid))
                 image_data = Image.open(BytesIO(image_data))
                 image_data = get_image(image_data)
                 image_inputs = self.processor.preprocess(images=image_data, return_tensors="pt")
diff --git a/lightllm/models/qwen_vl/model.py b/lightllm/models/qwen_vl/model.py
@@ -19,6 +19,9 @@ def __init__(self, tokenizer, model_cfg):
         # <imgpad>: 151859
         self.image_length = model_cfg["visual"].get("n_queries", 256)
 
+    def init_imageItem_extral_params(self, img: ImageItem, num_images):
+        """No-op: no per-image extra params are set here."""
+
     def _list_find(self, input_list, target, start_idx):
         cur_list = input_list[start_idx:]
         if target in cur_list:
diff --git a/lightllm/models/vit/model.py b/lightllm/models/vit/model.py
@@ -7,6 +7,7 @@
 from lightllm.models.vit.layer_weights.pre_and_post_layer_weight import ViTPreAndPostLayerWeight
 from lightllm.models.vit.layer_weights.transformer_layer_weight import ViTTransformerLayerWeight
 from lightllm.models.vit.layer_weights.hf_load_utils import load_hf_weights
+from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.common.build_utils import repair_config
 from lightllm.utils.log_utils import init_logger
 from lightllm.models.vit import get_load_image_func
@@ -135,21 +136,20 @@ def forward(self, pixel_values):
         return input_embs
 
     @torch.no_grad()
-    def encode(self, image_uuids: List, max_num_list: List):
+    def encode(self, images: List[ImageItem]):
         img_tensors = []
         valid_ids = []
         valid_id = 0
         uuids = []
-        for i, url in enumerate(image_uuids):
-            if isinstance(url, int):
-                uuids.append(url)
-                image_data = read_shm(get_shm_name_data(url))
+        for i, img in enumerate(images):
+            if isinstance(img, ImageItem):
+                uuids.append(img.uuid)
+                image_data = read_shm(get_shm_name_data(img.uuid))
                 image_data = Image.open(BytesIO(image_data))
-                max_num = max_num_list[i]
-                t = self.load_image_func(image_data, max_num=max_num)
+                t = self.load_image_func(image_data, max_num=img.extra_params["image_patch_max_num"])
                 img_tensors.append(t)
             else:
-                raise Exception("Unsupport input types: {} for {}".format(type(url), url))
+                raise Exception("Unsupport input types: {} for {}".format(type(img), img))
 
             cur_num = img_tensors[-1].shape[0]
             valid_ids.append([valid_id, valid_id + cur_num])
@@ -160,7 +160,6 @@ def encode(self, image_uuids: List, max_num_list: List):
 
         imgs = torch.cat(img_tensors, dim=0)
         pixel_values = imgs.cuda().to(dtype=self.data_type)
-        print(pixel_values.shape, pixel_values.dtype)
         all_img_embeds = self.forward(pixel_values)
         return all_img_embeds, uuids, valid_ids
 
diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py
@@ -14,7 +14,6 @@
 class Record(object):
     id: int
     md5sum: str
-    max_num: int
     ref: int
     data: bool
     embed: bool
@@ -70,14 +69,11 @@ def alloc(self, md5sum: str, token_num: int) -> dict:
                     self._clear()
                     if self.occupied >= self.capacity:
                         return None
-                _, max_num_str = md5sum.rsplit("_", 1)
-                max_num = int(max_num_str)
                 id = uuid.uuid1()
                 id = id.int
                 record = Record(
                     id=id,
                     md5sum=md5sum,
-                    max_num=max_num,
                     ref=1,
                     data=False,
                     embed=False,
@@ -113,7 +109,4 @@ def set_item_embed(self, id: int) -> None:
         self._records[id].embed = True
 
     def get_item_embed(self, id: int) -> bool:
-        return self._records[id].embed
-
-    def get_max_num(self, id: int) -> int:
-        return self._records[id].max_num
+        return self._records[id].embed
diff --git a/lightllm/server/embed_cache/manager.py b/lightllm/server/embed_cache/manager.py
@@ -48,10 +48,6 @@ def exposed_get_item_embed(self, id: int) -> bool:
         id = obtain(id)
         return self._impl.get_item_embed(id=id)
 
-    def exposed_get_max_num(self, id: int) -> int:
-        id = obtain(id)
-        return self._impl.get_max_num(id=id)
-
 
 def start_cache_manager(port: int, args, pipe_writer):
     # 注册graceful 退出的处理
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
@@ -9,6 +9,7 @@
 import hashlib
 import datetime
 import websockets
+from frozendict import frozendict
 import pickle
 import ujson as json
 import multiprocessing
@@ -107,11 +108,12 @@ def __init__(
         return
 
     # connect cache server, calculate md5, alloc resource, return uuid
-    async def _alloc_resource(self, data, num, max_num):
-        md5sum = hashlib.md5(data).hexdigest() + "_" + str(max_num)
+    async def _alloc_resource(self, img:ImageItem, num_tokens):
+        data = img.read()
+        md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
         wait_time = 1
         while True:
-            record = self.cache_client.root.alloc(md5sum, num)
+            record = self.cache_client.root.alloc(md5sum, num_tokens)
             # hit or new
             if record:
                 uid = record["id"]
@@ -127,10 +129,12 @@ async def _alloc_resource(self, data, num, max_num):
     async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams):
         # 只有 P 和 NORMAL 节点需要真的管理多模态资源
         if self.pd_mode.is_P_or_NORMAL():
-            max_num = multimodal_params.max_num
+            num_images = len(multimodal_params.images)
             for img in multimodal_params.images:
+                self.tokenizer.init_imageItem_extral_params(img, num_images)
+                num_tokens = self.tokenizer.get_image_token_length(img)
                 record = await self._alloc_resource(
-                    img.read(), self.tokenizer.get_image_token_length(img, max_num), max_num
+                    img, num_tokens
                 )
                 img.uuid = record["id"]
                 img.token_id = record["token_id"]
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
@@ -8,6 +8,7 @@
 
 
 class ImageItem:
+    
     def __init__(self, **kwargs):
         self._type = kwargs["type"]
         self._data = kwargs["data"]
@@ -21,6 +22,7 @@ def __init__(self, **kwargs):
         self.image_h = 0
 
         self._preload_data = None
+        self.extra_params = {"image_patch_max_num": kwargs.get("max_num", None)}
 
     def preload(self):
         try:
@@ -76,18 +78,8 @@ def __init__(
     ) -> None:
         self.images = [ImageItem(**i) for i in images]
         max_num = int(os.getenv("MAX_PATCH_NUM", max_num))
-        if max_num > 0:
-            self.max_num = max_num
-            return
-        num_image = len(self.images)
-        if num_image == 1:
-            max_num = 12
-        elif num_image > 1 and num_image <= 6:
-            max_num = 6
-        elif num_image > 6:
-            max_num = 0
-        self.max_num = max_num
-        return
+        for image in self.images:
+            image.extra_params["image_patch_max_num"] = max_num
 
     def verify_and_preload(self):
         for image in self.images:
diff --git a/lightllm/server/visualserver/manager.py b/lightllm/server/visualserver/manager.py
@@ -117,27 +117,27 @@ async def loop_for_fwd(self):
                     multimodal_params = group_req_indexes.multimodal_params
 
                     for img in multimodal_params.images:
-                        if not self.cache_client.root.get_item_embed(img.uuid):
-                            uuids_need_infer.append(img.uuid)
+                        # if not self.cache_client.root.get_item_embed(img.uuid):
+                        #     uuids_need_infer.append(img.uuid)
 
-                        if len(uuids_need_infer) == self.infer_batch_size:
-                            await self.infer_imgs(uuids_need_infer)
-                            uuids_need_infer = []
+                        if len(multimodal_params.images) == self.infer_batch_size:
+                            await self.infer_imgs(multimodal_params.images)
+                            # uuids_need_infer = []
                             for _group_req_indexes in processing_group_reqs:
                                 self.send_to_router.send_pyobj(_group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
                             processing_group_reqs = []
 
-                    if len(uuids_need_infer) == 0:
+                    if len(multimodal_params.images) == 0:
                         self.send_to_router.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
                     else:
                         processing_group_reqs.append(group_req_indexes)
 
-                if len(uuids_need_infer) > 0:
-                    await self.infer_imgs(uuids_need_infer)
+                if len(multimodal_params.images) > 0:
+                    await self.infer_imgs(multimodal_params.images)
                     for _group_req_indexes in processing_group_reqs:
                         self.send_to_router.send_pyobj(_group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
                     processing_group_reqs = []
-                    uuids_need_infer = []
+                    # uuids_need_infer = []
 
     async def loop_for_netio_req(self):
         while True:
diff --git a/lightllm/server/visualserver/model_infer/model_rpc.py b/lightllm/server/visualserver/model_infer/model_rpc.py
@@ -12,6 +12,7 @@
 from lightllm.models.llava.llava_visual import LlavaVisionModel
 from lightllm.models.internvl.internvl_visual import InternVLVisionModel
 from lightllm.models.vit.model import VisionTransformer
+from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.models.qwen2_vl.qwen2_visual import Qwen2VisionTransformerPretrainedModel
 from lightllm.server.embed_cache.utils import tensor2bytes, read_shm, create_shm, get_shm_name_data, get_shm_name_embed
 from lightllm.utils.infer_utils import set_random_seed
@@ -74,18 +75,14 @@ def exposed_init_model(self, kvargs):
 
     # @calculate_time(show=True, min_cost_ms=150)
     @torch.no_grad()
-    def forward(self, images_uuids):
-        max_num_list = []
-        for i in range(len(images_uuids)):
-            uid = images_uuids[i]
-            max_num_list.append(self.cache_client.root.get_max_num(uid))
-        return self.model.encode(images_uuids, max_num_list)
+    def forward(self, images: List[ImageItem]):
+        """Encode the given image items with self.model and return its result."""
+        return self.model.encode(images)
 
     # @calculate_time(show=False, min_cost_ms=300)
-    def exposed_encode(self, images_uuids):
-        images_uuids = obtain(images_uuids)
-
-        all_img_embeds, uuids, valid_ids = self.forward(images_uuids)
+    def exposed_encode(self, images:List[ImageItem]):
+        images = obtain(images)
+        all_img_embeds, uuids, valid_ids = self.forward(images)
         all_img_embeds = all_img_embeds.to(torch.device("cpu"))
         if self.tp_rank_id == 0:
             for i in range(len(uuids)):
@@ -132,8 +128,8 @@ async def init_model(self, kvargs):
         else:
             return
 
-    async def encode(self, uuids):
-        ans = self._encode(uuids)
+    async def encode(self, images:List[ImageItem]):
+        ans = self._encode(images)
         if self.use_rpc:
             return await ans
         else: