Commit edb87de

Author: sangchengmeng
Message: 0401fix
Parent: 339d98e

13 files changed: +67 −54 lines

lightllm/models/internvl/model.py

Lines changed: 19 additions & 9 deletions

@@ -11,6 +11,7 @@
     InternVLLlamaPreAndPostLayerWeight,
     InternVLPhi3PreAndPostLayerWeight,
 )
+from lightllm.server.core.objs import SamplingParams
 from lightllm.models.internvl.layer_weights.pre_and_post_layer_weight import InternVLInternlm2PreAndPostLayerWeight
 from lightllm.models.llava.llava_visual import LlavaVisionModel
 
@@ -40,20 +41,29 @@ def __init__(self, tokenizer, model_cfg, **kwargs):
         self.image_end_id = tokenizer.convert_tokens_to_ids(self.image_end_tag)
         self.get_image_patch_func = get_image_patch_func(kwargs["weight_dir"])
 
-    def init_imageItem_extral_params(self, img: ImageItem, num_images):
-        if img.extra_params["image_patch_max_num"] > 0:
+    def init_imageItem_extral_params(self, img: ImageItem, multi_params: MultimodalParams, image_max_patch_num: int):
+        if image_max_patch_num >= 0:
+            img.extra_params["image_patch_max_num"] = image_max_patch_num
             return
-        if num_images == 1:
-            img.extra_params["image_patch_max_num"] = 12
-        elif num_images > 1 and num_images <= 6:
-            img.extra_params["image_patch_max_num"] = 6
-        elif num_images > 6:
-            img.extra_params["image_patch_max_num"] = 0
+        elif os.getenv("MAX_PATCH_NUM"):
+            img.extra_params["image_patch_max_num"] = int(os.getenv("MAX_PATCH_NUM"))
+            return
+        else:
+            num_images = len(multi_params.images)
+            if num_images == 1:
+                img.extra_params["image_patch_max_num"] = 12
+            elif num_images > 1 and num_images <= 6:
+                img.extra_params["image_patch_max_num"] = 6
+            elif num_images > 6:
+                img.extra_params["image_patch_max_num"] = 0
         return
 
     def get_image_token_length(self, img: ImageItem):
         return (
-            self.get_image_patch_func(img.image_w, img.image_h, max_num=img.extra_params["image_patch_max_num"], use_thumbnail=True) * self.image_length
+            self.get_image_patch_func(
+                img.image_w, img.image_h, max_num=img.extra_params["image_patch_max_num"], use_thumbnail=True
+            )
+            * self.image_length
         )
 
     # only change the impl of the encode func:
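The resolution order can be read directly from the hunk above: an explicit per-request cap (image_max_patch_num >= 0) wins, then the MAX_PATCH_NUM environment variable, and only then the image-count heuristic. A minimal standalone sketch of that priority chain; the function name resolve_patch_max is a hypothetical stand-in, not a lightllm API:

import os

def resolve_patch_max(image_max_patch_num: int, num_images: int) -> int:
    # 1. explicit per-request sampling param (-1 means "unset")
    if image_max_patch_num >= 0:
        return image_max_patch_num
    # 2. process-wide override via environment variable
    if os.getenv("MAX_PATCH_NUM"):
        return int(os.getenv("MAX_PATCH_NUM"))
    # 3. heuristic: more images per request -> fewer patches per image
    if num_images == 1:
        return 12
    elif 1 < num_images <= 6:
        return 6
    else:
        return 0

# assuming MAX_PATCH_NUM is unset in the environment:
assert resolve_patch_max(-1, 1) == 12
assert resolve_patch_max(4, 1) == 4  # the request param overrides the heuristic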

lightllm/models/llava/llava_visual.py

Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ def encode(self, images: List[ImageItem]):
                 t = self.image_processor.preprocess(image_data, return_tensors="pt")["pixel_values"]
                 img_tensors.append(t)
             else:
-                raise Exception("Unsupport input types: {} for {}".format(type(item), item))
+                raise Exception("Unsupport input types: {} for {}".format(type(img), img))
 
             cur_num = img_tensors[-1].shape[0]
             valid_ids.append([valid_id, valid_id + cur_num])
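This one-line fix matters more than it looks: item is not defined in encode's scope, so the error branch itself would crash with a NameError instead of raising the intended message. The same bug (an undefined url) is fixed in qwen2_visual.py below. A tiny repro of the failure mode, as a hypothetical function rather than lightllm code:

def broken(img):
    # `item` is undefined here: this line raises NameError before the
    # intended Exception can even be constructed.
    raise Exception("Unsupport input types: {} for {}".format(type(item), item))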

lightllm/models/llava/model.py

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@ def __init__(self, tokenizer, model_cfg):
 
     def init_imageItem_extral_params(self, img: ImageItem, num_images):
         return
+
     def get_image_token_length(self, img: ImageItem):
         return self.image_length
 

lightllm/models/qwen2_vl/model.py

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ def __init__(self, tokenizer=None, image_processor=None, **kwargs):
 
     def init_imageItem_extral_params(self, img: ImageItem, num_images):
         return
-
+
     def get_image_token_length(self, img: ImageItem):
         width = img.image_w
         height = img.image_h

(The - and + lines are both blank; this is a whitespace-only change, e.g. trailing spaces removed.)

lightllm/models/qwen2_vl/qwen2_visual.py

Lines changed: 2 additions & 2 deletions

@@ -41,7 +41,7 @@
 from transformers import AutoProcessor
 from safetensors import safe_open
 from transformers.utils import TensorType
-from lightllm.server.multimodal_params import MultimodalParams,ImageItem
+from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.models.qwen2_vl.vision_process import Qwen2VLImageProcessor
 
 
@@ -445,7 +445,7 @@ def encode(self, images: List[ImageItem]):
                 img_tensors.append(pixel_values)
                 img_grids.append(image_grid_thw)
             else:
-                raise Exception("Unsupport input types: {} for {}".format(type(url), url))
+                raise Exception("Unsupport input types: {} for {}".format(type(img), img))
 
             # must devide merge_length
             cur_num = img_tensors[-1].shape[0] // (self.spatial_merge_size ** 2)

lightllm/models/vit/triton_kernel/flashattention_nopad.py

Lines changed: 1 addition & 0 deletions

@@ -204,6 +204,7 @@ def flash_attention_v3_fwd(
 
 except ImportError:
     print("Failed to import _flash_attn_forward from hopper.flash_attn_interface.")
+    _flash_attn_v3_available = False
 
 
 def flash_attention_fwd(q, k, v, o):
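The added line completes a standard optional-dependency guard: without it, a failed import would leave _flash_attn_v3_available undefined (or stuck at a stale value) and any caller branching on it would crash. A minimal sketch of the pattern, assuming (not shown in this diff) that the try block sets the flag to True on success; flash_attention_v3_fwd and the Triton fallback name are stand-ins for functions defined elsewhere in the file:

try:
    from hopper.flash_attn_interface import _flash_attn_forward  # optional Hopper dependency
    _flash_attn_v3_available = True  # assumed success path; not shown in the diff
except ImportError:
    print("Failed to import _flash_attn_forward from hopper.flash_attn_interface.")
    _flash_attn_v3_available = False  # the line this commit adds

def flash_attention_fwd(q, k, v, o):
    if _flash_attn_v3_available:
        # Hopper (FA3) fast path, only valid when the import succeeded
        return flash_attention_v3_fwd(q, k, v, o)
    # otherwise fall back to the Triton kernel implementation
    return triton_flash_attention_fwd(q, k, v, o)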

lightllm/server/core/objs/py_sampling_params.py

Lines changed: 3 additions & 0 deletions

@@ -37,6 +37,7 @@ def __init__(
         top_p: float = None,
         top_k: int = None,  # -1 is for all
         ignore_eos: bool = False,
+        image_max_patch_num: int = -1,
         max_new_tokens: int = 16,
         min_new_tokens: int = 1,
         stop_sequences: Optional[Union[str, List[str], List[List[int]]]] = None,  # stop-sequence condition
@@ -75,6 +76,7 @@ def __init__(
         self.top_p = top_p if top_p is not None else SamplingParams._top_p
         self.top_k = top_k if top_k is not None else SamplingParams._top_k
         self.ignore_eos = ignore_eos
+        self.image_max_patch_num = image_max_patch_num
         self.max_new_tokens = max_new_tokens
         self.min_new_tokens = min_new_tokens
         self.stop_sequences = stop_sequences if stop_sequences is not None else SamplingParams._stop_sequences
@@ -254,6 +256,7 @@ def to_dict(self):
         ret["temperature"] = self.temperature
         ret["top_p"] = self.top_p
         ret["top_k"] = self.top_k
+        ret["image_max_patch_num"] = self.image_max_patch_num
         ret["min_new_tokens"] = self.min_new_tokens
         ret["ignore_eos"] = self.ignore_eos
         ret["max_new_tokens"] = self.max_new_tokens

lightllm/server/core/objs/sampling_params.py

Lines changed: 3 additions & 0 deletions

@@ -249,6 +249,7 @@ class SamplingParams(ctypes.Structure):
         ("top_p", ctypes.c_float),
         ("top_k", ctypes.c_int),
         ("ignore_eos", ctypes.c_bool),
+        ("image_max_patch_num", ctypes.c_int),
         ("max_new_tokens", ctypes.c_int),
         ("min_new_tokens", ctypes.c_int),
         # Whether to count input tokens for presence_penalty, frequency_penalty and repetition_penalty
@@ -294,6 +295,7 @@ def init(self, tokenizer, **kwargs):
         self.top_p = kwargs.get("top_p", SamplingParams._top_p)
         self.top_k = kwargs.get("top_k", SamplingParams._top_k)
         self.ignore_eos = kwargs.get("ignore_eos", False)
+        self.image_max_patch_num = kwargs.get("image_max_patch_num", -1)
         self.max_new_tokens = kwargs.get("max_new_tokens", 16)
         self.min_new_tokens = kwargs.get("min_new_tokens", 1)
         self.input_penalty = kwargs.get("input_penalty", DEFAULT_INPUT_PENALTY)
@@ -424,6 +426,7 @@ def to_dict(self):
             "top_p": self.top_p,
             "top_k": self.top_k,
             "ignore_eos": self.ignore_eos,
+            "image_max_patch_num": self.image_max_patch_num,
             "max_new_tokens": self.max_new_tokens,
             "min_new_tokens": self.min_new_tokens,
             "exponential_decay_length_penalty": self.exponential_decay_length_penalty.to_tuple(),

lightllm/server/embed_cache/impl/naive_memory_cache.py

Lines changed: 1 addition & 1 deletion

@@ -109,4 +109,4 @@ def set_item_embed(self, id: int) -> None:
         self._records[id].embed = True
 
     def get_item_embed(self, id: int) -> bool:
-        return self._records[id].embed
+        return self._records[id].embed

(The deleted and added lines are textually identical; the change is most likely adding a missing newline at end of file.)

lightllm/server/httpserver/manager.py

Lines changed: 14 additions & 15 deletions

@@ -108,8 +108,9 @@ def __init__(
         return
 
     # connect cache server, calculate md5, alloc resource, return uuid
-    async def _alloc_resource(self, img:ImageItem, num_tokens):
+    async def _alloc_resource(self, img: ImageItem):
         data = img.read()
+        num_tokens = self.tokenizer.get_image_token_length(img)
         md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
         wait_time = 1
         while True:
@@ -126,16 +127,12 @@ async def _alloc_resource(self, img:ImageItem, num_tokens):
             await asyncio.sleep(wait_time)
             wait_time = min(wait_time + 2, 9)
 
-    async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams):
+    async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, image_max_patch_num):
         # only P and NORMAL nodes really need to manage multimodal resources
         if self.pd_mode.is_P_or_NORMAL():
-            num_images = len(multimodal_params.images)
             for img in multimodal_params.images:
-                self.tokenizer.init_imageItem_extral_params(img, num_images)
-                num_tokens = self.tokenizer.get_image_token_length(img)
-                record = await self._alloc_resource(
-                    img, num_tokens
-                )
+                self.tokenizer.init_imageItem_extral_params(img, multimodal_params, image_max_patch_num)
+                record = await self._alloc_resource(img)
                 img.uuid = record["id"]
                 img.token_id = record["token_id"]
                 img.token_num = record["token_num"]
@@ -234,9 +231,7 @@ async def generate(
         await self._log_req_header(request_headers, group_request_id)
         # monitoring
 
-        prompt_ids = await self._encode(
-            prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
-        )
+        prompt_ids = await self._encode(prompt, multimodal_params, sampling_params)
         prompt_tokens = len(prompt_ids)
         # monitoring
         if group_request_id > 0:
@@ -307,15 +302,19 @@ async def _log_req_header(self, request_headers, group_request_id: int):
         return
 
     async def _encode(
-        self, prompt: Union[str, List[int]], multimodal_params: MultimodalParams, add_special_tokens: bool
+        self, prompt: Union[str, List[int]], multimodal_params: MultimodalParams, sampling_params: SamplingParams
     ):
         if isinstance(prompt, str):
            if self.enable_multimodal:
                 assert len(multimodal_params.images) <= self.args.cache_capacity, "too many images!"
-                await self._alloc_multimodal_resources(multimodal_params)
-                prompt_ids = self.tokenizer.encode(prompt, multimodal_params, add_special_tokens=add_special_tokens)
+                await self._alloc_multimodal_resources(
+                    multimodal_params, image_max_patch_num=sampling_params.image_max_patch_num
+                )
+                prompt_ids = self.tokenizer.encode(
+                    prompt, multimodal_params, add_special_tokens=sampling_params.add_special_tokens
+                )
             else:
-                prompt_ids = self.tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
+                prompt_ids = self.tokenizer.encode(prompt, add_special_tokens=sampling_params.add_special_tokens)
         return prompt_ids
 
     # the validation here is not very thorough for multimodal, to do
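Taken together, the manager changes reroute the data flow: _encode now receives the whole SamplingParams object instead of a bare add_special_tokens flag, so the per-request patch cap can ride along, and token counting moves inside _alloc_resource. A condensed view of the reworked call chain, with method bodies elided and indentation showing caller to callee:

generate(...)
  └─ _encode(prompt, multimodal_params, sampling_params)
       └─ _alloc_multimodal_resources(multimodal_params,
              image_max_patch_num=sampling_params.image_max_patch_num)
            └─ for each img:
                   tokenizer.init_imageItem_extral_params(img, multimodal_params, image_max_patch_num)
                   _alloc_resource(img)  # now computes num_tokens itself via
                                         # tokenizer.get_image_token_length(img)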
