
Commit a5f188f

SangChengC and wangzaijun authored
[add] add skip image cache and disable_prompt_cache para (#1061)
Co-authored-by: wangzaijun <[email protected]>
1 parent 26ea376 commit a5f188f

6 files changed: +23 −12 lines changed
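The commit threads a new per-request boolean, disable_prompt_cache, through the sampling parameters and uses it to bypass both the radix prompt cache and the image/audio embed caches. A minimal usage sketch follows; the host and the /generate endpoint path are assumptions and not part of this commit, but the "inputs"/"parameters" payload shape mirrors the request dict visible in the removed _to_req_info helper below.

import requests

# Assumed local deployment address; adjust host/port as needed.
payload = {
    "inputs": "What is AI?",
    "parameters": {
        "max_new_tokens": 64,
        # New in this commit: compute the request from scratch, bypassing the
        # radix prompt cache and the image/audio embed caches.
        "disable_prompt_cache": True,
    },
}
resp = requests.post("http://localhost:8000/generate", json=payload)
print(resp.json())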

lightllm/server/audioserver/manager.py

Lines changed: 6 additions & 1 deletion

@@ -86,6 +86,7 @@ async def loop_for_fwd(self):
             while len(self.waiting_reqs) > 0:
                 group_req_indexes = self.waiting_reqs.pop(0)
                 shm_req = self.shm_req_manager.get_req_obj_by_index(group_req_indexes.shm_req_indexes[0])
+                disable_prompt_cache = shm_req.sample_params.disable_prompt_cache
                 is_aborted = shm_req.is_aborted
                 self.shm_req_manager.put_back_req_obj(shm_req)
                 if is_aborted:
@@ -98,7 +99,11 @@ async def loop_for_fwd(self):
                 multimodal_params = group_req_indexes.multimodal_params

                 audio_uuids = [audio.uuid for audio in multimodal_params.audios]
-                ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids))
+                # disable_prompt_cache is usually used for testing, so the audio cache must be bypassed as well
+                if disable_prompt_cache:
+                    ready_audio = [False] * len(audio_uuids)
+                else:
+                    ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids))

                 for audio, ready in zip(multimodal_params.audios, ready_audio):
                     if not ready:

lightllm/server/core/objs/sampling_params.py

Lines changed: 3 additions & 0 deletions

@@ -320,6 +320,7 @@ class SamplingParams(ctypes.Structure):
            ctypes.c_bool,
        ),  # whether to add spaces between special tokens when decoding
        ("print_eos_token", ctypes.c_bool),  # eos_id will be always ignored except the value is set to True
+        ("disable_prompt_cache", ctypes.c_bool),  # whether to disable prompt cache
    ]

    _do_sample: bool = False
@@ -350,6 +351,7 @@ def init(self, tokenizer, **kwargs):
        self.suggested_dp_index = kwargs.get("suggested_dp_index", -1)

        self.skip_special_tokens = kwargs.get("skip_special_tokens", SKIP_SPECIAL_TOKENS)
+        self.disable_prompt_cache = kwargs.get("disable_prompt_cache", False)

        self.add_special_tokens = kwargs.get("add_special_tokens", True)
        self.add_spaces_between_special_tokens = kwargs.get("add_spaces_between_special_tokens", True)
@@ -494,6 +496,7 @@ def to_dict(self):
            "add_special_tokens": self.add_special_tokens,
            "add_spaces_between_special_tokens": self.add_spaces_between_special_tokens,
            "print_eos_token": self.print_eos_token,
+            "disable_prompt_cache": self.disable_prompt_cache,
        }

    def to_origin_dict(self):

lightllm/server/httpserver_for_pd_master/manager.py

Lines changed: 3 additions & 10 deletions

@@ -144,16 +144,6 @@ async def _log_req_header(self, request: Request, group_request_id: int):
        )
        return

-    async def _to_req_info(
-        self, prompt: Union[str, List[int]], sampling_params: SamplingParams, multimodal_params: MultimodalParams
-    ):
-        req = {
-            "inputs": prompt,
-            "parameters": sampling_params.to_origin_dict(),
-            "multimodal_params": multimodal_params.to_origin_dict(),
-        }
-        return req
-
    async def fetch_stream(
        self,
        p_node: PD_Client_Obj,
@@ -323,6 +313,9 @@ async def _wait_to_token_package(
        multimodal_params: MultimodalParams,
        request: Request,
    ):
+        if sampling_params.disable_prompt_cache:
+            assert False, "pd mode dont support set disable_prompt_cache to True"
+
        out_token_counter = 0
        first_token_cost_ms = float("inf")
        group_request_id = sampling_params.group_request_id

lightllm/server/router/model_infer/infer_batch.py

Lines changed: 3 additions & 0 deletions

@@ -238,6 +238,7 @@ def __init__(
        vocab_size: int,
    ) -> None:
        self.shm_param = shm_req.sample_params
+        self.disable_prompt_cache = self.shm_param.disable_prompt_cache
        if self.shm_param.top_k == -1:
            self.shm_param.top_k = vocab_size

@@ -358,6 +359,8 @@ def _init_all_state(self):
        return

    def _match_radix_cache(self):
+        if self.sampling_param.disable_prompt_cache:
+            return
        if g_infer_context.radix_cache is not None and self.get_cur_total_len() > 1 and self.cur_kv_len == 0:
            input_token_ids = self.shm_req.shm_prompt_ids.arr[0 : self.get_cur_total_len()]
            key = torch.tensor(input_token_ids, dtype=torch.int64, device="cpu")

lightllm/server/visualserver/manager.py

Lines changed: 6 additions & 1 deletion

@@ -113,6 +113,7 @@ async def loop_for_fwd(self):
                group_req_indexes = self.waiting_reqs.pop(0)
                shm_req = self.shm_req_manager.get_req_obj_by_index(group_req_indexes.shm_req_indexes[0])
                is_aborted = shm_req.is_aborted
+                disable_prompt_cache = shm_req.sample_params.disable_prompt_cache
                self.shm_req_manager.put_back_req_obj(shm_req)
                if is_aborted:
                    # requests aborted because the connection dropped still need to be forwarded to downstream modules for processing
@@ -124,7 +125,11 @@ async def loop_for_fwd(self):
                multimodal_params = group_req_indexes.multimodal_params

                img_uuids = [img.uuid for img in multimodal_params.images]
-                ready_image = obtain(self.cache_client.root.get_items_embed(img_uuids))
+                # disable_prompt_cache is usually used for testing, so the image cache must be bypassed as well
+                if disable_prompt_cache:
+                    ready_image = [False] * len(img_uuids)
+                else:
+                    ready_image = obtain(self.cache_client.root.get_items_embed(img_uuids))

                for img, ready in zip(multimodal_params.images, ready_image):
                    if not ready:
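The in-diff comments note that disable_prompt_cache is mainly a testing aid: it keeps repeated requests from being served out of cached prefixes or cached image/audio embeddings. A hedged sketch of that testing scenario follows; the host, endpoint path, and prompt are placeholders, not part of this commit.

import time
import requests

URL = "http://localhost:8000/generate"  # placeholder deployment address

def timed_generate(disable_prompt_cache: bool) -> float:
    # Send the same request and measure wall-clock latency of the full response.
    payload = {
        "inputs": "Describe the picture.",
        "parameters": {"max_new_tokens": 32, "disable_prompt_cache": disable_prompt_cache},
    }
    start = time.time()
    requests.post(URL, json=payload).raise_for_status()
    return time.time() - start

timed_generate(False)  # warm the prompt and embed caches
print("cached run:  ", timed_generate(False))
print("uncached run:", timed_generate(True))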

lightllm/utils/shm_size_check.py

Lines changed: 2 additions & 0 deletions

@@ -117,6 +117,8 @@ def _get_recommended_shm_size_gb(args, max_image_resolution=(3940, 2160), dtype_
    )
    fake_image_item.image_w = fake_image_item._data[0]
    fake_image_item.image_h = fake_image_item._data[1]
+    # for internvl model shm check
+    fake_image_item.extra_params["image_patch_max_num"] = 12
    max_image_tokens = tokenizer.get_image_token_length(fake_image_item)

    # estimate the resources needed for image tokens
