
Commit 1fe1574

[add]add disable_prompt_cache parameters
1 parent 943bba5 commit 1fe1574

5 files changed, +16 -5 lines changed

lightllm/server/core/objs/sampling_params.py

Lines changed: 3 additions & 0 deletions

@@ -307,6 +307,7 @@ class SamplingParams(ctypes.Structure):
             ctypes.c_bool,
         ),  # whether to add spaces between special tokens when decoding
         ("print_eos_token", ctypes.c_bool),  # eos_id will be always ignored except the value is set to True
+        ("disable_prompt_cache", ctypes.c_bool),  # whether to disable the prompt (radix) cache for this request
     ]

     _do_sample: bool = False

@@ -337,6 +338,7 @@ def init(self, tokenizer, **kwargs):
         self.suggested_dp_index = kwargs.get("suggested_dp_index", -1)

         self.skip_special_tokens = kwargs.get("skip_special_tokens", SKIP_SPECIAL_TOKENS)
+        self.disable_prompt_cache = kwargs.get("disable_prompt_cache", False)

         self.add_special_tokens = kwargs.get("add_special_tokens", True)
         self.add_spaces_between_special_tokens = kwargs.get("add_spaces_between_special_tokens", True)

@@ -477,6 +479,7 @@ def to_dict(self):
             "add_special_tokens": self.add_special_tokens,
             "add_spaces_between_special_tokens": self.add_spaces_between_special_tokens,
             "print_eos_token": self.print_eos_token,
+            "disable_prompt_cache": self.disable_prompt_cache,
         }

     def to_origin_dict(self):
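For context, a minimal sketch of how the new flag can reach SamplingParams.init, which reads it from the request kwargs with a default of False. The build_params helper and its tokenizer argument are illustrative placeholders, not part of the commit:

# Minimal usage sketch (not part of the commit): build_params and its tokenizer
# argument are hypothetical; only SamplingParams and its init() kwargs come
# from the diff above.
from lightllm.server.core.objs.sampling_params import SamplingParams

def build_params(tokenizer):
    params = SamplingParams()
    # init() pulls "disable_prompt_cache" from kwargs, defaulting to False,
    # so a request can opt out of the prompt cache per call.
    params.init(tokenizer, disable_prompt_cache=True)
    return params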

lightllm/server/router/model_infer/infer_batch.py

Lines changed: 8 additions & 2 deletions

@@ -86,7 +86,7 @@ def add_reqs(self, requests: List[Tuple[int, int, Any, int]], init_prefix_cache:
         return req_objs

     def free_a_req_mem(self, free_token_index: List, req: "InferReq", is_group_finished: bool):
-        if self.radix_cache is None:
+        if self.radix_cache is None or req.sampling_param.disable_prompt_cache:
             if is_group_finished:
                 free_token_index.append(self.req_manager.req_to_token_indexs[req.req_idx][0 : req.cur_kv_len])
             else:

@@ -236,6 +236,7 @@ def __init__(
         vocab_size: int,
     ) -> None:
         self.shm_param = shm_req.sample_params
+        self.disable_prompt_cache = self.shm_param.disable_prompt_cache
         if self.shm_param.top_k == -1:
             self.shm_param.top_k = vocab_size

@@ -308,7 +309,7 @@ def __init__(
         self.mtp_step: int = get_env_start_args().mtp_step

         self._init_all_state()
-        if init_prefix_cache:
+        if init_prefix_cache and not self.sampling_param.disable_prompt_cache:
             self._match_radix_cache()
         return

@@ -335,6 +336,11 @@ def _init_all_state(self):
         return

     def _match_radix_cache(self):
+        if self.sampling_param.disable_prompt_cache:
+            self.shared_kv_node = None
+            self.shm_req.prompt_cache_len = 0
+            self.shm_req.shm_cur_kv_len = self.cur_kv_len
+            return
         if g_infer_context.radix_cache is not None and self.get_cur_total_len() > 1 and self.cur_kv_len == 0:
             input_token_ids = self.shm_req.shm_prompt_ids.arr[0 : self.get_cur_total_len()]
             key = torch.tensor(input_token_ids, dtype=torch.int64, device="cpu")
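Taken together, these guards mean a request flagged with disable_prompt_cache neither matches against the shared radix cache nor writes its KV back into it. A self-contained toy of that decision flow follows; nothing here is a lightllm API, only an illustration of the policy:

# Toy, self-contained sketch of the disable_prompt_cache decision; all names
# here are illustrative stand-ins, not lightllm types.
class ToyRadixCache:
    def __init__(self):
        self.store = {}                                   # prompt tuple -> cached prefix length
    def match(self, prompt_ids):
        return self.store.get(tuple(prompt_ids), 0)
    def insert(self, prompt_ids):
        self.store[tuple(prompt_ids)] = len(prompt_ids)

def reusable_prefix_len(radix_cache, prompt_ids, disable_prompt_cache):
    use_cache = radix_cache is not None and not disable_prompt_cache
    if not use_cache:
        return 0                                          # recompute the whole prompt; KV is freed directly later
    matched = radix_cache.match(prompt_ids)
    radix_cache.insert(prompt_ids)                        # make this prompt reusable for later requests
    return matched

cache = ToyRadixCache()
print(reusable_prefix_len(cache, [1, 2, 3], disable_prompt_cache=False))  # 0 (first time seen)
print(reusable_prefix_len(cache, [1, 2, 3], disable_prompt_cache=False))  # 3 (prefix reused)
print(reusable_prefix_len(cache, [1, 2, 3], disable_prompt_cache=True))   # 0 (cache bypassed)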

lightllm/server/visualserver/manager.py

Lines changed: 2 additions & 2 deletions

@@ -123,12 +123,12 @@ async def loop_for_fwd(self):

                 multimodal_params = group_req_indexes.multimodal_params

-                img_uuids = list(dict.fromkeys(img.uuid for img in multimodal_params.images))
+                img_uuids = [img.uuid for img in multimodal_params.images]
                 if multimodal_params.skip_image_cache:
                     ready_image = [False] * len(img_uuids)
                 else:
                     ready_image = obtain(self.cache_client.root.get_items_embed(img_uuids))
-                for img, ready in zip(img_uuids, ready_image):
+                for img, ready in zip(multimodal_params.images, ready_image):
                     if not ready:
                         images_need_infer.append(img)
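This change fixes an alignment and type bug: deduplicating the uuid list could leave ready_image shorter than multimodal_params.images, and the old loop appended bare uuids instead of the ImageItem objects the encoder expects. A plain-Python illustration (FakeImage is a stand-in for ImageItem):

# Plain-Python illustration; FakeImage is a hypothetical stand-in for ImageItem.
class FakeImage:
    def __init__(self, uuid):
        self.uuid = uuid

images = [FakeImage("A"), FakeImage("B"), FakeImage("A")]      # the same image sent twice

old_uuids = list(dict.fromkeys(img.uuid for img in images))    # ["A", "B"]: shorter than images
new_uuids = [img.uuid for img in images]                       # ["A", "B", "A"]: one entry per image

# The cache answers one flag per uuid it is asked about, so only the
# non-deduplicated list pairs up one-to-one with `images`:
ready_image = [False, True, False]                             # len == len(new_uuids) == len(images)
images_need_infer = [img for img, ready in zip(images, ready_image) if not ready]
print([img.uuid for img in images_need_infer])                 # ['A', 'A'], as ImageItem-like objects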

lightllm/server/visualserver/model_infer/model_rpc.py

Lines changed: 2 additions & 1 deletion

@@ -100,8 +100,9 @@ def exposed_encode(self, images: List[ImageItem]):
         all_img_embeds = all_img_embeds.to(torch.device("cpu"))

         if self.tp_rank_id == 0:
+            ready_flags = obtain(self.cache_client.root.get_items_embed(uuids))
             ids_to_set = []
-            for i, ready in uuids:
+            for i, ready in enumerate(ready_flags):
                 if ready:
                     continue
                 uid = uuids[i]
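The old loop tried to unpack (index, flag) pairs out of a plain uuid list, which would raise at runtime; the readiness flags now come from an explicit cache query. A hedged, isolated sketch of the repaired pattern, where cache_lookup and store are placeholders for the rpyc calls used in the file:

# Isolated sketch; cache_lookup stands in for obtain(cache_client.root.get_items_embed(uuids))
# and store for whatever writes an embedding back to the cache.
def store_missing_embeds(uuids, embeds, cache_lookup, store):
    ready_flags = cache_lookup(uuids)            # one bool per uuid
    for i, ready in enumerate(ready_flags):      # old code iterated the uuid list itself here
        if ready:
            continue                             # embedding already cached by an earlier request
        store(uuids[i], embeds[i])               # only write the ones still missing

# Example with in-memory stand-ins:
cache = {"A": True}
store_missing_embeds(
    uuids=["A", "B"],
    embeds=["embed_A", "embed_B"],
    cache_lookup=lambda us: [u in cache for u in us],
    store=lambda u, e: cache.update({u: True}),
)
print(cache)                                     # {'A': True, 'B': True}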

lightllm/utils/shm_size_check.py

Lines changed: 1 addition & 0 deletions

@@ -117,6 +117,7 @@ def _get_recommended_shm_size_gb(args, max_image_resolution=(3940, 2160), dtype_
     )
     fake_image_item.image_w = fake_image_item._data[0]
     fake_image_item.image_h = fake_image_item._data[1]
+    fake_image_item.extra_params["image_patch_max_num"] = 12
     max_image_tokens = tokenizer.get_image_token_length(fake_image_item)

     # Estimate the resources required for image tokens
