[add] honor disable_prompt_cache in the audio/image embed cache lookups

SangChengC · SangChengC · commit 13b0c837ef94 · 2025-09-24T10:10:48.000Z
diff --git a/lightllm/server/audioserver/manager.py b/lightllm/server/audioserver/manager.py
@@ -86,6 +86,7 @@ async def loop_for_fwd(self):
                 while len(self.waiting_reqs) > 0:
                     group_req_indexes = self.waiting_reqs.pop(0)
                     shm_req = self.shm_req_manager.get_req_obj_by_index(group_req_indexes.shm_req_indexes[0])
+                    disable_prompt_cache = shm_req.sample_params.disable_prompt_cache
                     is_aborted = shm_req.is_aborted
                     self.shm_req_manager.put_back_req_obj(shm_req)
                     if is_aborted:
@@ -98,7 +99,11 @@ async def loop_for_fwd(self):
                     multimodal_params = group_req_indexes.multimodal_params
 
                     audio_uuids = [audio.uuid for audio in multimodal_params.audios]
-                    ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids))
+                    # disable prompt cache通常用来测试，需要也去掉audio cache的影响
+                    if disable_prompt_cache:
+                        ready_audio = [False] * len(audio_uuids)
+                    else:
+                        ready_audio = obtain(self.cache_client.root.get_items_embed(audio_uuids))
 
                     for audio, ready in zip(multimodal_params.audios, ready_audio):
                         if not ready:
diff --git a/lightllm/server/core/objs/sampling_params.py b/lightllm/server/core/objs/sampling_params.py
@@ -307,7 +307,7 @@ class SamplingParams(ctypes.Structure):
             ctypes.c_bool,
         ),  # whether to add spaces between special tokens when decoding
         ("print_eos_token", ctypes.c_bool),  # eos_id will be always ignored except the value is set to True
-        ("disable_prompt_cache", ctypes.c_bool),  # eos_id will be always ignored except the value is set to True
+        ("disable_prompt_cache", ctypes.c_bool),  # whether to disable prompt cache
     ]
 
     _do_sample: bool = False
diff --git a/lightllm/server/httpserver_for_pd_master/manager.py b/lightllm/server/httpserver_for_pd_master/manager.py
@@ -153,6 +153,9 @@ async def fetch_stream(
     ):
         group_request_id = sampling_params.group_request_id
 
+        # PD分离模式下，use prompt cache必须为True
+        sampling_params.disable_prompt_cache = False
+
         req_status = ReqStatus(group_request_id, p_node, d_node)
         self.req_id_to_out_inf[group_request_id] = req_status
 
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
@@ -137,11 +137,9 @@ def __init__(
         self,
         images: List[dict] = [],
         audios: List[dict] = [],
-        skip_image_cache: bool = False,
     ) -> None:
         self.images = [ImageItem(**i) for i in images]
         self.audios = [AudioItem(**a) for a in audios]
-        self.skip_image_cache = skip_image_cache
         return
 
     async def verify_and_preload(self, request: Request):
diff --git a/lightllm/server/router/model_infer/infer_batch.py b/lightllm/server/router/model_infer/infer_batch.py
@@ -309,7 +309,7 @@ def __init__(
         self.mtp_step: int = get_env_start_args().mtp_step
 
         self._init_all_state()
-        if init_prefix_cache and not self.sampling_param.disable_prompt_cache:
+        if init_prefix_cache:
             self._match_radix_cache()
         return
 
@@ -337,9 +337,6 @@ def _init_all_state(self):
 
     def _match_radix_cache(self):
         if self.sampling_param.disable_prompt_cache:
-            self.shared_kv_node = None
-            self.shm_req.prompt_cache_len = 0
-            self.shm_req.shm_cur_kv_len = self.cur_kv_len
             return
         if g_infer_context.radix_cache is not None and self.get_cur_total_len() > 1 and self.cur_kv_len == 0:
             input_token_ids = self.shm_req.shm_prompt_ids.arr[0 : self.get_cur_total_len()]
diff --git a/lightllm/server/visualserver/manager.py b/lightllm/server/visualserver/manager.py
@@ -113,6 +113,7 @@ async def loop_for_fwd(self):
                     group_req_indexes = self.waiting_reqs.pop(0)
                     shm_req = self.shm_req_manager.get_req_obj_by_index(group_req_indexes.shm_req_indexes[0])
                     is_aborted = shm_req.is_aborted
+                    disable_prompt_cache = shm_req.sample_params.disable_prompt_cache
                     self.shm_req_manager.put_back_req_obj(shm_req)
                     if is_aborted:
                         # 因为连接断开 aborted 掉的请求也需要传输到后续的模块进行处理
@@ -124,10 +125,12 @@ async def loop_for_fwd(self):
                     multimodal_params = group_req_indexes.multimodal_params
 
                     img_uuids = [img.uuid for img in multimodal_params.images]
-                    if multimodal_params.skip_image_cache:
+                    # disable prompt cache通常用来测试，需要也去掉image cache的影响
+                    if disable_prompt_cache:
                         ready_image = [False] * len(img_uuids)
                     else:
                         ready_image = obtain(self.cache_client.root.get_items_embed(img_uuids))
+
                     for img, ready in zip(multimodal_params.images, ready_image):
                         if not ready:
                             images_need_infer.append(img)

Original file line number	Diff line number	Diff line change
`@@ -307,7 +307,7 @@ class SamplingParams(ctypes.Structure):`
`307`	`307`	`ctypes.c_bool,`
`308`	`308`	`), # whether to add spaces between special tokens when decoding`
`309`	`309`	`("print_eos_token", ctypes.c_bool), # eos_id will be always ignored except the value is set to True`
`310`		`- ("disable_prompt_cache", ctypes.c_bool), # eos_id will be always ignored except the value is set to True`
	`310`	`+ ("disable_prompt_cache", ctypes.c_bool), # whether to disable prompt cache`
`311`	`311`	`]`
`312`	`312`
`313`	`313`	`_do_sample: bool = False`