Commit ff04aed

fix
1 parent 0e03ecb commit ff04aed

File tree

4 files changed, +0 -6 lines changed:

lightllm/common/basemodel/basemodel.py
lightllm/common/basemodel/infer_struct.py
lightllm/server/router/model_infer/mode_backend/base_backend.py
test/benchmark/static_inference/model_infer_mtp.py

lightllm/common/basemodel/basemodel.py

Lines changed: 0 additions & 1 deletion
@@ -62,7 +62,6 @@ def __init__(self, kvargs):
         self.is_token_healing = kvargs.get("is_token_healing", False)
         self.return_all_prompt_logics = kvargs.get("return_all_prompt_logics", False)
         assert not (self.is_token_healing and self.return_all_prompt_logics), "can not be true in same time"
-        self.use_dynamic_prompt_cache = kvargs.get("use_dynamic_prompt_cache", False)
         self.data_type = kvargs.get("data_type", "float16")
         self.graph_max_batch_size = kvargs.get("graph_max_batch_size", 16)
         self.graph_max_batch_size = (

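For context: the constructor above reads its settings from a plain kvargs dict via dict.get with defaults, so after this deletion a caller that still passes use_dynamic_prompt_cache is silently ignored, while any later read of self.use_dynamic_prompt_cache would raise AttributeError. A minimal sketch of that pattern (a hypothetical standalone class, not the actual lightllm model; only the kvargs keys mirror the diff):

```python
# Minimal sketch of the kvargs pattern above. _ModelConfigSketch is hypothetical,
# not the real lightllm class; only the kvargs keys mirror the diff.
class _ModelConfigSketch:
    def __init__(self, kvargs: dict):
        self.is_token_healing = kvargs.get("is_token_healing", False)
        self.return_all_prompt_logics = kvargs.get("return_all_prompt_logics", False)
        self.data_type = kvargs.get("data_type", "float16")
        self.graph_max_batch_size = kvargs.get("graph_max_batch_size", 16)
        # After this commit there is no self.use_dynamic_prompt_cache attribute here.


cfg = _ModelConfigSketch({"data_type": "float16", "use_dynamic_prompt_cache": True})
print(cfg.data_type)                             # float16
print(hasattr(cfg, "use_dynamic_prompt_cache"))  # False -- the stray key is ignored
```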
lightllm/common/basemodel/infer_struct.py

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ def __init__(self):
 
         self.is_token_healing: bool = False
         self.return_all_prompt_logics: bool = False
-        self.use_dynamic_prompt_cache: bool = False
         self.multimodal_params: dict = None
         self.is_cuda_graph: bool = False  # marks whether this is a CUDA graph capture inference
         self.dist_group: CustomProcessGroup = None

lightllm/server/router/model_infer/mode_backend/base_backend.py

Lines changed: 0 additions & 2 deletions
@@ -124,7 +124,6 @@ def init_model(self, kvargs):
             "max_seq_length": kvargs.get("max_seq_length", 1024 * 5),
             "is_token_healing": kvargs.get("is_token_healing", False),
             "return_all_prompt_logics": self.return_all_prompt_logprobs,
-            "use_dynamic_prompt_cache": self.use_dynamic_prompt_cache,
             "disable_chunked_prefill": self.disable_chunked_prefill,
             "data_type": kvargs.get("data_type", "float16"),
             "graph_max_batch_size": kvargs.get("graph_max_batch_size", 16),
@@ -231,7 +230,6 @@ def init_mtp_draft_model(self, main_kvargs: dict):
             "max_seq_length": main_kvargs.get("max_seq_length", 1024 * 5),
             "is_token_healing": False,
             "return_all_prompt_logics": False,
-            "use_dynamic_prompt_cache": self.use_dynamic_prompt_cache,
             "disable_chunked_prefill": self.disable_chunked_prefill,
             "data_type": main_kvargs.get("data_type", "float16"),
             "graph_max_batch_size": main_kvargs.get("graph_max_batch_size", 16),

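Both hunks drop the same key from the kvargs dict that the backend hands to the model. A rough sketch of what that dict looks like after this commit; the literal values are placeholders standing in for the backend's real attributes, not the actual defaults used at runtime:

```python
# Placeholder values only -- a sketch of the backend's model kvargs after this commit.
model_kvargs = {
    "max_seq_length": 1024 * 5,
    "is_token_healing": False,
    "return_all_prompt_logics": False,   # self.return_all_prompt_logprobs in the backend
    "disable_chunked_prefill": False,    # self.disable_chunked_prefill in the backend
    "data_type": "float16",
    "graph_max_batch_size": 16,
}
assert "use_dynamic_prompt_cache" not in model_kvargs
```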
test/benchmark/static_inference/model_infer_mtp.py

Lines changed: 0 additions & 2 deletions
@@ -27,7 +27,6 @@ def init_mtp_model(args: StartArgs, kvargs, main_model):
         {
             "weight_dir": args.mtp_draft_model_dir,
             "max_total_token_num": main_model.mem_manager.size,
-            "use_dynamic_prompt_cache": False,
             "disable_chunked_prefill": True,
             "mtp_mode": args.mtp_mode,
             "main_model": main_model,
@@ -39,7 +38,6 @@ def init_mtp_model(args: StartArgs, kvargs, main_model):
         {
             "weight_dir": args.spec_model_dir,
             "max_total_token_num": main_model.mem_manager.size,
-            "use_dynamic_prompt_cache": False,
             "disable_chunked_prefill": True,
             "mtp_mode": args.mtp_mode,
             "main_model": main_model,
