Commit 8d1b068

[TRTLLM-8477][chore] Replace KvCacheConfigCpp with KvCacheConfig inside PyExecutor (#8259)
Signed-off-by: leslie-fang25 <leslief@nvidia.com>
1 parent 1a90449 commit 8d1b068
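
The change is mechanical: annotations and helpers inside PyExecutor that previously took the pybind-generated config now take the Python-level one, and conversion to the binding type happens only where C++ is actually invoked. As a quick orientation, a sketch of the two types involved, using only the import paths that appear in the diffs below:

    import tensorrt_llm.bindings
    from tensorrt_llm.llmapi.llm_args import KvCacheConfig  # Python-level config, now used inside PyExecutor

    # The module-level alias removed by this commit pointed at the pybind type:
    KvCacheConfigCpp = tensorrt_llm.bindings.executor.KvCacheConfig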

File tree

6 files changed: +23 -44 lines


tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 3 additions & 3 deletions
@@ -10,7 +10,7 @@
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str
 from tensorrt_llm.bindings.executor import DecodingMode
-from tensorrt_llm.llmapi.llm_args import (EagleDecodingConfig,
+from tensorrt_llm.llmapi.llm_args import (EagleDecodingConfig, KvCacheConfig,
                                           MTPDecodingConfig, PeftCacheConfig,
                                           SamplerType, SpeculativeConfig,
                                           TorchLlmArgs)
@@ -58,7 +58,7 @@ def __init__(
         tokens_per_block: int,
         max_seq_len: int,
         max_batch_size: int,
-        kv_cache_config: trtllm.KvCacheConfig,
+        kv_cache_config: KvCacheConfig,
         pytorch_backend_config: PyTorchConfig,
         speculative_config: SpeculativeConfig,
     ):
@@ -790,7 +790,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
                         max_seq_len: int, mm_encoder_only: bool,
                         speculative_config: SpeculativeConfig,
                         decoding_config: trtllm.DecodingConfig,
-                        kv_cache_config: trtllm.KvCacheConfig):
+                        kv_cache_config: KvCacheConfig):
     sampler_args = create_torch_sampler_args(
         mapping,
         max_seq_len=engine.max_seq_len,

tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py

Lines changed: 3 additions & 3 deletions
@@ -19,9 +19,9 @@
 
 from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest
 from tensorrt_llm._torch.pyexecutor.resource_manager import (
-    BaseResourceManager, CacheTypeCpp, DataType, KvCacheConfigCpp,
-    KVCacheManager, get_pp_layers)
+    BaseResourceManager, CacheTypeCpp, DataType, KVCacheManager, get_pp_layers)
 from tensorrt_llm._torch.pyexecutor.scheduler import ScheduledRequests
+from tensorrt_llm.llmapi.llm_args import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
 
 
@@ -180,7 +180,7 @@ def __init__(
         mamba_ssm_cache_dtype: torch.dtype,
 
         # kv cache parameters
-        kv_cache_config: KvCacheConfigCpp,
+        kv_cache_config: KvCacheConfig,
         kv_cache_type: CacheTypeCpp,
         *,
         num_layers: int,

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 2 additions & 2 deletions
@@ -223,7 +223,7 @@ def create_py_executor(
                                           llm_args.peft_cache_config)
 
     assert llm_args.kv_cache_config, "Expect llm_args.kv_cache_config is not None"
-    kv_cache_config = PybindMirror.maybe_to_pybind(llm_args.kv_cache_config)
+    kv_cache_config = llm_args.kv_cache_config
     if os.getenv("FORCE_DETERMINISTIC", "0") == "1":
         # Disable KV cache reuse for deterministic mode
         kv_cache_config.enable_block_reuse = False
@@ -251,7 +251,7 @@ def create_py_executor(
     if max_num_tokens is None:
         max_num_tokens = 8192
 
-    tokens_per_block = llm_args.kv_cache_config.tokens_per_block
+    tokens_per_block = kv_cache_config.tokens_per_block
 
     if pytorch_backend_config.attn_backend in [
             "FLASHINFER", "FLASHINFER_STAR_ATTENTION"

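The net effect of the two hunks above is that create_py_executor keeps llm_args.kv_cache_config as the Python object instead of converting it up front. A minimal standalone sketch of that usage; the field names come from the hunks, the values are illustrative:

    from tensorrt_llm.llmapi.llm_args import KvCacheConfig

    # The executor creator now holds the Python-level config directly ...
    kv_cache_config = KvCacheConfig(enable_block_reuse=True)

    # ... and reads or mutates its fields without a pybind round-trip.
    kv_cache_config.enable_block_reuse = False   # e.g. the FORCE_DETERMINISTIC path
    tokens_per_block = kv_cache_config.tokens_per_block
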
tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 8 additions & 30 deletions
@@ -11,6 +11,7 @@
 import tensorrt_llm.bindings
 from tensorrt_llm._utils import mpi_disabled
 from tensorrt_llm.bindings.BuildInfo import ENABLE_MULTI_DEVICE
+from tensorrt_llm.llmapi.llm_args import KvCacheConfig, PybindMirror
 from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.lora_manager import LoraManager, LoraModelConfig
 from tensorrt_llm.runtime import ModelConfig as ModelConfigPython
@@ -31,7 +32,6 @@
 
 BufferManagerCpp = tensorrt_llm.bindings.internal.runtime.BufferManager
 KVCacheManagerCpp = tensorrt_llm.bindings.internal.batch_manager.KVCacheManager
-KvCacheConfigCpp = tensorrt_llm.bindings.executor.KvCacheConfig
 CacheTypeCpp = tensorrt_llm.bindings.internal.batch_manager.CacheType
 ModelConfigCpp = tensorrt_llm.bindings.ModelConfig
 DataType = tensorrt_llm.bindings.DataType
@@ -145,7 +145,7 @@ class KVCacheManager(BaseResourceManager):
 
     def __init__(
         self,
-        kv_cache_config: KvCacheConfigCpp,
+        kv_cache_config: KvCacheConfig,
         kv_cache_type: CacheTypeCpp,
         *,
         num_layers: int,
@@ -268,8 +268,8 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
             )
             # kv cache config check
             assert isinstance(
-                kv_cache_config, KvCacheConfigCpp
-            ), "calculate_max_num_blocks_from_cpp only accepts KvCacheConfigCpp"
+                kv_cache_config, KvCacheConfig
+            ), "calculate_max_num_blocks_from_cpp only accepts KvCacheConfig"
             blocks_per_window = self.calculate_max_num_blocks_from_cpp(
                 kv_cache_config=kv_cache_config,
                 model_config=model_config,
@@ -370,28 +370,6 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
     def shutdown(self):
         self.impl.release_pools()
 
-    @classmethod
-    def from_model_config(cls,
-                          model_config: ModelConfigCpp,
-                          kv_cache_config: KvCacheConfigCpp,
-                          mapping: Mapping,
-                          kv_cache_type: CacheTypeCpp = CacheTypeCpp.SELF,
-                          dtype: DataType = DataType.HALF) -> "KVCacheManager":
-        return cls(
-            kv_cache_config,
-            kv_cache_type,
-            num_layers=model_config.num_attention_layers(mapping.pp_size),
-            # NOTE: this preserves existing behavior in KV cache manager.
-            # But we should change this to pass a list at some point.
-            # We're assuming the KV cache is homogeneous here.
-            num_kv_heads=model_config.num_kv_heads(0),
-            head_dim=model_config.size_per_head,
-            tokens_per_block=model_config.tokens_per_block,
-            max_seq_len=model_config.max_seq_len,
-            max_batch_size=model_config.max_batch_size,
-            mapping=mapping,
-            dtype=dtype)
-
     def get_max_resource_count(self) -> int:
         return self.impl.max_num_blocks
 
@@ -566,7 +544,7 @@ def calculate_scaling_factor_size_bytes(
                                                 scaling_factor_dtype)
 
     def calculate_max_num_blocks(self,
-                                 kv_cache_config: KvCacheConfigCpp,
+                                 kv_cache_config: KvCacheConfig,
                                  head_dim: int,
                                  tokens_per_block: int,
                                  mapping: Mapping,
@@ -772,7 +750,7 @@ def _get_window_size_to_layers(self) -> dict[int, list[int]]:
     def adjust_window_sizes_for_vswa(
             window_size_to_layers: Dict[int, List[int]],
             max_attention_window_vec: List[int],
-            kv_cache_config: KvCacheConfigCpp,
+            kv_cache_config: KvCacheConfig,
             model_config: ModelConfigCpp,
             pool_memory_bytes: int,
             kv_factor: int,
@@ -887,7 +865,7 @@ def calculate_cache_size_per_token(layers: Set[int]) -> int:
 
     def calculate_max_num_blocks_from_cpp(
             self,
-            kv_cache_config: KvCacheConfigCpp,
+            kv_cache_config: KvCacheConfig,
             model_config: ModelConfigCpp,
             extra_cost_memory: int = 0) -> dict[int, tuple[int, int]]:
         """
@@ -945,7 +923,7 @@ def calculate_max_num_blocks_from_cpp(
         self.max_attention_window_vec = max_attention_window_vec
 
         blocks_per_window = KVCacheManagerCpp.calculate_max_num_blocks(
-            config=kv_cache_config,
+            config=PybindMirror.maybe_to_pybind(kv_cache_config),
             # TODO: support cross attention
             is_cross_attention=is_cross_attention,
             dtype=self.dtype,
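
The final hunk is the one place where the Python config still meets the C++ binding: KVCacheManagerCpp.calculate_max_num_blocks keeps receiving the pybind type, produced on the fly. A minimal sketch of that boundary pattern, shown standalone rather than inside the class:

    from tensorrt_llm.llmapi.llm_args import KvCacheConfig, PybindMirror

    # The Python object is what KVCacheManager carries around for checks and sizing ...
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

    # ... and only the C++ call site converts it, mirroring
    # config=PybindMirror.maybe_to_pybind(kv_cache_config) above.
    kv_cache_config_cpp = PybindMirror.maybe_to_pybind(kv_cache_config)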

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 2 additions & 1 deletion
@@ -17,14 +17,15 @@
 from tensorrt_llm.bindings import (CudaStream, DataType, ModelConfig,
                                    WorldConfig, make_sampling_config)
 from tensorrt_llm.bindings.executor import (DecodingConfig, DecodingMode,
-                                            FinishReason, KvCacheConfig)
+                                            FinishReason)
 from tensorrt_llm.bindings.internal.algorithms import CreateNewDecoderRequests
 from tensorrt_llm.bindings.internal.batch_manager import (
     DecoderInputBuffers, add_new_tokens_to_requests, make_decoding_batch_input)
 from tensorrt_llm.bindings.internal.runtime import (BufferManager, CudaEvent,
                                                     DecoderState,
                                                     GptDecoderBatched)
 from tensorrt_llm.executor.result import Logprob
+from tensorrt_llm.llmapi.llm_args import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.sampling_params import SamplingParams
 
tests/unittest/_torch/executor/test_resource_manager.py

Lines changed: 5 additions & 5 deletions
@@ -20,6 +20,7 @@
 from tensorrt_llm.bindings import executor as tllm
 from tensorrt_llm.bindings.internal.batch_manager import \
     PeftTaskNotCachedException
+from tensorrt_llm.llmapi.llm_args import KvCacheConfig
 from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.mapping import Mapping
 
@@ -574,11 +575,11 @@ def _create_model_config_for_kv_cache_manager() -> ModelConfigCpp:
 
     @staticmethod
     def _create_kv_cache_config_for_kv_cache_manager(
-            params: dict) -> tllm.KvCacheConfig:
+            params: dict) -> KvCacheConfig:
         """
         Create a KV cache config for KVCacheManager test.
         """
-        return tllm.KvCacheConfig(**params)
+        return KvCacheConfig(**params)
 
     def test_calculate_max_num_blocks_from_cpp(self):
         # Construct a minimal mapping (single-rank, no TP/PP)
@@ -633,9 +634,8 @@ class MemTestCase(NamedTuple):
                 "free_gpu_memory_fraction": free_gpu_memory_fraction,
                 "enable_block_reuse": enable_block_reuse,
             },
-            # NOTE: use np.float32 to avoid float precision issue between python(double in most cases) and cpp binding(float)
-            expected_memory_bytes=(int(
-                fixed_free_mem * np.float32(free_gpu_memory_fraction)), 0),
+            expected_memory_bytes=(int(fixed_free_mem *
+                                       free_gpu_memory_fraction), 0),
         ),
     ]
 
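
The removed NOTE existed because the old expectation had to mimic the binding's 32-bit float handling of free_gpu_memory_fraction; the new expectation is plain double arithmetic. A small illustration of the rounding gap that comment referred to, with a made-up free-memory figure:

    import numpy as np

    free_gpu_memory_fraction = 0.9
    fixed_free_mem = 8 * 1024**3   # hypothetical 8 GiB of free GPU memory

    # Old-style expectation: push the fraction through float32 first.
    int(fixed_free_mem * float(np.float32(free_gpu_memory_fraction)))   # 7730940928
    # New-style expectation: ordinary Python doubles.
    int(fixed_free_mem * free_gpu_memory_fraction)                      # 7730941132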
