Skip to content

Commit 6ee1c87

Browse files
authored
[TRTLLM-8817][chore] Set default value of KvCacheConfig.free_gpu_memory_fraction explicitly (#8561)
Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
1 parent 2392022 commit 6ee1c87

File tree

6 files changed

+15
-18
lines changed

6 files changed

+15
-18
lines changed

examples/llm-api/quickstart_advanced.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ def add_llm_args(parser):
8484
parser.add_argument('--disable_kv_cache_reuse',
8585
default=False,
8686
action='store_true')
87-
parser.add_argument("--kv_cache_fraction", type=float, default=None)
8887

8988
# Runtime
9089
parser.add_argument('--disable_overlap_scheduler',
@@ -170,6 +169,7 @@ def parse_arguments():
170169
parser = argparse.ArgumentParser(
171170
description="LLM models with the PyTorch workflow.")
172171
parser = add_llm_args(parser)
172+
parser.add_argument("--kv_cache_fraction", type=float, default=0.9)
173173
args = parser.parse_args()
174174
return args
175175

examples/llm-api/quickstart_multimodal.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,13 +150,10 @@ def parse_arguments():
150150
parser = argparse.ArgumentParser(
151151
description="Multimodal models with the PyTorch workflow.")
152152
parser = add_llm_args(parser)
153+
parser.add_argument("--kv_cache_fraction", type=float, default=0.6)
153154
parser = add_multimodal_args(parser)
154155
parser = add_lora_args(parser)
155156
args = parser.parse_args()
156-
157-
if args.kv_cache_fraction is None:
158-
args.kv_cache_fraction = 0.6 # lower the default kv cache fraction for multimodal
159-
160157
return args
161158

162159

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,6 @@ def __init__(
102102
self._kv_cache_manager_cls = get_kv_cache_manager_cls(
103103
model_engine.model.model_config)
104104

105-
def _get_free_gpu_memory_fraction(self) -> float:
106-
fraction = self._kv_cache_config.free_gpu_memory_fraction
107-
if fraction is None:
108-
fraction = 0.9
109-
return fraction
110-
111105
def _get_kv_size_per_token(self):
112106
model_config = self._model_engine.model.model_config
113107
mapping = self._mapping
@@ -300,7 +294,7 @@ def configure_kv_cache_capacity(self, py_executor: PyExecutor) -> None:
300294
# TODO: support CP by generating dummy requests for it.
301295
assert 'cp_type' not in mapping.cp_config
302296

303-
fraction = self._get_free_gpu_memory_fraction()
297+
fraction = self._kv_cache_config.free_gpu_memory_fraction
304298

305299
torch.cuda.empty_cache()
306300
torch.cuda.reset_peak_memory_stats()

tensorrt_llm/commands/serve.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def get_llm_args(model: str,
8484
pipeline_parallel_size: int = 1,
8585
moe_expert_parallel_size: Optional[int] = None,
8686
gpus_per_node: Optional[int] = None,
87-
free_gpu_memory_fraction: Optional[float] = None,
87+
free_gpu_memory_fraction: float = 0.9,
8888
num_postprocess_workers: int = 0,
8989
trust_remote_code: bool = False,
9090
reasoning_parser: Optional[str] = None,

tensorrt_llm/llmapi/llm_args.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1241,7 +1241,7 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
12411241
description=
12421242
"Number of sink tokens (tokens to always keep in attention window).")
12431243
free_gpu_memory_fraction: Optional[float] = Field(
1244-
default=None,
1244+
default=0.9,
12451245
description=
12461246
"The fraction of GPU memory fraction that should be allocated for the KV cache. Default is 90%. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used."
12471247
)
@@ -1323,6 +1323,16 @@ def _to_pybind(self):
13231323
attention_dp_events_gather_period_ms,
13241324
max_gpu_total_bytes=self.max_gpu_total_bytes)
13251325

1326+
@field_validator('free_gpu_memory_fraction')
1327+
@classmethod
1328+
def validate_free_gpu_memory_fraction(cls, v: float):
1329+
"""Validates that the fraction is between 0.0 and 1.0."""
1330+
if not 0 <= v <= 1:
1331+
raise ValueError(
1332+
"kv_cache_config.free_gpu_memory_fraction must be a float between 0 and 1"
1333+
)
1334+
return v
1335+
13261336
@field_validator('max_gpu_total_bytes')
13271337
@classmethod
13281338
def validate_max_gpu_total_bytes(cls, v: int):

tests/unittest/llmapi/test_llm_args.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,6 @@ def test_KvCacheConfig_declaration():
180180
assert pybind_config.attention_dp_events_gather_period_ms == 10
181181

182182

183-
def test_KvCacheConfig_default_values():
184-
check_defaults(KvCacheConfig, tle.KvCacheConfig)
185-
186-
187183
def test_CapacitySchedulerPolicy():
188184
val = CapacitySchedulerPolicy.MAX_UTILIZATION
189185
assert PybindMirror.maybe_to_pybind(

0 commit comments

Comments (0)