Skip to content

Commit 6ee1c87

Browse files
authored
[TRTLLM-8817][chore] Set default value of KvCacheConfig.free_gpu_memory_fraction explicitly (#8561)
Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
1 parent 2392022 commit 6ee1c87

File tree

6 files changed

+15
-18
lines changed

6 files changed

+15
-18
lines changed

examples/llm-api/quickstart_advanced.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ def add_llm_args(parser):
8484
parser.add_argument('--disable_kv_cache_reuse',
8585
default=False,
8686
action='store_true')
87-
parser.add_argument("--kv_cache_fraction", type=float, default=None)
8887

8988
# Runtime
9089
parser.add_argument('--disable_overlap_scheduler',
@@ -170,6 +169,7 @@ def parse_arguments():
170169
parser = argparse.ArgumentParser(
171170
description="LLM models with the PyTorch workflow.")
172171
parser = add_llm_args(parser)
172+
parser.add_argument("--kv_cache_fraction", type=float, default=0.9)
173173
args = parser.parse_args()
174174
return args
175175

examples/llm-api/quickstart_multimodal.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,13 +150,10 @@ def parse_arguments():
150150
parser = argparse.ArgumentParser(
151151
description="Multimodal models with the PyTorch workflow.")
152152
parser = add_llm_args(parser)
153+
parser.add_argument("--kv_cache_fraction", type=float, default=0.6)
153154
parser = add_multimodal_args(parser)
154155
parser = add_lora_args(parser)
155156
args = parser.parse_args()
156-
157-
if args.kv_cache_fraction is None:
158-
args.kv_cache_fraction = 0.6 # lower the default kv cache fraction for multimodal
159-
160157
return args
161158

162159

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,6 @@ def __init__(
102102
self._kv_cache_manager_cls = get_kv_cache_manager_cls(
103103
model_engine.model.model_config)
104104

105-
def _get_free_gpu_memory_fraction(self) -> float:
106-
fraction = self._kv_cache_config.free_gpu_memory_fraction
107-
if fraction is None:
108-
fraction = 0.9
109-
return fraction
110-
111105
def _get_kv_size_per_token(self):
112106
model_config = self._model_engine.model.model_config
113107
mapping = self._mapping
@@ -300,7 +294,7 @@ def configure_kv_cache_capacity(self, py_executor: PyExecutor) -> None:
300294
# TODO: support CP by generating dummy requests for it.
301295
assert 'cp_type' not in mapping.cp_config
302296

303-
fraction = self._get_free_gpu_memory_fraction()
297+
fraction = self._kv_cache_config.free_gpu_memory_fraction
304298

305299
torch.cuda.empty_cache()
306300
torch.cuda.reset_peak_memory_stats()

tensorrt_llm/commands/serve.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def get_llm_args(model: str,
8484
pipeline_parallel_size: int = 1,
8585
moe_expert_parallel_size: Optional[int] = None,
8686
gpus_per_node: Optional[int] = None,
87-
free_gpu_memory_fraction: Optional[float] = None,
87+
free_gpu_memory_fraction: float = 0.9,
8888
num_postprocess_workers: int = 0,
8989
trust_remote_code: bool = False,
9090
reasoning_parser: Optional[str] = None,

tensorrt_llm/llmapi/llm_args.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1241,7 +1241,7 @@ class KvCacheConfig(StrictBaseModel, PybindMirror):
12411241
description=
12421242
"Number of sink tokens (tokens to always keep in attention window).")
12431243
free_gpu_memory_fraction: Optional[float] = Field(
1244-
default=None,
1244+
default=0.9,
12451245
description=
12461246
"The fraction of GPU memory fraction that should be allocated for the KV cache. Default is 90%. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used."
12471247
)
@@ -1323,6 +1323,16 @@ def _to_pybind(self):
13231323
attention_dp_events_gather_period_ms,
13241324
max_gpu_total_bytes=self.max_gpu_total_bytes)
13251325

1326+
@field_validator('free_gpu_memory_fraction')
1327+
@classmethod
1328+
def validate_free_gpu_memory_fraction(cls, v: float):
1329+
"""Validates that the fraction is between 0.0 and 1.0."""
1330+
if not 0 <= v <= 1:
1331+
raise ValueError(
1332+
"kv_cache_config.free_gpu_memory_fraction must be a float between 0 and 1"
1333+
)
1334+
return v
1335+
13261336
@field_validator('max_gpu_total_bytes')
13271337
@classmethod
13281338
def validate_max_gpu_total_bytes(cls, v: int):

tests/unittest/llmapi/test_llm_args.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,6 @@ def test_KvCacheConfig_declaration():
180180
assert pybind_config.attention_dp_events_gather_period_ms == 10
181181

182182

183-
def test_KvCacheConfig_default_values():
184-
check_defaults(KvCacheConfig, tle.KvCacheConfig)
185-
186-
187183
def test_CapacitySchedulerPolicy():
188184
val = CapacitySchedulerPolicy.MAX_UTILIZATION
189185
assert PybindMirror.maybe_to_pybind(

0 commit comments

Comments (0)