Commit 1ef38f2

[https://nvbugs/5570599][fix] Set KVCache free_gpu_memory_fraction fo… (#8780)
Signed-off-by: Jin Li <[email protected]>
1 parent 69dec20 commit 1ef38f2

File tree

3 files changed: +15 -3 lines changed


tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 8 additions & 1 deletion
@@ -58,7 +58,14 @@ def __init__(
         )
 
     def calculate_max_num_blocks(
-        self, kv_cache_config, head_dim, tokens_per_block, mapping, dtype, kv_factor
+        self,
+        kv_cache_config,
+        head_dim,
+        tokens_per_block,
+        mapping,
+        dtype,
+        kv_factor,
+        enforce_memory_limit,
     ) -> Tuple[int, int]:
         """Calculate the maximum number of blocks needed for the cache."""
         # TODO: this is VERY hacky... Ideally, we want to compute the number of blocks
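The shim's parameter list is split one-per-line to make room for the new argument, and unlike the resource-manager version below it is added without a default, so every caller of this shim method has to pass it explicitly. A minimal runnable sketch of that contract (the stub class and placeholder values are illustrative; only the parameter list mirrors the diff):

from typing import Tuple


class _DemoExecutor:
    """Stand-in reproducing the updated calculate_max_num_blocks signature."""

    def calculate_max_num_blocks(
        self,
        kv_cache_config,
        head_dim,
        tokens_per_block,
        mapping,
        dtype,
        kv_factor,
        enforce_memory_limit,
    ) -> Tuple[int, int]:
        # The real computation lives in the shim; the point here is that
        # enforce_memory_limit has no default, so call sites must supply it.
        return (0, 0)


_DemoExecutor().calculate_max_num_blocks(
    kv_cache_config=None, head_dim=128, tokens_per_block=32,
    mapping=None, dtype=None, kv_factor=2,
    enforce_memory_limit=True,
)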

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 2 additions & 0 deletions
@@ -417,6 +417,7 @@ def _create_kv_cache_manager(
                 is_draft=model_engine.is_draft_model,
                 kv_connector_manager=self._kv_connector_manager
                 if not estimating_kv_cache else None,
+                enforce_memory_limit=estimating_kv_cache,
             )
         elif is_nemotron_hybrid(config):
             if self._max_beam_width > 1:
@@ -490,6 +491,7 @@ def _create_kv_cache_manager(
                 is_draft=model_engine.is_draft_model,
                 kv_connector_manager=self._kv_connector_manager
                 if not estimating_kv_cache else None,
+                enforce_memory_limit=estimating_kv_cache,
             )
         # KVCacheManager (Non-draft) modifies the max_seq_len field, update it to self
         if model_engine.kv_cache_manager_key == ResourceManagerType.KV_CACHE_MANAGER:
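Both call sites in _create_kv_cache_manager wire the flag the same way: enforce_memory_limit is true only while the executor is still estimating KV-cache size, and false for the manager that is finally kept. A small self-contained sketch of that pattern (the function name and arguments other than the two shown in the diff are illustrative):

from typing import Optional


def kv_cache_manager_kwargs(estimating_kv_cache: bool,
                            kv_connector_manager: Optional[object] = None) -> dict:
    # Mirrors the call-site pattern in _util.py: during the estimation pass the
    # connector is dropped and the memory limit is enforced; for the final
    # manager the connector is kept and the limit follows the user's config.
    return dict(
        kv_connector_manager=kv_connector_manager
        if not estimating_kv_cache else None,
        enforce_memory_limit=estimating_kv_cache,
    )


assert kv_cache_manager_kwargs(True, object())["enforce_memory_limit"] is True
assert kv_cache_manager_kwargs(False, object())["enforce_memory_limit"] is False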

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 5 additions & 2 deletions
@@ -164,6 +164,7 @@ def __init__(
         max_beam_width: int = 1,
         is_draft: bool = False,
         kv_connector_manager: Optional[KvCacheConnectorManager] = None,
+        enforce_memory_limit: bool = False,
     ) -> None:
         self.mapping = mapping
         self.dtype = dtype
@@ -283,6 +284,7 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
             mapping=mapping,
             dtype=dtype,
             kv_factor=self.kv_factor,
+            enforce_memory_limit=enforce_memory_limit,
         )
         blocks_per_window = {
             self.max_attention_window_vec[0]:
@@ -563,7 +565,8 @@ def calculate_max_num_blocks(self,
                                  tokens_per_block: int,
                                  mapping: Mapping,
                                  dtype: DataType,
-                                 kv_factor: int = 2):
+                                 kv_factor: int = 2,
+                                 enforce_memory_limit: bool = False):
         free_mem_fraction = (kv_cache_config.free_gpu_memory_fraction
                              if kv_cache_config.free_gpu_memory_fraction
                              is not None else 0.9)
@@ -591,7 +594,7 @@ def calculate_max_num_blocks(self,
         # If user specified a number of tokens
         if kv_cache_config.max_tokens is not None:
             # If user also specified a free gpu memory fraction, take the min
-            if kv_cache_config.free_gpu_memory_fraction is not None:
+            if kv_cache_config.free_gpu_memory_fraction is not None or enforce_memory_limit:
                 max_tokens = min(kv_cache_config.max_tokens, max_tokens)
                 logger.warning(
                     f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {max_tokens} with free memory {free_mem / (1 << 32)} of total memory {total_mem / (1<<32)}, respectively). The smaller value will be used.'
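The behavioral change is the last hunk: previously a user-supplied kv_cache_config.max_tokens was only clamped to the memory-derived budget when free_gpu_memory_fraction was set explicitly; with enforce_memory_limit=True (passed during KV-cache estimation) the clamp also applies when the fraction falls back to its 0.9 default. A standalone sketch of that decision with made-up numbers; the helper name is hypothetical, and the branch that trusts the user's max_tokens when neither condition holds is reconstructed from the surrounding code rather than shown in the diff:

def resolve_max_tokens(user_max_tokens, free_gpu_memory_fraction,
                       memory_derived_max_tokens, enforce_memory_limit=False):
    """Simplified mirror of the clamping logic in calculate_max_num_blocks."""
    max_tokens = memory_derived_max_tokens
    if user_max_tokens is not None:
        if free_gpu_memory_fraction is not None or enforce_memory_limit:
            # Both a token budget and a memory budget apply: the smaller wins.
            max_tokens = min(user_max_tokens, max_tokens)
        else:
            # Assumed pre-existing behavior when only max_tokens is set.
            max_tokens = user_max_tokens
    return max_tokens


# Estimation pass: an oversized max_tokens is capped by the default 0.9 fraction.
assert resolve_max_tokens(1_000_000, None, 200_000, enforce_memory_limit=True) == 200_000
# Regular run with only max_tokens set: the user's value is kept, as before.
assert resolve_max_tokens(1_000_000, None, 200_000) == 1_000_000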
