+import copy
 import enum
 import math
 from abc import ABC, abstractmethod
 from collections import OrderedDict, defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 
 import torch
 
@@ -11,7 +12,7 @@
 from tensorrt_llm.bindings.BuildInfo import ENABLE_MULTI_DEVICE
 from tensorrt_llm.sampling_params import SamplingParams
 
-from ..._utils import nvtx_range
+from ..._utils import binding_dtype_size, nvtx_range
 from ...logger import logger
 from ...mapping import Mapping
 from .llm_request import LlmRequest, LlmRequestState, SamplingConfig
@@ -437,14 +438,10 @@ def calculate_max_num_blocks(self,
         cache_size_per_token = kv_factor * sum(
             self.num_kv_heads_per_layer) * head_dim
 
-        if dtype == DataType.FP8:
-            kv_cache_dtype_bytes = 1
-        elif dtype in (DataType.HALF, DataType.BF16):
-            kv_cache_dtype_bytes = 2
-        elif dtype == DataType.FLOAT:
-            kv_cache_dtype_bytes = 4
-        else:
+        if dtype not in (DataType.FP8, DataType.HALF, DataType.BF16,
+                         DataType.FLOAT):
             raise ValueError(f'Cannot support {dtype} KV cache.')
+        kv_cache_dtype_bytes = binding_dtype_size(dtype)
 
         cache_size_bytes_per_token = cache_size_per_token * kv_cache_dtype_bytes
         free_mem, total_mem = torch.cuda.mem_get_info()
@@ -603,6 +600,102 @@ def _get_window_size_to_layers(self) -> dict[int, list[int]]:
             window_size_to_layers_map[window_size].append(local_layer_idx)
         return window_size_to_layers_map
 
+    @staticmethod
+    def adjust_window_sizes_for_vswa(
+        window_size_to_layers: Dict[int, List[int]],
+        kv_cache_config: KvCacheConfigCpp,
+        model_config: ModelConfig,
+        pool_memory_bytes: int,
+        kv_factor: int,
+        dtype: DataType,
+        is_cross_attention: bool = False,
+    ) -> Dict[int, List[int]]:
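+        """Shrink the VSWA window sizes so one sequence fits in the KV pool.
+
+        Given a mapping from attention window size to the layers that use it,
+        reduce the window sizes (smallest window first) until the KV cache of
+        a single sequence fits in ``pool_memory_bytes``. If it already fits,
+        the input mapping is returned unchanged as a deep copy. When
+        ``kv_cache_config.max_tokens`` is set, it caps the adjusted sizes.
+        """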
+
+        assert is_cross_attention is False, 'Cross attention is not supported'
+
+        max_tokens_from_config = kv_cache_config.max_tokens
+
+        def calculate_cache_size_per_token(layers: Set[int]) -> int:
+            # Same as BaseKVCacheManager::calculateCacheSizePerTokenForSingleWindowSize
+            total_kv_heads = sum(model_config.num_kv_heads_per_layer[i]
+                                 for i in layers)
+            return total_kv_heads * kv_factor * model_config.head_size
+
+        # Calculate the required memory bytes per sequence.
+        required_mem_bytes_per_seq = 0
+        for window_size in sorted(window_size_to_layers):
+            layers = window_size_to_layers[window_size]
+            cache_size_per_token = calculate_cache_size_per_token(layers)
+            cache_size_bytes_per_token = cache_size_per_token * binding_dtype_size(
+                dtype)
+            required_mem_bytes_per_seq += window_size * cache_size_bytes_per_token
+        logger.debug(
+            f'Required memory per sequence: {required_mem_bytes_per_seq} bytes')
+
+        if required_mem_bytes_per_seq < pool_memory_bytes:
+            # No need to adjust the window sizes.
+            return copy.deepcopy(window_size_to_layers)
+
+        logger.debug(
+            f'Adjusting the window sizes {list(window_size_to_layers)} to fit '
+            f'the memory {pool_memory_bytes} bytes.')
+        adjusted_window_size_to_layers = {}
+
+        remaining_mem_bytes = pool_memory_bytes
+        remaining_layers = set(i for layers in window_size_to_layers.values()
+                               for i in layers)
+
+        accum_max_tokens = 0
+        prev_window_size = 0
+
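+        # Greedily grant each window (smallest first) up to
+        # (window_size - prev_window_size) additional tokens while pool memory
+        # remains, then group the layers by the accumulated token budget.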
+        for window_size in sorted(window_size_to_layers):
+            layers = window_size_to_layers[window_size]
+            if remaining_mem_bytes > 0 and remaining_layers:
+                # Calculate cache size per token for remaining layers only.
+                cache_size_per_token = calculate_cache_size_per_token(
+                    remaining_layers)
+                cache_size_bytes_per_token = cache_size_per_token * binding_dtype_size(
+                    dtype)
+                logger.debug(
+                    f'Cache size per token for {len(remaining_layers)} layers: '
+                    f'{cache_size_bytes_per_token} bytes')
+                # Calculate max tokens that can fit in this window with remaining memory.
+                max_tokens_in_window = min(
+                    remaining_mem_bytes // cache_size_bytes_per_token,
+                    window_size - prev_window_size)
+                remaining_mem_bytes -= max_tokens_in_window * cache_size_bytes_per_token
+                accum_max_tokens += max_tokens_in_window
+                logger.debug(f'Remaining memory: {remaining_mem_bytes} bytes')
+                logger.debug(
+                    f'Max token of window {window_size}: {accum_max_tokens}')
+
+                if accum_max_tokens < window_size:
+                    logger.debug(
+                        f'Max tokens ({accum_max_tokens}) cannot fill the current window ({window_size}). '
+                        f'The larger windows will have the same max tokens.')
+                    remaining_mem_bytes = 0
+
+                # Clamp the sequence length if provided explicitly.
+                if max_tokens_from_config is not None:
+                    accum_max_tokens = min(max_tokens_from_config,
+                                           accum_max_tokens)
+                    # If max tokens from config is reached, stop allocating
+                    # more memory. Since the maximum number of tokens is
+                    # already reached, for the remaining windows maxTokens
+                    # will be set by the current value of accumMaxTokens.
+                    if accum_max_tokens == max_tokens_from_config:
+                        remaining_mem_bytes = 0
+
+            if accum_max_tokens not in adjusted_window_size_to_layers:
+                adjusted_window_size_to_layers[accum_max_tokens] = layers.copy()
+            else:
+                adjusted_window_size_to_layers[accum_max_tokens].extend(layers)
+
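+            # Layers of this window are now assigned, so they stop contributing
+            # to the per-token cache cost of the remaining (larger) windows.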
+            remaining_layers -= set(layers)
+            prev_window_size = window_size
+
+        return adjusted_window_size_to_layers
+
     def calculate_max_num_blocks_from_cpp(
             self,
             kv_cache_config: KvCacheConfigCpp,
@@ -622,6 +715,9 @@ def calculate_max_num_blocks_from_cpp(
             A dict of (max_attention_window, (blocks_in_primary_pool, blocks_in_secondary_pool)).
         """
 
+        # VSWA on the Torch backend does not support cross attention yet.
+        is_cross_attention = False
+
         # Construct WorldConfig from self.mapping
         world_config_cpp = WorldConfig(
             tensor_parallelism=self.mapping.tp_size,
@@ -636,12 +732,26 @@ def calculate_max_num_blocks_from_cpp(
         primary_pool_memory_bytes = free_mem
         secondary_pool_memory_bytes = 0
         logger.debug(
-            f"primary_pool_memory_bytes is set to {primary_pool_memory_bytes / 1024**3} GB, \nsecondary_pool_memory_bytes is set to {secondary_pool_memory_bytes / 1024**3} GB"
+            f"primary_pool_memory_bytes is set to {primary_pool_memory_bytes / 1024**3} GB, \n"
+            f"secondary_pool_memory_bytes is set to {secondary_pool_memory_bytes / 1024**3} GB"
+        )
+
+        # If even a single sequence cannot fit in the available memory, shrink
+        # the window sizes so that it does.
+        window_size_to_layers = self.adjust_window_sizes_for_vswa(
+            window_size_to_layers=window_size_to_layers,
+            model_config=model_config,
+            kv_cache_config=kv_cache_config,
+            pool_memory_bytes=primary_pool_memory_bytes,
+            kv_factor=self.kv_factor,
+            dtype=self.dtype,
+            is_cross_attention=is_cross_attention,
         )
 
         blocks_per_window = KVCacheManagerCpp.calculate_max_num_blocks(
             config=kv_cache_config,
-            is_cross_attention=False,  #TODO: support cross attention
+            # TODO: support cross attention
+            is_cross_attention=is_cross_attention,
            dtype=self.dtype,
             model_config=model_config,
             world_config=world_config_cpp,