Skip to content

Commit bd1c9c0

Browse files
authored
[https://nvbugs/5625990][chore] Add test coverage for a current limitation of the KV cache manager (NVIDIA#8829)
Signed-off-by: eopXD <[email protected]>
1 parent 67208f1 commit bd1c9c0

File tree

2 files changed

+67
-0
lines changed

2 files changed

+67
-0
lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,6 +1076,21 @@ def test_auto_dtype_vswa_without_reuse(self):
10761076
task = MMLU(self.MODEL_NAME)
10771077
task.evaluate(llm)
10781078

1079+
def test_auto_dtype_vswa_without_reuse_low_memory_available(self):
1080+
# NOTE: Test with VSWA kv cache config.
1081+
kv_cache_config = KvCacheConfig(
1082+
enable_block_reuse=False,
1083+
enable_partial_reuse=False,
1084+
max_attention_window=[512, 512, 512, 512, 512, 32768],
1085+
free_gpu_memory_fraction=0.1,
1086+
)
1087+
1088+
with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
1089+
task = GSM8K(self.MODEL_NAME)
1090+
task.evaluate(llm)
1091+
task = MMLU(self.MODEL_NAME)
1092+
task.evaluate(llm)
1093+
10791094
def test_auto_dtype_vswa_reuse(self):
10801095
# NOTE: Test with VSWA kv cache config.
10811096
kv_cache_config = KvCacheConfig(
@@ -1089,6 +1104,54 @@ def test_auto_dtype_vswa_reuse(self):
10891104
task = MMLU(self.MODEL_NAME)
10901105
task.evaluate(llm)
10911106

1107+
def test_auto_dtype_vswa_reuse_partial_reuse(self):
1108+
# NOTE: Test with VSWA kv cache config.
1109+
kv_cache_config = KvCacheConfig(
1110+
enable_block_reuse=True,
1111+
enable_partial_reuse=True,
1112+
max_attention_window=[512, 512, 512, 512, 512, 32768],
1113+
)
1114+
1115+
with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
1116+
task = GSM8K(self.MODEL_NAME)
1117+
task.evaluate(llm)
1118+
task = MMLU(self.MODEL_NAME)
1119+
task.evaluate(llm)
1120+
1121+
def test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse(self):
1122+
# NOTE: Test with VSWA kv cache config.
1123+
kv_cache_config = KvCacheConfig(
1124+
enable_block_reuse=True,
1125+
enable_partial_reuse=False,
1126+
max_attention_window=[512, 512, 512, 512, 512, 32768],
1127+
free_gpu_memory_fraction=0.1,
1128+
)
1129+
1130+
with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
1131+
task = GSM8K(self.MODEL_NAME)
1132+
task.evaluate(llm)
1133+
task = MMLU(self.MODEL_NAME)
1134+
task.evaluate(llm)
1135+
1136+
@pytest.mark.skip(
1137+
reason=
1138+
"Currently failing due to accuracy drop, https://nvbugspro.nvidia.com/bug/5625990"
1139+
)
1140+
def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
1141+
# NOTE: Test with VSWA kv cache config.
1142+
kv_cache_config = KvCacheConfig(
1143+
enable_block_reuse=True,
1144+
enable_partial_reuse=True,
1145+
max_attention_window=[512, 512, 512, 512, 512, 32768],
1146+
free_gpu_memory_fraction=0.1,
1147+
)
1148+
1149+
with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
1150+
task = GSM8K(self.MODEL_NAME)
1151+
task.evaluate(llm)
1152+
task = MMLU(self.MODEL_NAME)
1153+
task.evaluate(llm)
1154+
10921155
def test_auto_dtype_vswa_chunked_prefill_without_reuse(self):
10931156
# NOTE: Test with VSWA kv cache config.
10941157
kv_cache_config = KvCacheConfig(

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ l0_h100:
3737
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse
3838
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_chunked_prefill_without_reuse
3939
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_chunked_prefill_reuse
40+
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_without_reuse_low_memory_available
41+
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_partial_reuse
42+
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse
43+
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse
4044
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=False]
4145
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=True]
4246
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] TIMEOUT (90)

0 commit comments

Comments
 (0)