@@ -1076,6 +1076,21 @@ def test_auto_dtype_vswa_without_reuse(self):
10761076 task = MMLU (self .MODEL_NAME )
10771077 task .evaluate (llm )
10781078
def test_auto_dtype_vswa_without_reuse_low_memory_available(self):
    """Evaluate GSM8K and MMLU with a VSWA KV cache, reuse off, low memory.

    Both block reuse and partial reuse are disabled, and
    ``free_gpu_memory_fraction`` is set to 0.1, which is the "low memory
    available" condition referred to by the test name.
    """
    # NOTE: VSWA kv cache config -- the attention window is given as a list
    # of mixed sizes (five entries of 512 followed by one of 32768).
    attention_windows = [512, 512, 512, 512, 512, 32768]
    cache_cfg = KvCacheConfig(
        enable_block_reuse=False,
        enable_partial_reuse=False,
        max_attention_window=attention_windows,
        free_gpu_memory_fraction=0.1,
    )

    with LLM(self.MODEL_PATH, kv_cache_config=cache_cfg) as model:
        # GSM8K is evaluated first, then MMLU, on the same LLM instance.
        for benchmark in (GSM8K, MMLU):
            benchmark(self.MODEL_NAME).evaluate(model)
1093+
10791094 def test_auto_dtype_vswa_reuse (self ):
10801095 # NOTE: Test with VSWA kv cache config.
10811096 kv_cache_config = KvCacheConfig (
@@ -1089,6 +1104,54 @@ def test_auto_dtype_vswa_reuse(self):
10891104 task = MMLU (self .MODEL_NAME )
10901105 task .evaluate (llm )
10911106
def test_auto_dtype_vswa_reuse_partial_reuse(self):
    """Evaluate GSM8K and MMLU with a VSWA KV cache and full reuse enabled.

    Block reuse and partial reuse are both switched on; no
    ``free_gpu_memory_fraction`` is passed, so the config's own default
    applies.
    """
    # NOTE: VSWA kv cache config -- five window entries of 512 and a final
    # entry of 32768.
    vswa_windows = [512] * 5 + [32768]
    reuse_cfg = KvCacheConfig(
        enable_block_reuse=True,
        enable_partial_reuse=True,
        max_attention_window=vswa_windows,
    )

    with LLM(self.MODEL_PATH, kv_cache_config=reuse_cfg) as engine:
        # Run the two accuracy benchmarks back to back on one LLM instance.
        gsm8k_task = GSM8K(self.MODEL_NAME)
        gsm8k_task.evaluate(engine)
        mmlu_task = MMLU(self.MODEL_NAME)
        mmlu_task.evaluate(engine)
1120+
def test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse(self):
    """Evaluate GSM8K and MMLU with VSWA, block reuse only, low memory.

    Block reuse is enabled while partial reuse is disabled, and
    ``free_gpu_memory_fraction`` is set to 0.1 (the "low memory available"
    condition in the test name).
    """
    # NOTE: VSWA kv cache config, collected as keyword arguments so the
    # individual knobs under test are easy to scan.
    cache_options = {
        "enable_block_reuse": True,
        "enable_partial_reuse": False,
        "max_attention_window": [512, 512, 512, 512, 512, 32768],
        "free_gpu_memory_fraction": 0.1,
    }
    low_mem_cfg = KvCacheConfig(**cache_options)

    with LLM(self.MODEL_PATH, kv_cache_config=low_mem_cfg) as model:
        # GSM8K is evaluated first, then MMLU, on the same LLM instance.
        for task_cls in (GSM8K, MMLU):
            accuracy_task = task_cls(self.MODEL_NAME)
            accuracy_task.evaluate(model)
1135+
@pytest.mark.skip(
    reason="Currently failing due to accuracy drop, https://nvbugspro.nvidia.com/bug/5625990"
)
def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
    """Evaluate GSM8K and MMLU with VSWA, full reuse, low memory.

    Block reuse and partial reuse are both enabled and
    ``free_gpu_memory_fraction`` is set to 0.1. The test is currently
    skipped; the skip reason cites an accuracy drop and a tracking bug.
    """
    # NOTE: VSWA kv cache config -- five window entries of 512 and a final
    # entry of 32768.
    window_sizes = [512] * 5 + [32768]
    cfg = KvCacheConfig(
        enable_block_reuse=True,
        enable_partial_reuse=True,
        max_attention_window=window_sizes,
        free_gpu_memory_fraction=0.1,
    )

    with LLM(self.MODEL_PATH, kv_cache_config=cfg) as engine:
        # Run the two accuracy benchmarks in order on one LLM instance.
        GSM8K(self.MODEL_NAME).evaluate(engine)
        MMLU(self.MODEL_NAME).evaluate(engine)
1154+
10921155 def test_auto_dtype_vswa_chunked_prefill_without_reuse (self ):
10931156 # NOTE: Test with VSWA kv cache config.
10941157 kv_cache_config = KvCacheConfig (
0 commit comments