
Commit 55cb5aa

Restore support for kv_cache_dtype (#40)
vllm-project/vllm#21302 got merged, so kv_cache_dtype can be re-enabled now.

Signed-off-by: Konrad Zawora <[email protected]>

1 parent c894862 · commit 55cb5aa

2 files changed (+22, -18 lines)


tests/full_tests/ci_gsm8k_tests.sh

Lines changed: 18 additions & 18 deletions
@@ -29,26 +29,26 @@ fi
 echo "Test with deepseek v2 lite passed"
 
 # granite + inc
-#echo "Testing granite-8b + inc with vllm-hpu plugin v1"
-#echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
-#QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
-#HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
-#if [ $? -ne 0 ]; then
-#    echo "Error: Test failed for granite + inc" >&2
-#    exit -1
-#fi
-#echo "Test with granite + inc passed"
+echo "Testing granite-8b + inc with vllm-hpu plugin v1"
+echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
+HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for granite + inc" >&2
+    exit -1
+fi
+echo "Test with granite + inc passed"
 
 # deepseek v2 + inc
-#echo "Testing deepseek_v2 + inc with vllm-hpu plugin v1"
-#echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
-#QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
-#HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
-#if [ $? -ne 0 ]; then
-#    echo "Error: Test failed for deepseek_v2 + inc" >&2
-#    exit -1
-#fi
-#echo "Test with deepseek_v2 + inc passed"
+echo "Testing deepseek_v2 + inc with vllm-hpu plugin v1"
+echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
+HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for deepseek_v2 + inc" >&2
+    exit -1
+fi
+echo "Test with deepseek_v2 + inc passed"
 
 # gsm8k test
 # used to check HPUattn + MLP
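The re-enabled tests drive generate.py with --quantization inc and --kv_cache_dtype fp8_inc under an INC unit-scale quantization config. As a rough illustration only, and assuming generate.py simply forwards these flags to vLLM's offline LLM API (an assumption, not shown in this diff), the equivalent Python call would look roughly like:

# Minimal sketch of an offline-inference call with an INC-quantized FP8 KV cache.
# Assumes the environment variables from the CI script (QUANT_CONFIG, VLLM_SKIP_WARMUP,
# PT_HPU_LAZY_MODE, VLLM_USE_V1, ...) are already exported; the model mirrors the CI test.
from vllm import LLM, SamplingParams

llm = LLM(
    model="ibm-granite/granite-3.3-2b-instruct",
    trust_remote_code=True,
    quantization="inc",        # Intel Neural Compressor quantization, as in the CI flags
    kv_cache_dtype="fp8_inc",  # the KV-cache dtype this commit re-enables
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)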

vllm_gaudi/platform.py

Lines changed: 4 additions & 0 deletions
@@ -149,6 +149,10 @@ def set_torch_compile(cls) -> None:
         # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
         os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
 
+    @classmethod
+    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool:
+        return kv_cache_dtype == "fp8_inc"
+
     @classmethod
     def set_synchronized_weight_loader(cls) -> None:
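The new classmethod advertises which explicit KV-cache dtypes the Gaudi platform accepts (only "fp8_inc"). Below is a standalone sketch of the kind of capability check a caller could perform before engine startup; HpuPlatformStub and validate_kv_cache_dtype are hypothetical names used only for this illustration, not the actual vLLM call site.

# Standalone illustration of gating kv_cache_dtype on a platform capability hook.
class HpuPlatformStub:
    @classmethod
    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool:
        # Mirrors the method added in this commit: only the INC FP8 KV cache is allowed.
        return kv_cache_dtype == "fp8_inc"

def validate_kv_cache_dtype(platform, kv_cache_dtype: str) -> None:
    # "auto" defers to the model dtype, so only explicit overrides are checked here.
    if kv_cache_dtype != "auto" and not platform.is_kv_cache_dtype_supported(kv_cache_dtype):
        raise ValueError(f"kv_cache_dtype={kv_cache_dtype!r} is not supported on this platform")

validate_kv_cache_dtype(HpuPlatformStub, "fp8_inc")    # passes
# validate_kv_cache_dtype(HpuPlatformStub, "fp8_e5m2") # would raise ValueError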
