
Commit 55cb5aa

Restore support for kv_cache_dtype (#40)
vllm-project/vllm#21302 got merged, so kv_cache_dtype can be re-enabled now.

Signed-off-by: Konrad Zawora <[email protected]>

1 parent c894862 · commit 55cb5aa

2 files changed (+22, -18 lines)


tests/full_tests/ci_gsm8k_tests.sh

Lines changed: 18 additions & 18 deletions
@@ -29,26 +29,26 @@ fi
 echo "Test with deepseek v2 lite passed"
 
 # granite + inc
-#echo "Testing granite-8b + inc with vllm-hpu plugin v1"
-#echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
-#QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
-#HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
-#if [ $? -ne 0 ]; then
-#    echo "Error: Test failed for granite + inc" >&2
-#    exit -1
-#fi
-#echo "Test with granite + inc passed"
+echo "Testing granite-8b + inc with vllm-hpu plugin v1"
+echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
+HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model ibm-granite/granite-3.3-2b-instruct --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for granite + inc" >&2
+    exit -1
+fi
+echo "Test with granite + inc passed"
 
 # deepseek v2 + inc
-#echo "Testing deepseek_v2 + inc with vllm-hpu plugin v1"
-#echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
-#QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
-#HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
-#if [ $? -ne 0 ]; then
-#    echo "Error: Test failed for deepseek_v2 + inc" >&2
-#    exit -1
-#fi
-#echo "Test with deepseek_v2 + inc passed"
+echo "Testing deepseek_v2 + inc with vllm-hpu plugin v1"
+echo QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_unit_scale_quant.json \
+HABANA_VISIBLE_DEVICES=all VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model deepseek-ai/DeepSeek-V2-Lite-Chat --trust-remote-code --quantization inc --kv_cache_dtype fp8_inc
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for deepseek_v2 + inc" >&2
+    exit -1
+fi
+echo "Test with deepseek_v2 + inc passed"
 
 # gsm8k test
 # used to check HPUattn + MLP
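The re-enabled tests drive generate.py with --quantization inc and --kv_cache_dtype fp8_inc under an INC unit-scale quantization config. As a rough illustration only, and assuming generate.py simply forwards these flags to vLLM's offline LLM API (an assumption, not shown in this diff), the equivalent Python call would look roughly like:

# Minimal sketch of an offline-inference call with an INC-quantized FP8 KV cache.
# Assumes the environment variables from the CI script (QUANT_CONFIG, VLLM_SKIP_WARMUP,
# PT_HPU_LAZY_MODE, VLLM_USE_V1, ...) are already exported; the model mirrors the CI test.
from vllm import LLM, SamplingParams

llm = LLM(
    model="ibm-granite/granite-3.3-2b-instruct",
    trust_remote_code=True,
    quantization="inc",        # Intel Neural Compressor quantization, as in the CI flags
    kv_cache_dtype="fp8_inc",  # the KV-cache dtype this commit re-enables
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)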

vllm_gaudi/platform.py

Lines changed: 4 additions & 0 deletions
@@ -149,6 +149,10 @@ def set_torch_compile(cls) -> None:
         # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
         os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
 
+    @classmethod
+    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool:
+        return kv_cache_dtype == "fp8_inc"
+
     @classmethod
     def set_synchronized_weight_loader(cls) -> None:
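The new classmethod advertises which explicit KV-cache dtypes the Gaudi platform accepts (only "fp8_inc"). Below is a standalone sketch of the kind of capability check a caller could perform before engine startup; HpuPlatformStub and validate_kv_cache_dtype are hypothetical names used only for this illustration, not the actual vLLM call site.

# Standalone illustration of gating kv_cache_dtype on a platform capability hook.
class HpuPlatformStub:
    @classmethod
    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str) -> bool:
        # Mirrors the method added in this commit: only the INC FP8 KV cache is allowed.
        return kv_cache_dtype == "fp8_inc"

def validate_kv_cache_dtype(platform, kv_cache_dtype: str) -> None:
    # "auto" defers to the model dtype, so only explicit overrides are checked here.
    if kv_cache_dtype != "auto" and not platform.is_kv_cache_dtype_supported(kv_cache_dtype):
        raise ValueError(f"kv_cache_dtype={kv_cache_dtype!r} is not supported on this platform")

validate_kv_cache_dtype(HpuPlatformStub, "fp8_inc")    # passes
# validate_kv_cache_dtype(HpuPlatformStub, "fp8_e5m2") # would raise ValueError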
