From ce46720ec0bab71eca23e801ba3fa089ff21c582 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 29 Aug 2025 04:44:28 +0000
Subject: [PATCH 1/3] add dequant for linear

Signed-off-by: yiliu30
---
 vllm_gaudi/extension/ops.py | 13 ++++++++++++-
 vllm_gaudi/ops/hpu_fp8.py   | 12 ++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py
index 1f8970c3..e8e7879f 100644
--- a/vllm_gaudi/extension/ops.py
+++ b/vllm_gaudi/extension/ops.py
@@ -12,7 +12,7 @@
 import habana_frameworks.torch.core as htcore
 from vllm_gaudi.extension.runtime import get_config
 import habana_frameworks.torch.utils.experimental as htexp
-
+import types
 
 is_hpu_gaudi2 = htexp._get_device_type(
 ) == htexp.synDeviceType.synDeviceGaudi2
@@ -698,6 +698,14 @@ def dynamic_quant(data, single_scale = False):
         data, 1.0 / scale, False, False, torch.float8_e4m3fn)[0]
     return data_fp8, scale.float()
 
+# Chendi: Necessary base func added by INC team
+def get_dequant_weights_func(
+        self, ) -> Optional[Callable[[torch.nn.Module], torch.Tensor]]:
+    if self.quant_method is not None:
+        quant_method = self.quant_method
+        if hasattr(quant_method, "dequant_fp8_weight"):
+            return quant_method.dequant_fp8_weight
+    return None
 
 def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
     torch.hpu.synchronize()
@@ -723,6 +731,9 @@ def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
             requires_grad=False)
         htorch.core.mark_step()
         return layer
+    else:
+        # For INC path, we attach the dequant func to the layer
+        layer.get_dequant_weights_func = types.MethodType(get_dequant_weights_func, layer)
     layer.weight = torch.nn.Parameter(weight, requires_grad=False)
     orig_M = torch.nn.Parameter(torch.tensor(orig_M, dtype=torch.int32, device=weight.device),
                                 requires_grad=False)
diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py
index 6ce55479..ec825298 100644
--- a/vllm_gaudi/ops/hpu_fp8.py
+++ b/vllm_gaudi/ops/hpu_fp8.py
@@ -50,6 +50,18 @@ def apply(self,
                            bias=bias,
                            trans_B=False)
 
+    def dequant_fp8_weight(self, layer) -> torch.Tensor:
+        if hasattr(layer, "updated_fp8_weight") and layer.updated_fp8_weight:
+            return layer.weight
+        dequant_weight = hpu_ops.dequant_block_fp8_weight_naive(
+            layer.weight,
+            layer.weight_scale_inv.data,
+            self.quant_config.weight_block_size,
+            original_M=layer.orig_M,
+            original_N=layer.orig_N,
+            do_unpad=True,
+        )
+        return dequant_weight
 
 @CustomOp.register_oot(name='Fp8MoEMethod')
 class HPUFp8MoEMethod(Fp8MoEMethod):

From aab05d4fb72a59d1049381b16bdf649f9fb5100c Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 29 Aug 2025 07:26:33 +0000
Subject: [PATCH 2/3] add test

Signed-off-by: yiliu30
---
 tests/full_tests/ci_gsm8k_tests.sh | 11 +++++++++++
 tests/full_tests/ci_tests.sh       | 12 ++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/tests/full_tests/ci_gsm8k_tests.sh b/tests/full_tests/ci_gsm8k_tests.sh
index 65d36573..de536975 100644
--- a/tests/full_tests/ci_gsm8k_tests.sh
+++ b/tests/full_tests/ci_gsm8k_tests.sh
@@ -61,6 +61,17 @@ if [ $? -ne 0 ]; then
 fi
 echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"
+echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
+echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
+    HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+    python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
+    exit -1
+fi
+echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"
+
 
 
 # Chendi: commenting out dyamic scaling test, as it is only works on G3 and failed on G2
 # Don't delete them, once we have G3 CI node, we can enable it.
diff --git a/tests/full_tests/ci_tests.sh b/tests/full_tests/ci_tests.sh
index 8f46eabd..33179b41 100644
--- a/tests/full_tests/ci_tests.sh
+++ b/tests/full_tests/ci_tests.sh
@@ -39,6 +39,18 @@ if [ $? -ne 0 ]; then
 fi
 echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"
 
+
+echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
+echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
+    HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+    python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
+    exit -1
+fi
+echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"
+
 # structured output
 echo "Testing structured output"
 echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/structured_outputs.py

From 021486898766cdae5bfc2de126e6bf6e59197812 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 29 Aug 2025 08:15:59 +0000
Subject: [PATCH 3/3] fix pre-commit

Signed-off-by: yiliu30
---
 vllm_gaudi/ops/hpu_fp8.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py
index ec825298..0b939945 100644
--- a/vllm_gaudi/ops/hpu_fp8.py
+++ b/vllm_gaudi/ops/hpu_fp8.py
@@ -63,6 +63,7 @@ def dequant_fp8_weight(self, layer) -> torch.Tensor:
         )
         return dequant_weight
 
+
 @CustomOp.register_oot(name='Fp8MoEMethod')
 class HPUFp8MoEMethod(Fp8MoEMethod):
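
For reference, the hook wired up in PATCH 1/3 gives a caller a uniform way to recover full-precision weights from an FP8 block-quantized linear layer: fp8_block_linear_postprocess_weights() binds get_dequant_weights_func onto the layer, and that hook returns the quant method's dequant_fp8_weight (which undoes the block quantization via dequant_block_fp8_weight_naive) when it is available. The sketch below is illustrative only and not part of the patches; the helper name get_full_precision_weight is hypothetical, and `layer` is assumed to be a linear module that has already gone through the postprocessing above.

# Illustrative sketch, not part of the patch series: how a consumer such as
# INC might use the hook attached by fp8_block_linear_postprocess_weights().
# The helper name is hypothetical; `layer` is assumed to be a linear module
# processed by the code in PATCH 1/3.
import torch


def get_full_precision_weight(layer: torch.nn.Module) -> torch.Tensor:
    """Return dequantized weights when the layer exposes the hook, else the raw weight."""
    dequant_fn = None
    if hasattr(layer, "get_dequant_weights_func"):
        # The hook returns quant_method.dequant_fp8_weight (bound to the quant
        # method) if that method exists, otherwise None.
        dequant_fn = layer.get_dequant_weights_func()
    if dequant_fn is not None:
        # dequant_fp8_weight(self, layer) takes the layer and returns a
        # dequantized torch.Tensor.
        return dequant_fn(layer)
    return layer.weight

Binding the hook with types.MethodType keeps the INC-facing entry point on the layer itself, so calling code does not need to know which quant method produced the weights.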