diff --git a/tests/full_tests/ci_gsm8k_tests.sh b/tests/full_tests/ci_gsm8k_tests.sh
index 65d36573..de536975 100644
--- a/tests/full_tests/ci_gsm8k_tests.sh
+++ b/tests/full_tests/ci_gsm8k_tests.sh
@@ -61,6 +61,17 @@ if [ $? -ne 0 ]; then
 fi
 echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"
 
+echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
+echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
+    HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+    python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
+    exit -1
+fi
+echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"
+
 # Chendi: commenting out dyamic scaling test, as it is only works on G3 and failed on G2
 # Don't delete them, once we have G3 CI node, we can enable it.
 
diff --git a/tests/full_tests/ci_tests.sh b/tests/full_tests/ci_tests.sh
index 8f46eabd..33179b41 100644
--- a/tests/full_tests/ci_tests.sh
+++ b/tests/full_tests/ci_tests.sh
@@ -39,6 +39,18 @@ if [ $? -ne 0 ]; then
 fi
 echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"
+
+echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
+echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
+    HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+    python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
+    exit -1
+fi
+echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"
+
 # structured output
 echo "Testing structured output"
 echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/structured_outputs.py
diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py
index 1f8970c3..e8e7879f 100644
--- a/vllm_gaudi/extension/ops.py
+++ b/vllm_gaudi/extension/ops.py
@@ -12,7 +12,7 @@ import habana_frameworks.torch.core as htcore
 from vllm_gaudi.extension.runtime import get_config
 import habana_frameworks.torch.utils.experimental as htexp
-
+import types
 
 is_hpu_gaudi2 = htexp._get_device_type(
 ) == htexp.synDeviceType.synDeviceGaudi2
 
@@ -698,6 +698,14 @@ def dynamic_quant(data, single_scale = False):
         data, 1.0 / scale, False, False, torch.float8_e4m3fn)[0]
     return data_fp8, scale.float()
 
+# Chendi: Necessary base func added by INC team
+def get_dequant_weights_func(
+        self, ) -> Optional[Callable[[torch.nn.Module], torch.Tensor]]:
+    if self.quant_method is not None:
+        quant_method = self.quant_method
+        if hasattr(quant_method, "dequant_fp8_weight"):
+            return quant_method.dequant_fp8_weight
+    return None
 
 def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
     torch.hpu.synchronize()
@@ -723,6 +731,9 @@ def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
                                           requires_grad=False)
         htorch.core.mark_step()
         return layer
+    else:
+        # For INC path, we attach the dequant func to the layer
+        layer.get_dequant_weights_func = types.MethodType(get_dequant_weights_func, layer)
     layer.weight = torch.nn.Parameter(weight, requires_grad=False)
     orig_M = torch.nn.Parameter(torch.tensor(orig_M, dtype=torch.int32, device=weight.device),
                                 requires_grad=False)
diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py
index 6ce55479..0b939945 100644
--- a/vllm_gaudi/ops/hpu_fp8.py
+++ b/vllm_gaudi/ops/hpu_fp8.py
@@ -50,6 +50,19 @@ def apply(self,
                                bias=bias,
                                trans_B=False)
 
+    def dequant_fp8_weight(self, layer) -> torch.Tensor:
+        if hasattr(layer, "updated_fp8_weight") and layer.updated_fp8_weight:
+            return layer.weight
+        dequant_weight = hpu_ops.dequant_block_fp8_weight_naive(
+            layer.weight,
+            layer.weight_scale_inv.data,
+            self.quant_config.weight_block_size,
+            original_M=layer.orig_M,
+            original_N=layer.orig_N,
+            do_unpad=True,
+        )
+        return dequant_weight
+
 
 @CustomOp.register_oot(name='Fp8MoEMethod')
 class HPUFp8MoEMethod(Fp8MoEMethod):
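For context, the ops.py change binds `get_dequant_weights_func` to the layer instance with `types.MethodType`, so a caller that only holds the layer can discover a dequantization hook and retrieve the quant method's `dequant_fp8_weight`. Below is a minimal, self-contained sketch of that pattern; `FakeBlockFp8LinearMethod`, the `weight_scale` attribute, and the toy dequantization are illustrative stand-ins, not the vllm-gaudi or INC implementation.

```python
import types
from typing import Callable, Optional

import torch


class FakeBlockFp8LinearMethod:
    """Hypothetical stand-in for the HPU FP8 linear quant method."""

    def dequant_fp8_weight(self, layer: torch.nn.Module) -> torch.Tensor:
        # Toy "dequantization": just rescale the stored weight.
        return layer.weight * layer.weight_scale


def get_dequant_weights_func(
        self) -> Optional[Callable[[torch.nn.Module], torch.Tensor]]:
    # Same shape as the helper added in ops.py: expose the quant method's
    # dequant routine when it exists, otherwise return None.
    if self.quant_method is not None and hasattr(self.quant_method,
                                                 "dequant_fp8_weight"):
        return self.quant_method.dequant_fp8_weight
    return None


layer = torch.nn.Linear(4, 4, bias=False)
layer.quant_method = FakeBlockFp8LinearMethod()  # illustrative attribute
layer.weight_scale = 2.0                         # illustrative attribute

# Bind the accessor to this specific layer instance, mirroring the INC path
# in fp8_block_linear_postprocess_weights().
layer.get_dequant_weights_func = types.MethodType(get_dequant_weights_func, layer)

# A consumer holding only the layer can now probe for the hook and call it.
fn = layer.get_dequant_weights_func()
if fn is not None:
    print(fn(layer).shape)  # torch.Size([4, 4])
```

Binding the accessor per instance keeps the hook off layers that never go through the block-FP8 postprocess path, so a consumer can simply probe for the attribute instead of special-casing layer types.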