11 changes: 11 additions & 0 deletions tests/full_tests/ci_gsm8k_tests.sh
@@ -61,6 +61,17 @@ if [ $? -ne 0 ]; then
fi
echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"

echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
if [ $? -ne 0 ]; then
echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
exit -1
fi
echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"

# Chendi: commenting out the dynamic scaling test, as it only works on G3 and fails on G2.
# Don't delete it; once we have a G3 CI node, we can re-enable it.

12 changes: 12 additions & 0 deletions tests/full_tests/ci_tests.sh
@@ -39,6 +39,18 @@ if [ $? -ne 0 ]; then
fi
echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"


echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
if [ $? -ne 0 ]; then
echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
exit -1
fi
echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"

# structured output
echo "Testing structured output"
echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/structured_outputs.py
13 changes: 12 additions & 1 deletion vllm_gaudi/extension/ops.py
@@ -12,7 +12,7 @@
import habana_frameworks.torch.core as htcore
from vllm_gaudi.extension.runtime import get_config
import habana_frameworks.torch.utils.experimental as htexp

import types
is_hpu_gaudi2 = htexp._get_device_type(
) == htexp.synDeviceType.synDeviceGaudi2

@@ -698,6 +698,14 @@ def dynamic_quant(data, single_scale = False):
data, 1.0 / scale, False, False, torch.float8_e4m3fn)[0]
return data_fp8, scale.float()

# Chendi: Necessary base func added by INC team
def get_dequant_weights_func(
self, ) -> Optional[Callable[[torch.nn.Module], torch.Tensor]]:
if self.quant_method is not None:
quant_method = self.quant_method
if hasattr(quant_method, "dequant_fp8_weight"):
return quant_method.dequant_fp8_weight
return None
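
# Hedged usage sketch (illustrative only; not part of this diff): once
# fp8_block_linear_postprocess_weights below attaches this hook to a layer,
# a consumer such as INC could recover the high-precision weight like so.
def _example_requant_flow(layer):
    # Only layers that went through the INC path have the hook attached.
    dequant_fn = getattr(layer, "get_dequant_weights_func", lambda: None)()
    if dequant_fn is None:
        return None
    hp_weight = dequant_fn(layer)  # calls dequant_fp8_weight(layer) under the hood
    # ...re-quantize hp_weight here with dynamically computed scales...
    return hp_weight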

def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
torch.hpu.synchronize()
@@ -723,6 +731,9 @@ def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
requires_grad=False)
htorch.core.mark_step()
return layer
else:
# For INC path, we attach the dequant func to the layer
layer.get_dequant_weights_func = types.MethodType(get_dequant_weights_func, layer)

layer.weight = torch.nn.Parameter(weight, requires_grad=False)
orig_M = torch.nn.Parameter(torch.tensor(orig_M, dtype=torch.int32, device=weight.device), requires_grad=False)
13 changes: 13 additions & 0 deletions vllm_gaudi/ops/hpu_fp8.py
@@ -50,6 +50,19 @@ def apply(self,
bias=bias,
trans_B=False)

def dequant_fp8_weight(self, layer) -> torch.Tensor:
if hasattr(layer, "updated_fp8_weight") and layer.updated_fp8_weight:
return layer.weight
dequant_weight = hpu_ops.dequant_block_fp8_weight_naive(
layer.weight,
layer.weight_scale_inv.data,
self.quant_config.weight_block_size,
original_M=layer.orig_M,
original_N=layer.orig_N,
do_unpad=True,
)
return dequant_weight
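
# Hedged sketch (illustrative only; not the helper used above): what block-wise
# FP8 dequantization amounts to, assuming weight_scale_inv stores one dequant
# scale per (block_m x block_n) tile. The real dequant_block_fp8_weight_naive
# additionally unpads the result back to original_M x original_N.
def _dequant_block_fp8_sketch(w_fp8: torch.Tensor,
                              scale_inv: torch.Tensor,
                              block_size=(128, 128),
                              out_dtype=torch.bfloat16) -> torch.Tensor:
    bm, bn = block_size
    # Broadcast each per-tile scale over its bm x bn tile, crop to the
    # (possibly padded) weight shape, then multiply elementwise.
    scales = scale_inv.repeat_interleave(bm, dim=0).repeat_interleave(bn, dim=1)
    scales = scales[:w_fp8.shape[0], :w_fp8.shape[1]]
    return w_fp8.to(out_dtype) * scales.to(out_dtype)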


@CustomOp.register_oot(name='Fp8MoEMethod')
class HPUFp8MoEMethod(Fp8MoEMethod):