From ce46720ec0bab71eca23e801ba3fa089ff21c582 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 29 Aug 2025 04:44:28 +0000
Subject: [PATCH 1/3] add dequant for linear

Signed-off-by: yiliu30
---
 vllm_gaudi/extension/ops.py | 13 ++++++++++++-
 vllm_gaudi/ops/hpu_fp8.py   | 12 ++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/extension/ops.py b/vllm_gaudi/extension/ops.py
index 1f8970c3..e8e7879f 100644
--- a/vllm_gaudi/extension/ops.py
+++ b/vllm_gaudi/extension/ops.py
@@ -12,7 +12,7 @@
 import habana_frameworks.torch.core as htcore
 from vllm_gaudi.extension.runtime import get_config
 import habana_frameworks.torch.utils.experimental as htexp
-
+import types
 
 is_hpu_gaudi2 = htexp._get_device_type(
 ) == htexp.synDeviceType.synDeviceGaudi2
@@ -698,6 +698,14 @@ def dynamic_quant(data, single_scale = False):
         data, 1.0 / scale, False, False, torch.float8_e4m3fn)[0]
     return data_fp8, scale.float()
 
+# Chendi: Necessary base func added by INC team
+def get_dequant_weights_func(
+        self, ) -> Optional[Callable[[torch.nn.Module], torch.Tensor]]:
+    if self.quant_method is not None:
+        quant_method = self.quant_method
+        if hasattr(quant_method, "dequant_fp8_weight"):
+            return quant_method.dequant_fp8_weight
+    return None
 
 def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
     torch.hpu.synchronize()
@@ -723,6 +731,9 @@ def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
             requires_grad=False)
         htorch.core.mark_step()
         return layer
+    else:
+        # For INC path, we attach the dequant func to the layer
+        layer.get_dequant_weights_func = types.MethodType(get_dequant_weights_func, layer)
     layer.weight = torch.nn.Parameter(weight, requires_grad=False)
     orig_M = torch.nn.Parameter(torch.tensor(orig_M, dtype=torch.int32, device=weight.device),
                                 requires_grad=False)
diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py
index 6ce55479..ec825298 100644
--- a/vllm_gaudi/ops/hpu_fp8.py
+++ b/vllm_gaudi/ops/hpu_fp8.py
@@ -50,6 +50,18 @@ def apply(self,
                            bias=bias,
                            trans_B=False)
 
+    def dequant_fp8_weight(self, layer) -> torch.Tensor:
+        if hasattr(layer, "updated_fp8_weight") and layer.updated_fp8_weight:
+            return layer.weight
+        dequant_weight = hpu_ops.dequant_block_fp8_weight_naive(
+            layer.weight,
+            layer.weight_scale_inv.data,
+            self.quant_config.weight_block_size,
+            original_M=layer.orig_M,
+            original_N=layer.orig_N,
+            do_unpad=True,
+        )
+        return dequant_weight
 
 @CustomOp.register_oot(name='Fp8MoEMethod')
 class HPUFp8MoEMethod(Fp8MoEMethod):

From aab05d4fb72a59d1049381b16bdf649f9fb5100c Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 29 Aug 2025 07:26:33 +0000
Subject: [PATCH 2/3] add test

Signed-off-by: yiliu30
---
 tests/full_tests/ci_gsm8k_tests.sh | 11 +++++++++++
 tests/full_tests/ci_tests.sh       | 12 ++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/tests/full_tests/ci_gsm8k_tests.sh b/tests/full_tests/ci_gsm8k_tests.sh
index 65d36573..de536975 100644
--- a/tests/full_tests/ci_gsm8k_tests.sh
+++ b/tests/full_tests/ci_gsm8k_tests.sh
@@ -61,6 +61,17 @@ if [ $? -ne 0 ]; then
 fi
 echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"
+echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
+echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
+    HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+    python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
+    exit -1
+fi
+echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"
+
 
 
 # Chendi: commenting out dyamic scaling test, as it is only works on G3 and failed on G2
 # Don't delete them, once we have G3 CI node, we can enable it.
diff --git a/tests/full_tests/ci_tests.sh b/tests/full_tests/ci_tests.sh
index 8f46eabd..33179b41 100644
--- a/tests/full_tests/ci_tests.sh
+++ b/tests/full_tests/ci_tests.sh
@@ -39,6 +39,18 @@ if [ $? -ne 0 ]; then
 fi
 echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"
 
+
+echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
+echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
+    HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+    python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
+if [ $? -ne 0 ]; then
+    echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
+    exit -1
+fi
+echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"
+
 # structured output
 echo "Testing structured output"
 echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/structured_outputs.py

From 021486898766cdae5bfc2de126e6bf6e59197812 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 29 Aug 2025 08:15:59 +0000
Subject: [PATCH 3/3] fix pre-commit

Signed-off-by: yiliu30
---
 vllm_gaudi/ops/hpu_fp8.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_gaudi/ops/hpu_fp8.py b/vllm_gaudi/ops/hpu_fp8.py
index ec825298..0b939945 100644
--- a/vllm_gaudi/ops/hpu_fp8.py
+++ b/vllm_gaudi/ops/hpu_fp8.py
@@ -63,6 +63,7 @@ def dequant_fp8_weight(self, layer) -> torch.Tensor:
         )
         return dequant_weight
 
+
 @CustomOp.register_oot(name='Fp8MoEMethod')
 class HPUFp8MoEMethod(Fp8MoEMethod):
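
For reference, the hook wired up in PATCH 1/3 gives a caller a uniform way to recover full-precision weights from an FP8 block-quantized linear layer: fp8_block_linear_postprocess_weights() binds get_dequant_weights_func onto the layer, and that hook returns the quant method's dequant_fp8_weight (which undoes the block quantization via dequant_block_fp8_weight_naive) when it is available. The sketch below is illustrative only and not part of the patches; the helper name get_full_precision_weight is hypothetical, and `layer` is assumed to be a linear module that has already gone through the postprocessing above.

# Illustrative sketch, not part of the patch series: how a consumer such as
# INC might use the hook attached by fp8_block_linear_postprocess_weights().
# The helper name is hypothetical; `layer` is assumed to be a linear module
# processed by the code in PATCH 1/3.
import torch


def get_full_precision_weight(layer: torch.nn.Module) -> torch.Tensor:
    """Return dequantized weights when the layer exposes the hook, else the raw weight."""
    dequant_fn = None
    if hasattr(layer, "get_dequant_weights_func"):
        # The hook returns quant_method.dequant_fp8_weight (bound to the quant
        # method) if that method exists, otherwise None.
        dequant_fn = layer.get_dequant_weights_func()
    if dequant_fn is not None:
        # dequant_fp8_weight(self, layer) takes the layer and returns a
        # dequantized torch.Tensor.
        return dequant_fn(layer)
    return layer.weight

Binding the hook with types.MethodType keeps the INC-facing entry point on the layer itself, so calling code does not need to know which quant method produced the weights.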