11 changes: 11 additions & 0 deletions tests/full_tests/ci_gsm8k_tests.sh
@@ -61,6 +61,17 @@ if [ $? -ne 0 ]; then
fi
echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"

echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
if [ $? -ne 0 ]; then
echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
exit -1
fi
echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"

# Chendi: commenting out the dynamic scaling test, as it only works on G3 and fails on G2.
# Don't delete it; once we have a G3 CI node, we can re-enable it.

12 changes: 12 additions & 0 deletions tests/full_tests/ci_tests.sh
@@ -39,6 +39,18 @@ if [ $? -ne 0 ]; then
fi
echo "Test with deepseek_v2 + inc dynamic quantization + tp 2 successful"


echo "Testing Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant"
echo VLLM_HPU_FORCE_CHANNEL_FP8=false QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
QUANT_CONFIG=vllm-gaudi/tests/models/language/generation/inc_dynamic_quant.json VLLM_HPU_FORCE_CHANNEL_FP8=false \
HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
python -u vllm-gaudi/tests/full_tests/generate.py --model Qwen/Qwen3-8B-FP8 --trust-remote-code
if [ $? -ne 0 ]; then
echo "Error: Test failed for Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant" >&2
exit -1
fi
echo "Test with Qwen3-8B-FP8 + inc requant FP8 model + dynamic quant passed"

# structured output
echo "Testing structured output"
echo HABANA_VISIBLE_DEVICES=all VLLM_CONTIGUOUS_PA=False VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u vllm-gaudi/tests/full_tests/structured_outputs.py
13 changes: 12 additions & 1 deletion vllm_gaudi/extension/ops.py
@@ -12,7 +12,7 @@
import habana_frameworks.torch.core as htcore
from vllm_gaudi.extension.runtime import get_config
import habana_frameworks.torch.utils.experimental as htexp

import types
is_hpu_gaudi2 = htexp._get_device_type(
) == htexp.synDeviceType.synDeviceGaudi2

@@ -698,6 +698,14 @@ def dynamic_quant(data, single_scale = False):
data, 1.0 / scale, False, False, torch.float8_e4m3fn)[0]
return data_fp8, scale.float()

# Chendi: Necessary base func added by INC team
def get_dequant_weights_func(
self, ) -> Optional[Callable[[torch.nn.Module], torch.Tensor]]:
if self.quant_method is not None:
quant_method = self.quant_method
if hasattr(quant_method, "dequant_fp8_weight"):
return quant_method.dequant_fp8_weight
return None
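
# Hedged usage sketch (illustrative only; not part of this diff): once
# fp8_block_linear_postprocess_weights below attaches this hook to a layer,
# a consumer such as INC could recover the high-precision weight like so.
def _example_requant_flow(layer):
    # Only layers that went through the INC path have the hook attached.
    dequant_fn = getattr(layer, "get_dequant_weights_func", lambda: None)()
    if dequant_fn is None:
        return None
    hp_weight = dequant_fn(layer)  # calls dequant_fp8_weight(layer) under the hood
    # ...re-quantize hp_weight here with dynamically computed scales...
    return hp_weight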

def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
torch.hpu.synchronize()
@@ -723,6 +731,9 @@ def fp8_block_linear_postprocess_weights(layer, force_channel_fp8=False):
requires_grad=False)
htorch.core.mark_step()
return layer
else:
# For INC path, we attach the dequant func to the layer
layer.get_dequant_weights_func = types.MethodType(get_dequant_weights_func, layer)

layer.weight = torch.nn.Parameter(weight, requires_grad=False)
orig_M = torch.nn.Parameter(torch.tensor(orig_M, dtype=torch.int32, device=weight.device), requires_grad=False)
13 changes: 13 additions & 0 deletions vllm_gaudi/ops/hpu_fp8.py
@@ -50,6 +50,19 @@ def apply(self,
bias=bias,
trans_B=False)

def dequant_fp8_weight(self, layer) -> torch.Tensor:
if hasattr(layer, "updated_fp8_weight") and layer.updated_fp8_weight:
return layer.weight
dequant_weight = hpu_ops.dequant_block_fp8_weight_naive(
layer.weight,
layer.weight_scale_inv.data,
self.quant_config.weight_block_size,
original_M=layer.orig_M,
original_N=layer.orig_N,
do_unpad=True,
)
return dequant_weight
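
# Hedged sketch (illustrative only; not the helper used above): what block-wise
# FP8 dequantization amounts to, assuming weight_scale_inv stores one dequant
# scale per (block_m x block_n) tile. The real dequant_block_fp8_weight_naive
# additionally unpads the result back to original_M x original_N.
def _dequant_block_fp8_sketch(w_fp8: torch.Tensor,
                              scale_inv: torch.Tensor,
                              block_size=(128, 128),
                              out_dtype=torch.bfloat16) -> torch.Tensor:
    bm, bn = block_size
    # Broadcast each per-tile scale over its bm x bn tile, crop to the
    # (possibly padded) weight shape, then multiply elementwise.
    scales = scale_inv.repeat_interleave(bm, dim=0).repeat_interleave(bn, dim=1)
    scales = scales[:w_fp8.shape[0], :w_fp8.shape[1]]
    return w_fp8.to(out_dtype) * scales.to(out_dtype)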


@CustomOp.register_oot(name='Fp8MoEMethod')
class HPUFp8MoEMethod(Fp8MoEMethod):