Commit d9aa3c1
[SW-235047] Use W8A8 path for per_channel scaling to fix a performance regression (#1629)
https://jira.habana-labs.com/browse/SW-235047

## Essential Elements of an Effective PR Description Checklist

- [X] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing a test command.
- [ ] The test results, such as pasting a before/after comparison or e2e results.

## Purpose

Previously, every HPU fp8 linear went through `hpu_ops.apply_fp8_linear_hpu`. That is unnecessary for per_channel scaling, since the W8A8 path also supports HPU, and it introduced a performance regression for WOQ models. This PR skips per_channel-scaled fp8 in `hpu_ops.apply_fp8_linear_hpu` so those cases take the W8A8 path instead.

## Test Plan

## Test Result

Signed-off-by: Chendi.Xue <[email protected]>
Parent: 93b8bad

1 file changed: +1 −1

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -473,7 +473,7 @@ def apply(self,
             use_aiter_and_is_supported=self.use_aiter_and_is_supported,
         )

-        if current_platform.is_hpu():
+        if self.block_quant and current_platform.is_hpu():
             if layer.weight_scale.dim() > 1:
                 weight_scale = layer.weight_scale.transpose(0, 1)
             else:
```
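The effect of the one-line change is easiest to see as a dispatch condition. Below is a minimal, hypothetical sketch, not vLLM's actual code: `block_quant` and `is_hpu` stand in for `self.block_quant` and `current_platform.is_hpu()` in `fp8.py`, and the returned strings merely label the two paths.

```python
def choose_fp8_path(block_quant: bool, is_hpu: bool) -> str:
    """Hypothetical helper labeling which fp8 linear path is taken.

    Before this commit, any fp8 linear on HPU took the HPU-specific path,
    including per_channel scaling, which regressed WOQ performance.
    After this commit, only block-quantized fp8 on HPU takes
    hpu_ops.apply_fp8_linear_hpu; per_channel scaling falls through to
    the generic W8A8 path, which HPU also supports.
    """
    if block_quant and is_hpu:
        return "hpu_ops.apply_fp8_linear_hpu"
    return "w8a8"


# Per_channel scaling on HPU now uses the W8A8 path:
assert choose_fp8_path(block_quant=False, is_hpu=True) == "w8a8"
# Block-quantized fp8 on HPU still uses the HPU-specific path:
assert choose_fp8_path(block_quant=True, is_hpu=True) == "hpu_ops.apply_fp8_linear_hpu"
```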
