Commit a92e2be

fix(quant): fix logic for scale_bias dim
1 parent a782971 commit a92e2be

3 files changed: +12 -25 lines changed


vllm_ascend/quantization/quant_config.py

Lines changed: 4 additions & 7 deletions
```diff
@@ -288,13 +288,10 @@ def create_weights(
         layer.register_parameter(perchannel_name, param)
         set_weight_attrs(param, extra_weight_attrs)
 
-        layer_type = None
-        if isinstance(layer, RowParallelLinear):
-            # down_proj, o_proj
-            layer_type = "row"
-        else:
-            # gate_up_proj, qkv_proj (ColumnParallel or MergedColumnParallel)
-            layer_type = "column"
+        # NOTE: In the w4a8 quantization implementation, the scale_bias shape
+        # for down_proj and o_proj is [output_size, 16]; for all other layers
+        # it is [output_size, 1].
+        layer_type = "row" if isinstance(layer, RowParallelLinear) else "others"
 
         pergroup_dict = self.quant_method.get_pergroup_param(
             input_size_per_partition, output_size_per_partition, params_dtype,
```
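To see the new dispatch in isolation, here is a minimal runnable sketch; the stub classes below are hypothetical stand-ins for vLLM's parallel linear layers, and only the `isinstance` check mirrors the committed line:

```python
# Hypothetical stubs standing in for vLLM's parallel linear layers.
class RowParallelLinear: ...        # down_proj, o_proj
class ColumnParallelLinear: ...     # gate_up_proj, qkv_proj
class ReplicatedLinear: ...         # any other linear layer

def classify(layer) -> str:
    # Only row-parallel layers get the wide [output_size, 16] scale_bias;
    # every other layer type falls into the "others" bucket.
    return "row" if isinstance(layer, RowParallelLinear) else "others"

assert classify(RowParallelLinear()) == "row"
assert classify(ColumnParallelLinear()) == "others"
assert classify(ReplicatedLinear()) == "others"
```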

vllm_ascend/quantization/w4a8_dynamic.py

Lines changed: 7 additions & 13 deletions
```diff
@@ -93,7 +93,9 @@ def get_pergroup_param(self, input_size: int, output_size: int,
             input_size: input dimension size
             output_size: output dimension size
             params_dtype: parameter data type
-            layer_type: layer type hint, can be "row" (down_proj/o_proj) or "column" (gate_up_proj/qkv_proj)
+            layer_type: "row" or "others" (default)
+                - "row": RowParallelLinear (down_proj, o_proj)
+                - "others": any other layer (ColumnParallel, ReplicatedLinear, etc.)
         """
         params_dict = {}
         params_dict["weight_scale"] = torch.empty(output_size,
@@ -111,19 +113,11 @@ def get_pergroup_param(self, input_size: int, output_size: int,
                                                   self.group_size,
                                                   dtype=params_dtype)
 
-        # ✅ New quantization version includes scale_bias parameters
-        # Shape depends on layer type:
-        #   - ColumnParallel (gate_up_proj, qkv_proj): [output_size, 1]
-        #   - RowParallel (down_proj, o_proj): [output_size, 16 // tp_size]
+        # NOTE: In the w4a8 quantization implementation, the scale_bias shape
+        # for down_proj and o_proj is [output_size, 16]; for all other layers
+        # it is [output_size, 1].
         if self.new_quant_version:
-            if layer_type == "row":
-                # RowParallel: down_proj, o_proj
-                # scale_bias shape: [output_size, 16 // tp_size]
-                scale_bias_dim = 16 // self.tp_size
-            else:
-                # ColumnParallel (default): gate_up_proj, qkv_proj
-                # scale_bias shape: [output_size, 1]
-                scale_bias_dim = 1
+            scale_bias_dim = 16 if layer_type == "row" else 1
 
             params_dict["scale_bias"] = torch.empty(output_size,
                                                     scale_bias_dim,
```
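The resulting parameter shapes can be checked with a small sketch; `scale_bias_shape` is a hypothetical standalone helper rather than the real `get_pergroup_param`, but the ternary and the `torch.empty` call follow the committed lines:

```python
import torch

def scale_bias_shape(output_size: int, layer_type: str) -> torch.Size:
    # After the fix, the second dim is a constant 16 for "row" layers
    # (down_proj, o_proj) and 1 for everything else.
    scale_bias_dim = 16 if layer_type == "row" else 1
    return torch.empty(output_size, scale_bias_dim, dtype=torch.float32).shape

print(scale_bias_shape(4096, "row"))     # torch.Size([4096, 16])
print(scale_bias_shape(4096, "others"))  # torch.Size([4096, 1])
```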

vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py

Lines changed: 1 addition & 5 deletions
```diff
@@ -98,11 +98,7 @@ def get_pergroup_param(self, input_size: int, output_size: int,
                                                   dtype=params_dtype)
 
         if self.new_quant_version:
-            if layer_type == "row":
-                scale_bias_dim = 16 // self.tp_size
-            else:
-                scale_bias_dim = 1
-
+            scale_bias_dim = 16 if layer_type == "row" else 1
             params_dict["scale_bias"] = torch.empty(output_size,
                                                     scale_bias_dim,
                                                     dtype=torch.float32)
```
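For a quick before/after comparison: the removed expression divided the dim by the tensor-parallel degree, while the fix keeps it at a constant 16 for "row" layers. A tiny arithmetic check, with `tp_size = 4` chosen purely as an example value:

```python
tp_size = 4  # example tensor-parallel degree, for illustration only

old_dim = 16 // tp_size   # removed logic: shrank with tp_size -> 4
new_dim = 16              # fixed logic: constant for "row" layers -> 16

print(old_dim, new_dim)   # 4 16
```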
