@@ -88,14 +88,6 @@ def get_pergroup_param(self, input_size: int, output_size: int,
                            layer_type: Optional[str] = None) -> Dict[str, Any]:
         """
         Create per-group quantization parameters.
-
-        Args:
-            input_size: input dimension size
-            output_size: output dimension size
-            params_dtype: parameter data type
-            layer_type: "row" or "others" (default)
-                - "row": RowParallelLinear (down_proj, o_proj)
-                - "others": Others (ColumnParallel, ReplicatedLinear, etc.)
         """
         params_dict = {}
         params_dict["weight_scale"] = torch.empty(output_size,
@@ -114,8 +106,8 @@ def get_pergroup_param(self, input_size: int, output_size: int,
                                                   dtype=params_dtype)
 
         # NOTE: In w4a8 quantization implementation,
-        # for down_proj and o_proj scale_bias shape is [output_size, 16],
-        # others are [output_size, 1]
+        # for down_proj and o_proj(layer_type == "row") scale_bias shape is [output_size, 16],
+        # others are [output_size, 1]
         if self.new_quant_version:
             scale_bias_dim = 16 if layer_type == "row" else 1
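Reviewer note (not part of the diff): a minimal sketch of the shape rule above, using a hypothetical helper `make_scale_bias` and assuming the buffer dtype follows `params_dtype`; only torch is required.

```python
import torch

def make_scale_bias(output_size: int, layer_type: str,
                    params_dtype: torch.dtype) -> torch.Tensor:
    # "row" -> RowParallelLinear (down_proj, o_proj): [output_size, 16]
    # anything else (ColumnParallel, ReplicatedLinear, ...): [output_size, 1]
    scale_bias_dim = 16 if layer_type == "row" else 1
    return torch.empty(output_size, scale_bias_dim, dtype=params_dtype)

assert make_scale_bias(128, "row", torch.float32).shape == (128, 16)
assert make_scale_bias(128, "others", torch.float32).shape == (128, 1)
```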
@@ -144,9 +136,8 @@ def process_scale_second(weight: torch.Tensor,
     k, n = weight.shape
     group_num, n_scale = per_group_scale.shape
 
-    # For new quantization version, the second dimension of weight is already compressed (double int4 pack into int8)
-    # Need to restore the logical dimension to correctly compute the scale
     if is_new_quant:
+        # Restore logical dimension for compressed weight
         n = n * 2
 
     bias = None
@@ -155,9 +146,10 @@ def process_scale_second(weight: torch.Tensor,
             group_num, -1, n) * per_group_scale.reshape(group_num, 1, n)
         weight_high = weight_high.reshape(k, n)
         bias = 8 * (weight_high.to(torch.float32) * scale).sum(dim=0)
-    # New version: scale_bias is not used currently
-    # because symmetric activation quantization is adopted in msIT for w4a8
+    # NOTE: scale_bias is not used currently
+    # because in msmodelslim, w4a8 uses symmetric quantization
 
+    # TODO: support potential future asymmetric quantization
     antiquant_scale = (scale * per_group_scale).reshape(group_num, n)
     return antiquant_scale.npu(), bias
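Reviewer note (not part of the diff): a small shape sketch of the two steps above, under assumed tensor shapes with illustrative sizes; doubling `n` mirrors the two-int4-per-int8 packing, and `antiquant_scale` is just the broadcast product reshaped per group (the final `.npu()` call is omitted so the sketch runs without torch_npu).

```python
import torch

k, n_packed, group_num = 1024, 256, 8                 # illustrative sizes
weight = torch.zeros(k, n_packed, dtype=torch.int8)   # packed: two int4 per int8

n = weight.shape[1] * 2                               # restore logical width (new quant version)
scale = torch.ones(n)                                 # assumed per-channel scale, shape [n]
per_group_scale = torch.ones(group_num, n)            # assumed per-group scale, shape [group_num, n]

antiquant_scale = (scale * per_group_scale).reshape(group_num, n)
assert antiquant_scale.shape == (group_num, n)
```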
@@ -188,36 +180,29 @@ def process_weights_after_loading(self, layer: torch.nn.Module):
             is_new_quant=self.new_quant_version,
         )
 
-        # ✅ Handle scale_bias based on quantization version
         if self.new_quant_version:
-            # New version: scale_bias is loaded from checkpoint
             # Process the loaded data based on layer type
             if hasattr(layer, "scale_bias"):
-                # Detect layer type from shape
                 if layer.scale_bias.data.shape[1] == 1:
-                    # ColumnParallel (gate_up_proj, qkv_proj): [output_size, 1] -> flatten
                     layer.scale_bias.data = layer.scale_bias.data.flatten()
                 else:
-                    # RowParallel (down_proj, o_proj): [output_size, 16//tp_size]
-                    # Keep 2D shape but make contiguous
                     layer.scale_bias.data = layer.scale_bias.data.contiguous()
         else:
-            # Old version: scale_bias is computed, register as parameter
             if scale_bias is not None:
                 param = torch.nn.Parameter(scale_bias, requires_grad=False)
                 layer.register_parameter("weight_scale_bias", param)
 
         # Convert to NPU-specific int4pack format
         if self.new_quant_version:
-            # New version: weights on disk are already in double int4 pack into int8 format
-            # Refer to MoE's pack_to_int32 method: use view(torch.int32) instead of npu_convert_weight_to_int4pack
+            # weights on disk are already in packed int4 format
             # pack 4 int8(int4*2) to int32
             assert layer.weight.data.shape[-1] % 4 == 0, \
                 f"the last dim of weight needs to be divided by 4, got shape {layer.weight.data.shape}"
             layer.weight.data = layer.weight.data.view(
                 torch.int32).contiguous()
         else:
-            # Old version: weights are not compressed, need to be packed via npu_convert_weight_to_int4pack
+            # weights are not compressed
+            # need to be packed via npu_convert_weight_to_int4pack
             layer.weight.data = torch_npu.npu_convert_weight_to_int4pack(
                 layer.weight.data.to(torch.int32))
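Reviewer note (not part of the diff): a minimal sketch of why the new-version path is just a `view`. Each stored int8 already carries two int4 values, so reinterpreting four consecutive int8 bytes as one int32 yields eight packed int4 values without changing any bits; the old path instead repacks through `torch_npu.npu_convert_weight_to_int4pack`. Sizes below are illustrative and torch_npu is not needed to run it.

```python
import torch

# Weight as stored on disk for the new quant version: int8, each holding two int4 values.
w = torch.randint(-128, 128, (64, 32), dtype=torch.int8)

assert w.shape[-1] % 4 == 0, "last dim must be divisible by 4 to view as int32"
w_int32 = w.view(torch.int32).contiguous()   # reinterpret 4 bytes -> 1 int32 (8 int4 values)

assert w_int32.shape == (64, 32 // 4)
assert w_int32.view(torch.int8).equal(w)     # pure reinterpretation, bits unchanged
```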