2 parents ffea17e + 9b8abad commit 2e134d8
auto_fp8/quantize.py
@@ -202,11 +202,14 @@ def quantize_weights(
             or name in quantize_config.ignored_layers
         ):
             continue
-        quant_weight, quant_scale = per_tensor_quantize(linear.weight)
-        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, linear.bias)
+        quant_weight, quant_scale = per_tensor_quantize(linear.weight.clone())
+        bias = linear.bias.clone() if linear.bias is not None else None
+        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, bias)
         replace_module(model, name, quant_linear)
+        del linear.weight
+        del linear.bias
         del linear
-    cleanup_memory()
+        cleanup_memory()


 def quantize_activations(
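For context, the patched loop body in quantize_weights reads roughly as sketched below. The surrounding iteration over the model's modules and the isinstance check are assumed from the hunk header rather than shown in the diff; per_tensor_quantize, FP8DynamicLinear, replace_module, and cleanup_memory are the helpers from auto_fp8/quantize.py that the diff references. Comments are added here to spell out the clone-then-delete pattern:

    # Sketch only: loop structure assumed, not part of this diff.
    for name, linear in list(model.named_modules()):
        if (
            not isinstance(linear, torch.nn.Linear)
            or name in quantize_config.ignored_layers
        ):
            continue
        # Quantize a copy of the weight so the new module does not keep the
        # original parameter alive and it can be released explicitly below.
        quant_weight, quant_scale = per_tensor_quantize(linear.weight.clone())
        bias = linear.bias.clone() if linear.bias is not None else None
        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, bias)
        replace_module(model, name, quant_linear)
        # Drop references to the original high-precision tensors and free cached
        # memory after each layer, so peak usage stays near one extra layer's weights.
        del linear.weight
        del linear.bias
        del linear
        cleanup_memory()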