Commit 2e134d8

Merge pull request #16 from neuralmagic/improve-memory-usage
Improve memory usage by properly cleaning up weights as quantized
2 parents: ffea17e + 9b8abad · commit 2e134d8

File tree

1 file changed (+6, -3 lines)

auto_fp8/quantize.py

Lines changed: 6 additions & 3 deletions
@@ -202,11 +202,14 @@ def quantize_weights(
             or name in quantize_config.ignored_layers
         ):
             continue
-        quant_weight, quant_scale = per_tensor_quantize(linear.weight)
-        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, linear.bias)
+        quant_weight, quant_scale = per_tensor_quantize(linear.weight.clone())
+        bias = linear.bias.clone() if linear.bias is not None else None
+        quant_linear = FP8DynamicLinear(quant_weight, quant_scale, bias)
         replace_module(model, name, quant_linear)
+        del linear.weight
+        del linear.bias
         del linear
-        cleanup_memory()
+    cleanup_memory()
 
 
 def quantize_activations(
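
Why this helps: per_tensor_quantize and FP8DynamicLinear now receive clones, so the replacement module's tensors are independent of the original nn.Linear, and the explicit del linear.weight / del linear.bias free each layer's full-precision parameters as soon as it is swapped, instead of letting them linger for as long as anything (for example an iteration snapshot of named_modules) still references the old module. cleanup_memory() also moves out of the loop, running once at the end rather than per layer. Below is a minimal, self-contained sketch of the same pattern, assuming PyTorch >= 2.1; cleanup_memory_sketch, per_tensor_quantize_sketch, FP8LinearSketch, and set_submodule are illustrative stand-ins invented for this example, not auto_fp8's actual cleanup_memory, per_tensor_quantize, FP8DynamicLinear, or replace_module.

import gc

import torch


def cleanup_memory_sketch():
    # Stand-in for auto_fp8's cleanup_memory(): collect dead references,
    # then return cached CUDA blocks to the driver.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def per_tensor_quantize_sketch(tensor):
    # Toy per-tensor FP8 (E4M3) quantization, for illustration only.
    # Requires PyTorch >= 2.1 for torch.float8_e4m3fn.
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = tensor.abs().max().clamp(min=1e-12) / finfo.max
    qweight = (tensor / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return qweight, scale


class FP8LinearSketch(torch.nn.Module):
    # Placeholder for FP8DynamicLinear: stores the quantized weight and its
    # scale. forward() dequantizes for clarity; real FP8 kernels would not.
    def __init__(self, qweight, scale, bias):
        super().__init__()
        self.register_buffer("weight", qweight)
        self.register_buffer("scale", scale)
        self.register_buffer("bias", bias)  # None is a valid buffer value

    def forward(self, x):
        w = (self.weight.to(torch.float32) * self.scale).to(x.dtype)
        return torch.nn.functional.linear(x, w, self.bias)


def set_submodule(model, name, new_module):
    # Minimal replace_module(): walk the dotted path, swap the leaf module.
    parent = model
    parts = name.split(".")
    for part in parts[:-1]:
        parent = getattr(parent, part)
    setattr(parent, parts[-1], new_module)


@torch.no_grad()
def quantize_weights_sketch(model):
    for name, linear in list(model.named_modules()):
        if not isinstance(linear, torch.nn.Linear):
            continue
        # Quantize a clone, as in the commit, so the new module's tensors
        # are independent of the original parameters before those are freed.
        qweight, scale = per_tensor_quantize_sketch(linear.weight.clone())
        bias = linear.bias.clone() if linear.bias is not None else None
        set_submodule(model, name, FP8LinearSketch(qweight, scale, bias))
        # Delete the full-precision parameters right away: the snapshot list
        # above still references the old module, so without these dels the
        # original weights would stay alive until the loop finishes.
        del linear.weight
        del linear.bias
        del linear
    # As of this commit, cache cleanup runs once after the loop, not per layer.
    cleanup_memory_sketch()


# Usage: each Linear is swapped for the FP8 stand-in, layer by layer.
model = torch.nn.Sequential(
    torch.nn.Linear(16, 32), torch.nn.ReLU(), torch.nn.Linear(32, 4)
)
quantize_weights_sketch(model)
print(model(torch.randn(2, 16)).shape)  # torch.Size([2, 4])

With this shape of loop, peak memory stays near one layer's worth of duplicate tensors rather than a second full copy of the model, since each layer's original weights are released before the next layer is quantized.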
