
Commit 913d43e

Update

1 parent 0899f34 commit 913d43e

1 file changed: +4 -4 lines changed

prototype_source/gpu_quantization_torchao_tutorial.py

Lines changed: 4 additions & 4 deletions
@@ -170,7 +170,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 # ``apply_weight_only_int8_quant`` instead as drop in replacement for the two
 # above (no replacement for int4).
 #
-# The difference between the two APIs is that ``change_linear_weights`` API
+# The difference between the two APIs is that ``int8_dynamic_activation`` API
 # alters the weight tensor of the linear module so instead of doing a
 # normal linear, it does a quantized operation. This is helpful when you
 # have non-standard linear ops that do more than one thing. The ``apply``
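For context beyond the diff: a minimal sketch of how the new ``quantize_`` API introduced by this change is typically invoked (assuming a recent ``torchao`` install and a CUDA device; the toy ``nn.Linear`` module below is illustrative only and is not part of the tutorial):

import torch
from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight

# Toy stand-in for the SAM block used in the tutorial.
toy = torch.nn.Sequential(torch.nn.Linear(64, 64)).cuda().to(torch.bfloat16)

# quantize_ mutates the module in place: each Linear's weight is swapped for a
# quantized tensor subclass, so the forward pass runs the int8 dynamic-activation /
# int8-weight path instead of a plain bf16 matmul.
quantize_(toy, int8_dynamic_activation_int8_weight())
print(toy[0].weight)  # now a quantized tensor subclass rather than a plain bf16 weight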
@@ -220,7 +220,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 model = model.to(torch.bfloat16)
 image = image.to(torch.bfloat16)
 torch._inductor.config.force_fuse_int_mm_with_mul = True
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the fused quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
@@ -251,7 +251,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 torch._inductor.config.coordinate_descent_tuning = True
 torch._inductor.config.coordinate_descent_check_all_directions = True
 torch._inductor.config.force_fuse_int_mm_with_mul = True
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the final quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
@@ -280,7 +280,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 model, image = get_sam_model(False, batchsize)
 model = model.to(torch.bfloat16)
 image = image.to(torch.bfloat16)
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the quantized full model is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
