@@ -170,7 +170,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 # ``apply_weight_only_int8_quant`` instead as drop in replacement for the two
 # above (no replacement for int4).
 #
-# The difference between the two APIs is that ``change_linear_weights`` API
+# The difference between the two APIs is that ``int8_dynamic_activation`` API
 # alters the weight tensor of the linear module so instead of doing a
 # normal linear, it does a quantized operation. This is helpful when you
 # have non-standard linear ops that do more than one thing. The ``apply``
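For context, this is roughly how the new torchao API is invoked. A minimal sketch, assuming a torchao release that exports ``quantize_`` and ``int8_dynamic_activation_int8_weight`` from ``torchao.quantization``; the toy ``nn.Sequential`` model is only illustrative and not part of the tutorial:

import torch
from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight

# Toy module standing in for the SAM image encoder block. quantize_ rewrites
# the weight of every nn.Linear it finds into a dynamically quantized int8
# tensor subclass, so the module's forward signature is unchanged.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).cuda().to(torch.bfloat16)
quantize_(model, int8_dynamic_activation_int8_weight())

out = model(torch.randn(16, 1024, device="cuda", dtype=torch.bfloat16))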
@@ -220,7 +220,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 model = model.to(torch.bfloat16)
 image = image.to(torch.bfloat16)
 torch._inductor.config.force_fuse_int_mm_with_mul = True
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the fused quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
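The ``benchmark`` helper called above is presumably defined earlier in the tutorial and is not part of these hunks. A hedged sketch of what such a helper might look like, given that the prints expect a dict with ``'time'`` in milliseconds and ``'memory'`` in gigabytes; the warmup and iteration counts are assumptions, not the tutorial's actual values:

import torch

def benchmark(fn, image, warmup=3, iters=10):
    # Warm up so torch.compile autotuning does not pollute the measurement.
    for _ in range(warmup):
        fn(image)
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn(image)
    end.record()
    torch.cuda.synchronize()
    return {
        "time": start.elapsed_time(end) / iters,            # ms per iteration
        "memory": torch.cuda.max_memory_allocated() / 1e9,  # peak GB
    }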
@@ -251,7 +251,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 torch._inductor.config.coordinate_descent_tuning = True
 torch._inductor.config.coordinate_descent_check_all_directions = True
 torch._inductor.config.force_fuse_int_mm_with_mul = True
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the final quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")
@@ -280,7 +280,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 model, image = get_sam_model(False, batchsize)
 model = model.to(torch.bfloat16)
 image = image.to(torch.bfloat16)
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the quantized full model is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']: 0.2f}GB")