@@ -44,7 +44,7 @@
 #

 import torch
-from torchao.quantization import change_linear_weights_to_int8_dqtensors
+from torchao.quantization.quant_api import quantize_, int8_dynamic_activation_int8_weight
 from segment_anything import sam_model_registry
 from torch.utils.benchmark import Timer

@@ -156,9 +156,9 @@ def get_sam_model(only_one_block=False, batchsize=1):
 # in memory bound situations where the benefit comes from loading less
 # weight data, rather than doing less computation. The torchao APIs:
 #
-# ``change_linear_weights_to_int8_dqtensors``,
-# ``change_linear_weights_to_int8_woqtensors`` or
-# ``change_linear_weights_to_int4_woqtensors``
+# ``int8_dynamic_activation_int8_weight()``,
+# ``int8_dynamic_activation_int8_semi_sparse_weight()`` or
+# ``int8_dynamic_activation_int4_weight()``
 #
 # can be used to easily apply the desired quantization technique and then
 # once the model is compiled with ``torch.compile`` with ``max-autotune``, quantization is
@@ -185,7 +185,7 @@ def get_sam_model(only_one_block=False, batchsize=1):
 model, image = get_sam_model(only_one_block, batchsize)
 model = model.to(torch.bfloat16)
 image = image.to(torch.bfloat16)
-change_linear_weights_to_int8_dqtensors(model)
+quantize_(model, int8_dynamic_activation_int8_weight())
 model_c = torch.compile(model, mode='max-autotune')
 quant_res = benchmark(model_c, image)
 print(f"bf16 compiled runtime of the quantized block is {quant_res['time']:0.2f}ms and peak memory {quant_res['memory']:0.2f}GB")