
Commit 9b08c79: added quantization code
Parent: 835f1e2

File tree
  • content/learning-paths/servers-and-cloud-computing/llama-vision

1 file changed: +22 -0 lines changed

content/learning-paths/servers-and-cloud-computing/llama-vision/backend.md

Lines changed: 22 additions & 0 deletions

@@ -24,6 +24,28 @@ app = Flask(__name__)
 # Load model and processor
 model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 model = MllamaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float32)
+
+# Apply torchao quantization
+from torchao.dtypes import PlainLayout
+from torchao.experimental.packed_linear_int8_dynamic_activation_intx_weight_layout import (
+    PackedLinearInt8DynamicActivationIntxWeightLayout,
+)
+from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight
+from torchao.quantization.granularity import PerGroup
+from torchao.quantization.quant_api import quantize_
+from torchao.quantization.quant_primitives import MappingType
+
+quantize_(
+    model,
+    int8_dynamic_activation_intx_weight(
+        weight_dtype=torch.int4,
+        granularity=PerGroup(32),
+        has_weight_zeros=True,
+        weight_mapping_type=MappingType.SYMMETRIC_NO_CLIPPING_ERR,
+        layout=PackedLinearInt8DynamicActivationIntxWeightLayout(target="aten"),
+    ),
+)
+
 processor = AutoProcessor.from_pretrained(model_id)
 model.eval()
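The added block applies torchao's int8 dynamic-activation / intx-weight quantization to the model's linear layers: 4-bit weights with zero points, quantized per group of 32, the SYMMETRIC_NO_CLIPPING_ERR mapping type, and the packed layout targeting aten kernels, while activations are quantized dynamically to int8 at runtime. As a rough sketch of how the quantized model might be exercised further down in backend.md (the prompt format, image source, and generation parameters below are assumptions, not part of this commit):

# Hypothetical usage sketch, not part of this commit; relies on the
# `model` and `processor` objects defined above in backend.md.
import torch
from PIL import Image

image = Image.open("example.jpg")  # assumed local test image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt")

with torch.no_grad():  # inference only; weights are already quantized
    output = model.generate(**inputs, max_new_tokens=64)
print(processor.decode(output[0], skip_special_tokens=True))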
