Let's walk through the main steps of the quantization process:
- Load model
- Configure quantization algorithm and scheme
- Apply quantization
- Confirm generations of the quantized model look sane
- Save to disk in compressed-tensors format
Load the model using AutoModelForCausalLM:
from compressed_tensors.offload import dispatch_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)recipe = QuantizationModifier(
targets="Linear",
scheme="FP8_BLOCK",
ignore=[
"re:.*lm_head",
"re:.*self_attn",
"re:.*router",
"re:.*vision_model.*",
"re:.*multi_modal_projector.*",
"Llama4TextAttention",
],
)oneshot(model=model, recipe=recipe)print("========== SAMPLE GENERATION ==============")
dispatch_model(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
model.device
)
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
print("==========================================")SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-BLOCK"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)