updated example

brian-dellabetta · brian-dellabetta · commit 5bd51df3668e · 2025-07-02T19:11:20.000Z
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
diff --git a/examples/transform/llama3_example.py b/examples/transform/llama3_example.py
@@ -7,7 +7,7 @@
 from llmcompressor.utils import dispatch_for_generation
 
 # Select model and load it.
-MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"  # "meta-llama/Meta-Llama-3-8B-Instruct"
 
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
@@ -57,6 +57,10 @@ def tokenize(sample):
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
 recipe = [
+    # TODO preset_config="LLAMA_SPINQUANT_R1R2" outputs gibberish
+    # TODO preset_config="QUIP_ONLINE" outputs gibberish
+    # preset_config="QUIP" outputs sensible text, but cannot load the saved
+    #  checkpoint or run evals (~4hrs to run)
     TransformModifier(preset_config="LLAMA_SPINQUANT_R1R2"),
     QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
 ]
@@ -72,12 +76,12 @@ def tokenize(sample):
 )
 
 # # Confirm generations of the quantized model look sane.
-# print("\n\n")
-# print("========== SAMPLE GENERATION ==============")
-# dispatch_for_generation(model)
-# input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-# output = model.generate(input_ids, max_new_tokens=100)
-# print(tokenizer.decode(output[0]))
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
 # print("==========================================\n\n")
 
 # Save to disk compressed.