7
7
from llmcompressor .utils import dispatch_for_generation
8
8
9
9
# Select model and load it.
10
- MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
10
+ MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" # "meta-llama/Meta-Llama-3-8B-Instruct"
11
11
12
12
model = AutoModelForCausalLM .from_pretrained (
13
13
MODEL_ID ,
@@ -57,6 +57,10 @@ def tokenize(sample):
57
57
# Configure the quantization algorithm to run.
58
58
# * apply the transform, then quantize the weights to 4 bit (W4A16) with a group size of 128
59
59
recipe = [
60
+ # TODO preset_config="LLAMA_SPINQUANT_R1R2" outputs gibberish
61
+ # TODO preset_config="QUIP_ONLINE" outputs gibberish
62
+ # preset_config="QUIP" outputs are sensible, but the saved checkpoint
63
+ # cannot be loaded and evals cannot be run (~4hrs to run)
60
64
TransformModifier (preset_config = "LLAMA_SPINQUANT_R1R2" ),
61
65
QuantizationModifier (targets = "Linear" , scheme = "W4A16" , ignore = ["lm_head" ]),
62
66
]
@@ -72,12 +76,12 @@ def tokenize(sample):
72
76
)
73
77
74
78
# Confirm generations of the quantized model look sane.
75
- # print("\n\n")
76
- # print("========== SAMPLE GENERATION ==============")
77
- # dispatch_for_generation(model)
78
- # input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
79
- # output = model.generate(input_ids, max_new_tokens=100)
80
- # print(tokenizer.decode(output[0]))
79
+ print ("\n \n " )
80
+ print ("========== SAMPLE GENERATION ==============" )
81
+ dispatch_for_generation (model )
82
+ input_ids = tokenizer ("Hello my name is" , return_tensors = "pt" ).input_ids .to ("cuda" )
83
+ output = model .generate (input_ids , max_new_tokens = 100 )
84
+ print (tokenizer .decode (output [0 ]))
81
85
# print("==========================================\n\n")
82
86
83
87
# Save to disk compressed.
0 commit comments