import pytest
import torch
from compressed_tensors.quantization import (
    ActivationOrdering,
    QuantizationArgs,
    QuantizationScheme,
)
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier
3640 config_groups = {
3741 "group_0" : QuantizationScheme (
3842 targets = ["re:.*model.layers.2.self_attn.q_proj$" ],
39- weights = QuantizationArgs (num_bits = 4 , strategy = "group" , group_size = 128 ),
43+ weights = QuantizationArgs (num_bits = 4 , strategy = "group" , group_size = 32 ),
4044 )
4145 },
4246)
4347
# Recipe variant A: explicit config_groups with a single 4-bit,
# group-quantized (group_size=32) scheme targeting only layer 2's q_proj;
# lm_head is excluded from quantization.
recipe_modifier_shorthand_a = GPTQModifier(
    ignore=["lm_head"],
    config_groups={
        "group_0": QuantizationScheme(
            targets=["re:.*model.layers.2.self_attn.q_proj$"],
            weights=QuantizationArgs(num_bits=4, strategy="group", group_size=32),
        )
    },
)
4757
# Recipe variant B: same single-layer 4-bit group scheme as variant A,
# defined independently so the parametrized test exercises each recipe object.
recipe_modifier_shorthand_b = GPTQModifier(
    ignore=["lm_head"],
    config_groups={
        "group_0": QuantizationScheme(
            targets=["re:.*model.layers.2.self_attn.q_proj$"],
            weights=QuantizationArgs(num_bits=4, strategy="group", group_size=32),
        )
    },
)
67+
# Test activation ordering variants.
# WEIGHT actorder: reorders columns by weight statistics; per the assertions
# later in this file, this variant is expected NOT to persist a g_idx tensor.
recipe_modifier_group_actorder_weight = GPTQModifier(
    ignore=["lm_head"],
    config_groups={
        "group_0": QuantizationScheme(
            targets=["re:.*model.layers.2.self_attn.q_proj$"],
            weights=QuantizationArgs(
                num_bits=4,
                strategy="group",
                group_size=32,
                actorder=ActivationOrdering.WEIGHT,
            ),
        )
    },
)
83+
# GROUP actorder variant: per the assertions later in this file, this variant
# is expected to persist a weight_g_idx tensor on the quantized layer.
recipe_modifier_group_actorder_group = GPTQModifier(
    ignore=["lm_head"],
    config_groups={
        "group_0": QuantizationScheme(
            targets=["re:.*model.layers.2.self_attn.q_proj$"],
            weights=QuantizationArgs(
                num_bits=4,
                strategy="group",
                group_size=32,
                actorder=ActivationOrdering.GROUP,
            ),
        )
    },
)
5198
5299
58105 recipe_modifier_full_group ,
59106 recipe_modifier_shorthand_a ,
60107 recipe_modifier_shorthand_b ,
108+ recipe_modifier_group_actorder_weight ,
109+ recipe_modifier_group_actorder_group ,
61110 ],
62111)
63112def test_oneshot_application (recipe , tmp_path ):
64113 output = tmp_path / "oneshot_output"
65- model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0 "
114+ model_id = "nm-testing/tinysmokellama-3.2 "
66115 dataset = "open_platypus"
67116 device = "cuda:0" if torch .cuda .is_available () else "cpu"
68117
118+ # Load original model for numerical comparison
119+ original_model = AutoModelForCausalLM .from_pretrained (
120+ model_id , torch_dtype = torch .float16 , device_map = device
121+ )
122+ tokenizer = AutoTokenizer .from_pretrained (model_id )
123+
124+ # Create test input
125+ test_text = "The quick brown fox jumps over the lazy dog"
126+ inputs = tokenizer (test_text , return_tensors = "pt" ).to (device )
127+
128+ # Get original model output
129+ with torch .no_grad ():
130+ original_output = original_model (** inputs ).logits
131+
132+ # Quantize model
69133 oneshot (
70- model = model ,
134+ model = model_id ,
71135 dataset = dataset ,
72136 output_dir = output ,
73137 recipe = recipe ,
74138 num_calibration_samples = 9 ,
139+ splits = {"calibration" : "train[:9]" },
75140 )
76141 model_loaded = AutoModelForCausalLM .from_pretrained (output , device_map = device )
77142
@@ -98,3 +163,32 @@ def test_oneshot_application(recipe, tmp_path):
98163 # Check lm-head is not quantized
99164 not_targetted = model_loaded .lm_head
100165 assert not hasattr (not_targetted , "quantization_scheme" )
166+
167+ # Verify g_idx behavior for activation ordering
168+ if weight_args .actorder == ActivationOrdering .GROUP :
169+ # GROUP actorder should save g_idx
170+ assert hasattr (
171+ targetted_linear_layer , "weight_g_idx"
172+ ), "GROUP actorder should have g_idx"
173+ elif weight_args .actorder == ActivationOrdering .WEIGHT :
174+ # WEIGHT actorder should NOT save g_idx (identity mapping)
175+ assert not hasattr (
176+ targetted_linear_layer , "weight_g_idx"
177+ ), "WEIGHT actorder should not have g_idx"
178+
179+ # Numerical validation: check MSE
180+ with torch .no_grad ():
181+ quantized_output = model_loaded (** inputs ).logits
182+
183+ mse = torch .nn .functional .mse_loss (quantized_output , original_output ).item ()
184+
185+ # MSE threshold - quantization should not degrade quality too much
186+ mse_threshold = 0.015
187+ assert mse < mse_threshold , (
188+ f"MSE { mse :.6f} exceeds threshold { mse_threshold } . "
189+ f"Quantization degraded model quality too much."
190+ )
191+
192+ # Cleanup
193+ del original_model , model_loaded
194+ torch .cuda .empty_cache ()
0 commit comments