Update examples, re-enable GPTQ calibration

kylesayrs · kylesayrs · commit 33b7216927e7 · 2025-04-08T11:04:29.000-04:00
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
diff --git a/examples/quantization_w4a16/llama3_example.py b/examples/quantization_w4a16/llama3_example.py
@@ -20,7 +20,7 @@
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 1
+NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
@@ -66,15 +66,15 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Confirm generations of the quantized model look sane.
-print("\n\n")
-print("========== SAMPLE GENERATION ==============")
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=100)
-print(tokenizer.decode(output[0]))
-print("==========================================\n\n")
+# # Confirm generations of the quantized model look sane.
+# print("\n\n")
+# print("========== SAMPLE GENERATION ==============")
+# input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+#     model.device
+# )
+# output = model.generate(input_ids, max_new_tokens=100)
+# print(tokenizer.decode(output[0]))
+# print("==========================================\n\n")
 
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -305,7 +305,6 @@ def calibrate_module(
         args: Tuple[torch.Tensor, ...],
         _output: torch.Tensor,
     ):
-        return
         """
         Quantize a module's weight according to the GPTQ algorithm