Commit f2c6534
shape fix for gptq
Summary: aligns with previous shape fixes (#152)

Test Plan:

export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf

python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4-gptq --calibration_tasks wikitext --calibration_limit 10
python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4-gptq.g32.cuda.pth --tasks wikitext

wikitext: {'word_perplexity,none': 12.4647656874071, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6028703940149458, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6806577757911142, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}

python quantize.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --mode int4
python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model_int4.g32.cuda.pth --tasks wikitext

wikitext: {'word_perplexity,none': 12.639992147818221, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.6070602521912754, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6844240198082908, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}

Reviewers:

Subscribers:

Tasks:

Tags:
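Note on the numbers above (lower perplexity is better): after the fix the int4-gptq checkpoint evaluates cleanly and scores slightly better on wikitext than plain int4 (word perplexity 12.46 vs. 12.64), which suggests the shape fix does not regress GPTQ accuracy.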
1 parent 095b222 commit f2c6534

quantize.py

Lines changed: 4 additions & 1 deletion
@@ -458,7 +458,10 @@ def __init__(self, mod, groupsize=128, inner_k_tiles=8, padding=True):
         # we need to do the padding here, both for q and the qparams if necessary
         def make_names_and_values_dict_func(q, qparams):
             k = q.shape[1]
-            new_k = find_multiple(k, 1024)
+            if not _check_linear_int4_k(k, groupsize, inner_k_tiles):
+                new_k = find_multiple(k, 1024)
+            else:
+                new_k = k
             # how much we need to pad the weight
             delta_k = new_k - q.shape[1]
             final_q = torch.ops.aten._convert_weight_to_int4pack(F.pad(q, pad=(0, delta_k)), inner_k_tiles)
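For context, the new branch leans on two helpers defined elsewhere in quantize.py. A minimal sketch of their behavior, reconstructed here from the call sites (illustrative, not part of this diff), plus a worked example of why the unconditional padding caused a shape mismatch:

# Sketch of the helpers used in the diff above, assuming the definitions
# that the call sites imply (illustrative; not part of this commit).

def find_multiple(n: int, k: int) -> int:
    # Smallest multiple of k that is >= n.
    if n % k == 0:
        return n
    return n + k - (n % k)

def _check_linear_int4_k(k: int, groupsize: int = 1, inner_k_tiles: int = 1) -> bool:
    # The int4 kernel needs k divisible by both the quantization group size
    # and inner_k_tiles * 16.
    return k % groupsize == 0 and k % (inner_k_tiles * 16) == 0

# Worked example with Llama-2-7B's FFN intermediate size, k = 11008
# (groupsize=32 matches the .g32 checkpoints in the test plan):
assert _check_linear_int4_k(11008, groupsize=32, inner_k_tiles=8)  # no padding needed
assert find_multiple(11008, 1024) == 11264  # what the old code padded to anyway

Previously the GPTQ path always padded k up to a multiple of 1024, so a dimension like 11008 became 11264 and no longer matched the un-padded layers that the previous shape fixes (#152) produce; with this change, padding happens only when the kernel's divisibility check actually fails.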
