DummyModel script

brian-dellabetta · brian-dellabetta · commit 3c216dd685fd · 2025-07-08T21:29:27.000Z
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
diff --git a/examples/transform/llama3_example.py b/examples/transform/llama3_example.py
@@ -7,7 +7,9 @@
 from llmcompressor.utils import dispatch_for_generation
 
 # Select model and load it.
-MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"  # "meta-llama/Meta-Llama-3-8B-Instruct"
+# MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
+# MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct" # TODO hidden size 3072 causes failure when creating hadamard
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
@@ -62,17 +64,18 @@ def tokenize(sample):
     # preset_config="QUIP" output sensible, but cannot load saved
     #  checkpoint or run evals (~4hrs to run)
     TransformModifier(preset_config="LLAMA_SPINQUANT_R1R2"),
-    QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
+    # QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
 ]
 
 # Apply algorithms.
 oneshot(
     model=model,
-    dataset=ds,
     recipe=recipe,
-    pipeline="sequential",
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    # dataset=ds,
+    pipeline="datafree",
+    # max_seq_length=MAX_SEQUENCE_LENGTH,
+    # num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    log_dir=None,
 )
 
 # # Confirm generations of the quantized model look sane.
@@ -84,7 +87,7 @@ def tokenize(sample):
 print(tokenizer.decode(output[0]))
 # print("==========================================\n\n")
 
-# Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-transform-quant-w4a16"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
+# # Save to disk compressed.
+# SAVE_DIR = MODEL_ID.split("/")[1] + "-transform-quant-w4a16"
+# model.save_pretrained(SAVE_DIR, save_compressed=True)
+# tokenizer.save_pretrained(SAVE_DIR)
diff --git a/examples/transform/spinquant_dummy.py b/examples/transform/spinquant_dummy.py
@@ -0,0 +1,112 @@
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+from compressed_tensors.utils import update_parameter_data
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
+from llmcompressor.modifiers.transform import TransformModifier
+from llmcompressor.utils import dispatch_for_generation
+from transformers.models.llama.modeling_llama import (
+    LlamaRMSNorm,
+)
+
+hidden_dim = intermediate_dim = 64
+up_dim = 128
+num_embeddings = 12
+
+
+# TODO remove file before merging
+
+
+class DummySelfAttn(torch.nn.Module):  # toy attention block: q/k/v/o Linear projections around one softmax step
+    def __init__(self, hidden_dim, intermediate_dim):
+        super().__init__()
+        self.q_proj = torch.nn.Linear(hidden_dim, hidden_dim, bias=None)  # bias=None is falsy; presumably equivalent to bias=False — verify against torch.nn.Linear
+        self.k_proj = torch.nn.Linear(hidden_dim, intermediate_dim, bias=None)
+        self.v_proj = torch.nn.Linear(hidden_dim, intermediate_dim, bias=None)
+        self.o_proj = torch.nn.Linear(hidden_dim, hidden_dim, bias=None)
+        self.num_heads = 1  # not read in this class; presumably inspected by external code — TODO confirm
+        self.num_key_value_groups = 1  # not read in this class either
+
+    def forward(self, hidden_states):  # NOTE(review): the .T calls below assume a 2-D (tokens, hidden_dim) input — confirm
+        q = self.q_proj(hidden_states)
+        k = self.k_proj(hidden_states)
+        v = self.v_proj(hidden_states)
+
+        ### EAGER ATTENTION (no mask or scaling is applied in this dummy)
+        attn_weights = torch.matmul(q.T, k)  # for 2-D input: (hidden_dim, tokens) @ (tokens, intermediate_dim)
+
+        attn_weights = torch.nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(q.dtype)  # softmax is computed in float32, then cast back to q's dtype
+        attn_output = torch.matmul(attn_weights, v.T)  # for 2-D input: (hidden_dim, intermediate_dim) @ (intermediate_dim, tokens)
+        attn_output = attn_output.T.contiguous()  # transpose back so the last dim is hidden_dim, as o_proj expects
+
+        return self.o_proj(attn_output)
+
+
+class DummyMLP(torch.nn.Module):  # gated MLP: down_proj(act_fn(gate_proj(x)) * up_proj(x))
+    def __init__(self, hidden_dim, up_dim):
+        super().__init__()
+        self.up_proj = torch.nn.Linear(hidden_dim, up_dim, bias=None)  # bias=None is falsy; presumably equivalent to bias=False — verify against torch.nn.Linear
+        self.gate_proj = torch.nn.Linear(hidden_dim, up_dim, bias=None)
+        self.down_proj = torch.nn.Linear(up_dim, hidden_dim, bias=None)  # maps the up_dim product back to hidden_dim
+        self.act_fn = torch.nn.SiLU()  # applied to the gate branch only
+
+    def forward(self, x):  # x's last dimension must be hidden_dim; the result's last dimension is hidden_dim as well
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+class DummyModel(torch.nn.Module):  # single-block toy model: embed -> norm -> attn -> norm -> mlp -> lm_head
+    def __init__(self, num_embeddings, hidden_dim, intermediate_dim, up_dim):
+        super().__init__()
+        self.embed_tokens = torch.nn.Embedding(num_embeddings, hidden_dim)  # attribute names presumably chosen to match the transform preset's target patterns — confirm
+        self.input_layernorm = LlamaRMSNorm(hidden_dim)  # applied right after the embedding, before self_attn
+        self.post_attention_layernorm = LlamaRMSNorm(hidden_dim)  # applied between self_attn and mlp
+        self.self_attn = DummySelfAttn(hidden_dim, intermediate_dim)
+        self.mlp = DummyMLP(hidden_dim, up_dim)
+        self.lm_head = torch.nn.Linear(hidden_dim, num_embeddings, bias=None)  # maps hidden_dim to num_embeddings outputs
+
+    def forward(self, input_ids):  # no residual connections: each stage's output feeds the next directly
+        x = self.embed_tokens(input_ids)
+        x = self.input_layernorm(x)
+        x = self.self_attn(x)
+        x = self.post_attention_layernorm(x)
+        x = self.mlp(x)
+        return self.lm_head(x)
+
+
+model = DummyModel(num_embeddings, hidden_dim, intermediate_dim, up_dim)
+
+# TODO Uncomment this to see norm diff > 1e-6
+# This is due to the issue Kyle spotted in https://arxiv.org/pdf/2405.16406 Page 5 Footnote 2
+# The layernorms will have to be fused into the subsequent layers so that input_layernorm.weight equals torch.ones() (this apparently makes the layernorm rotation invariant)
+# https://github.com/facebookresearch/SpinQuant/blob/8f47aa3f00e8662caf1a484153920a07e5281c3a/utils/fuse_norm_utils.py#L39
+# update_parameter_data(
+#     model.input_layernorm,
+#     torch.rand(model.input_layernorm.weight.shape),
+#     "weight",
+# )
+
+input_ids = torch.IntTensor([1, 2, 3, 4, 5])  # five token ids, all below num_embeddings (12)
+orig_output = model(input_ids)  # reference output, taken before any transform is applied
+
+recipe = [
+    # NOTE: preset_config="QUIP" output sensible, but cannot load saved
+    #  checkpoint or run evals (~4hrs to run)
+    TransformModifier(preset_config="LLAMA_SPINQUANT_R1R2"),
+    # QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
+]
+
+oneshot(
+    model=model,
+    recipe=recipe,
+    pipeline="datafree",
+    log_dir=None,
+)
+
+# Confirm that the transformed model's output matches the original output
+transformed_output = model(input_ids)  # same inputs, run after oneshot() has applied the recipe
+
+print(f"Norm Diff {(orig_output-transformed_output).norm()}")
+print(f"Norm {orig_output.norm()}, {transformed_output.norm()}")
diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py
@@ -125,7 +125,8 @@ def __init__(
         self.output_dir = output_dir
 
         # initialize the model and processor
-        pre_process(model_args)
+        # TODO: uncomment the pre_process call below before merge; it is disabled only so DummyModel can run
+        # pre_process(model_args)
 
         # Set instance attributes
         self.model = self.model_args.model
diff --git a/src/llmcompressor/modifiers/transform/presets/spinquant.py b/src/llmcompressor/modifiers/transform/presets/spinquant.py
@@ -9,43 +9,54 @@
             type="hadamard",
             apply=[
                 TransformArgs(
-                    targets=["re:.*embed_tokens$", "re:.*o_proj$", "re:.*down_proj$"],
+                    targets=[
+                        # outermost rotation
+                        "re:.*embed_tokens$",
+                        # attention rotations
+                        "re:.*o_proj$",
+                        # mlp rotations
+                        "re:.*down_proj$",
+                    ],
                     location="weight_output",
                 ),
                 TransformArgs(
                     targets=[
+                        # outermost rotation
+                        "lm_head",
+                        # attention rotations
                         "re:.*q_proj$",
                         "re:.*k_proj$",
                         "re:.*v_proj$",
+                        # mlp rotations
                         "re:.*up_proj$",
                         "re:.*gate_proj$",
-                        "lm_head",
                     ],
                     location="weight_input",
                     inverse=True,
                 ),
             ],
         ),
-        "R2": TransformScheme(
-            type="hadamard",
-            apply=[
-                TransformArgs(
-                    targets=["re:.*v_proj$"],
-                    location="weight_output",
-                ),
-                TransformArgs(
-                    targets=["re:.*o_proj$"], location="weight_input", inverse=True
-                ),
-            ],
-        ),
+        # "R2": TransformScheme(
+        #     type="hadamard",
+        #     # TODO infer head_dim from config.json in SpinQuantModifier
+        #     head_dim=128,
+        #     apply=[
+        #         TransformArgs(targets=["re:.*v_proj$"], location="weight_output"),
+        #         TransformArgs(
+        #             targets=["re:.*o_proj$"],
+        #             location="weight_input",
+        #             inverse=True,
+        #         ),
+        #     ],
+        # ),
     }
 )
 
 # All rotations
 LLAMA_SPINQUANT = TransformConfig(
     config_groups={
-        "R1": LLAMA_SPINQUANT_R1R2.config_groups["R1"],
-        "R2": LLAMA_SPINQUANT_R1R2.config_groups["R2"],
+        # "R1": LLAMA_SPINQUANT_R1R2.config_groups["R1"],
+        # "R2": LLAMA_SPINQUANT_R1R2.config_groups["R2"],
         "R3": TransformScheme(
             type="hadamard",
             apply=[