
Commit 8afdb1f

Make AWQ more general
Summary:

* Added AWQConfig that takes a base config and made corresponding changes in other parts of the flow

Test Plan: Tested on Phi4-mini and Qwen3-8B

Qwen3-8B

| Task                       | calibration_limit | no-awq | awq    |
|----------------------------|-------------------|--------|--------|
| leaderboard_math_hard (v3) | 2                 | 0.3543 | 0.4371 |
| gpqa_main_zeroshot         | 50                | 0.32   | 0.36   |
| mmlu                       | 5                 | 0.7372 | 0.7463 |
| bbh                        | 1                 | 0.7385 | 0.7556 |

Phi4-mini

| Task     | calibration_limit | no-awq | awq    |
|----------|-------------------|--------|--------|
| mmlu_pro | 2                 | 0.4057 | 0.4757 |
| gsm8k    | 5                 | 0.72   | 0.76   |

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent b6ef500 commit 8afdb1f
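
A minimal sketch of the three-step flow the new AWQConfig enables, pieced together from the test added in test/prototype/test_awq.py below. The toy model and random calibration data are illustrative stand-ins, and a CUDA device plus the fbgemm int4 kernels are assumed:

import torch
from torchao.prototype.awq import AWQConfig, AWQStep
from torchao.quantization import FbgemmConfig, quantize_

# toy model and calibration data, mirroring the shapes used in the new test
model = (
    torch.nn.Sequential(
        torch.nn.Linear(512, 256, bias=False),
        torch.nn.Linear(256, 128, bias=False),
    )
    .eval()
    .to(torch.bfloat16)
    .to("cuda")
)
calibration_data = [
    torch.randn(5, 512, dtype=torch.bfloat16, device="cuda") for _ in range(10)
]

# int4 weight-only base config, as used in the test
base_config = FbgemmConfig(
    input_dtype=torch.bfloat16,
    weight_dtype=torch.int4,
    output_dtype=torch.bfloat16,
    block_size=[1, 128],
    preshuffle=False,
)

# step 1: insert AWQ observers
quantize_(model, AWQConfig(base_config, step=AWQStep.PREPARE))

# step 2: calibrate by running example inputs through the model
for example in calibration_data:
    model(example)

# step 3: compute the AWQ scales and apply the base quantization config
quantize_(model, AWQConfig(base_config, step=AWQStep.CONVERT))

For loading a pre-quantized checkpoint (the vLLM path exercised by test_awq_loading_vllm), the empty model is instead prepared with AWQStep.PREPARE_FOR_LOADING and the quantized weights are copied in from the state dict.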

File tree

13 files changed: +407 −448 lines changed


test/prototype/test_awq.py

Lines changed: 117 additions & 70 deletions
@@ -3,29 +3,28 @@
 #
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
-import os
-from copy import deepcopy
+import copy
+import tempfile
 
 import pytest
 import torch
 
-from torchao.quantization import quantize_
-from torchao.testing.utils import skip_if_rocm
+from torchao.quantization import FbgemmConfig, quantize_
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_3,
     TORCH_VERSION_AT_LEAST_2_5,
 )
 
 if TORCH_VERSION_AT_LEAST_2_3:
-    from torchao.prototype.awq import AWQObservedLinear, awq_uintx, insert_awq_observer_
+    from torchao.prototype.awq import AWQConfig, AWQStep
 
 
 class ToyLinearModel(torch.nn.Module):
     def __init__(self, m=512, n=256, k=128):
         super().__init__()
         self.linear1 = torch.nn.Linear(m, n, bias=False)
         self.linear2 = torch.nn.Linear(n, k, bias=False)
-        self.linear3 = torch.nn.Linear(k, 1, bias=False)
+        self.linear3 = torch.nn.Linear(k, 64, bias=False)
 
     def example_inputs(
         self, batch_size, sequence_length=10, dtype=torch.bfloat16, device="cuda"
@@ -44,36 +43,74 @@ def forward(self, x):
         return x
 
 
-devices = ["cpu", "cuda"]
-# torch.uintx dtypes are introduced in 2.3
-if TORCH_VERSION_AT_LEAST_2_3:
-    qdtypes = (torch.uint4, torch.uint7)
-else:
-    qdtypes = ()
-
-
 @pytest.fixture(autouse=True)
 def run_before_and_after_tests():
     yield
     torch._dynamo.reset()  # reset cache between tests
 
 
-@pytest.mark.parametrize("device", devices)
-@pytest.mark.parametrize("qdtype", qdtypes)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_5, reason="requires nightly pytorch")
-@pytest.mark.skip("Temporarily skipping to unpin nightiles")
-def test_awq_loading(device, qdtype):
-    if qdtype == torch.uint4 and device == "cpu":
-        pytest.skip("uint4 not supported on cpu")
+def test_awq_functionality():
+    device = "cuda"
+    dataset_size = 100
+    l1, l2, l3 = 512, 256, 128
+    original_dtype = torch.bfloat16  # tinygemm kernel only uses bfloat16 inputs
+    group_size = 128
+    n_calibration_examples = 10
+    sequence_length = 5
+
+    m = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
+
+    # baseline quantization
+    base_config = FbgemmConfig(
+        input_dtype=torch.bfloat16,
+        weight_dtype=torch.int4,
+        output_dtype=torch.bfloat16,
+        block_size=[1, group_size],
+        preshuffle=False,
+    )
+    m_baseline = copy.deepcopy(m)
+    quantize_(m_baseline, base_config)
+
+    # awq quantization
+    dataset = m.example_inputs(
+        dataset_size,
+        sequence_length=sequence_length,
+        dtype=original_dtype,
+        device=device,
+    )
+    ref_out = torch.cat([m(d.squeeze(0)) for d in dataset])
+
+    calibration_data = dataset[:n_calibration_examples]
 
+    quant_config = AWQConfig(base_config, step=AWQStep.PREPARE)
+    quantize_(m, quant_config)
+
+    for example in calibration_data:
+        print("device:", example.device)
+        m(example)
+
+    quant_config = AWQConfig(base_config, step=AWQStep.CONVERT)
+    quantize_(m, quant_config)
+
+    awq_out = torch.cat([m(d.squeeze(0)) for d in dataset])
+    baseline_out = torch.cat([m_baseline(d.squeeze(0)) for d in dataset])
+
+    loss_awq = (ref_out - awq_out).pow(2).mean().item()
+    loss_base = (ref_out - baseline_out).pow(2).mean().item()
+    assert loss_awq < loss_base
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_5, reason="requires nightly pytorch")
+def test_awq_loading():
+    device = "cuda"
     dataset_size = 100
     l1, l2, l3 = 512, 256, 128
     original_dtype = torch.bfloat16  # tinygemm kernel only uses bfloat16 inputs
-    quant_dtype = qdtype
     group_size = 128
     n_calibration_examples = 10
-    n_validation_examples = 10
     sequence_length = 5
 
     m = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
@@ -86,56 +123,60 @@ def test_awq_loading(device, qdtype):
     calibration_data = dataset[:n_calibration_examples]
 
     # calibrate
-    insert_awq_observer_(
-        m,
-        n_validation_examples,
-        sequence_length,
-        quant_dtype=quant_dtype,
-        group_size=group_size,
+    base_config = FbgemmConfig(
+        input_dtype=torch.bfloat16,
+        weight_dtype=torch.int4,
+        output_dtype=torch.bfloat16,
+        block_size=[1, group_size],
+        preshuffle=False,
     )
+    quant_config = AWQConfig(base_config, step=AWQStep.PREPARE)
+    quantize_(m, quant_config)
 
     for example in calibration_data:
-        m(example.to(device))
+        m(example)
 
     # quantize
-    is_observed_linear = lambda m, fqn: isinstance(m, AWQObservedLinear)
-    quantize_(
-        m, awq_uintx(quant_dtype=quant_dtype, group_size=group_size), is_observed_linear
-    )
+    quant_config = AWQConfig(base_config, step=AWQStep.CONVERT)
+    quantize_(m, quant_config)
 
-    model_save_path = "awq_model.pth"
-    torch.save(m, model_save_path)
-    loaded_model = torch.load(model_save_path)
-    os.remove(model_save_path)
+    with tempfile.NamedTemporaryFile() as f:
+        torch.save(m.state_dict(), f)
+        f.seek(0)
+        state_dict = torch.load(f)
 
-    if torch.cuda.is_available():
-        m = torch.compile(m, fullgraph=True)
-        loaded_model = torch.compile(loaded_model, fullgraph=True)
+    loaded_model = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
+    loaded_model.load_state_dict(state_dict, assign=True)
 
-    awq_out = torch.cat([m(i.squeeze(0)) for i in dataset])
-    awq_save_load_out = torch.cat([loaded_model(i.squeeze(0)) for i in dataset])
+    m = torch.compile(m, fullgraph=True)
+    loaded_model = torch.compile(loaded_model, fullgraph=True)
+
+    awq_out = torch.cat([m(d.squeeze(0)) for d in dataset])
+    awq_save_load_out = torch.cat([loaded_model(d.squeeze(0)) for d in dataset])
 
     assert awq_out is not None
     assert awq_save_load_out is not None
     assert torch.allclose(awq_out, awq_save_load_out, atol=1e-2)
 
 
-@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_5, reason="requires nightly pytorch")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@skip_if_rocm("ROCm enablement in progress")
-def test_save_weights_only():
+@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_5, reason="requires nightly pytorch")
+def test_awq_loading_vllm():
+    """Simulate weight loading in vllm:
+    * prepare model weight to the same format (awq weight)
+    * use weight.copy_(state_dict["weight"]) to copy over the quantized weights from checkpoint
+
+    There is also a slicing op that is omitted here, overall e2e is tested in tests in vllm repo
+    """
+    device = "cuda"
     dataset_size = 100
     l1, l2, l3 = 512, 256, 128
-    original_dtype = torch.bfloat16
-    quant_dtype = torch.uint4
-    device = "cuda"
+    original_dtype = torch.bfloat16  # tinygemm kernel only uses bfloat16 inputs
    group_size = 128
     n_calibration_examples = 10
-    n_validation_examples = 10
     sequence_length = 5
 
     m = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
-    m2 = deepcopy(m)
     dataset = m.example_inputs(
         dataset_size,
         sequence_length=sequence_length,
@@ -145,35 +186,41 @@ def test_save_weights_only():
     calibration_data = dataset[:n_calibration_examples]
 
     # calibrate
-    insert_awq_observer_(
-        m,
-        n_validation_examples,
-        sequence_length,
-        quant_dtype=quant_dtype,
-        group_size=group_size,
+    base_config = FbgemmConfig(
+        input_dtype=torch.bfloat16,
+        weight_dtype=torch.int4,
+        output_dtype=torch.bfloat16,
+        block_size=[1, group_size],
+        preshuffle=False,
    )
+    quant_config = AWQConfig(base_config, step=AWQStep.PREPARE)
+    quantize_(m, quant_config)
 
     for example in calibration_data:
-        m(example.to(device))
+        m(example)
 
     # quantize
-    is_observed_linear = lambda m, fqn: isinstance(m, AWQObservedLinear)
-    quantize_(
-        m, awq_uintx(quant_dtype=quant_dtype, group_size=group_size), is_observed_linear
-    )
+    quant_config = AWQConfig(base_config, step=AWQStep.CONVERT)
+    quantize_(m, quant_config)
+
+    with tempfile.NamedTemporaryFile() as f:
+        torch.save(m.state_dict(), f)
+        f.seek(0)
+        state_dict = torch.load(f)
+
+    loaded_model = ToyLinearModel(l1, l2, l3).eval().to(original_dtype).to(device)
+    quant_config = AWQConfig(base_config, step=AWQStep.PREPARE_FOR_LOADING)
+    quantize_(loaded_model, quant_config)
 
-    model_save_path = "awq_model.pth"
-    torch.save(m.state_dict(), model_save_path)
-    m2.load_state_dict(
-        torch.load(model_save_path), assign=True
-    )  # load weights only.torch.load(model_save_path)
-    os.remove(model_save_path)
+    loaded_model.linear1.weight.copy_(state_dict["linear1.weight"])
+    loaded_model.linear2.weight.copy_(state_dict["linear2.weight"])
+    loaded_model.linear3.weight.copy_(state_dict["linear3.weight"])
 
     m = torch.compile(m, fullgraph=True)
-    m2 = torch.compile(m2, fullgraph=True)
+    loaded_model = torch.compile(loaded_model, fullgraph=True)
 
-    awq_out = torch.cat([m(i.squeeze(0)) for i in dataset])
-    awq_save_load_out = torch.cat([m2(i.squeeze(0)) for i in dataset])
+    awq_out = torch.cat([m(d.squeeze(0)) for d in dataset])
+    awq_save_load_out = torch.cat([loaded_model(d.squeeze(0)) for d in dataset])
 
     assert awq_out is not None
     assert awq_save_load_out is not None

test/quantization/test_config_serialization.py

Lines changed: 5 additions & 0 deletions
@@ -19,6 +19,10 @@
     config_from_dict,
     config_to_dict,
 )
+from torchao.prototype.awq import (
+    AWQConfig,
+    AWQStep,
+)
 from torchao.quantization.quant_api import (
     FbgemmConfig,
     Float8DynamicActivationFloat8WeightConfig,
@@ -79,6 +83,7 @@
             "linear2": Int8DynamicActivationInt4WeightConfig(),
         }
     ),
+    AWQConfig(Int4WeightOnlyConfig(group_size=128), step=AWQStep.PREPARE_FOR_LOAD),
 ]
 
 if TORCH_VERSION_AT_LEAST_2_6:

torchao/_models/_eval.py

Lines changed: 13 additions & 7 deletions
@@ -57,8 +57,13 @@ def _model_call(self, inps):
 
         max_seq_length = min(max(inps.size()), self.max_length)
         with torch.device(self._device):
-            self._model.setup_caches(self.batch_size, max_seq_length)
+            if hasattr(self._model, "setup_caches"):
+                self._model.setup_caches(self.batch_size, max_seq_length)
         logits = self._model(*input)
+        from transformers.modeling_outputs import CausalLMOutputWithPast
+
+        if isinstance(logits, CausalLMOutputWithPast):
+            logits = logits.logits
         return logits
 
     def run_eval(self, tasks, limit):
@@ -84,7 +89,11 @@ def eot_token_id(self):
         try:
             return self.tokenizer.eos_id()
         except:
-            return self.tokenizer.eos_id
+            try:
+                return self.tokenizer.eos_id
+            except:
+                idx = self.tokenizer.all_special_tokens.index("<|endoftext|>")
+                return self.tokenizer.all_special_ids[idx]
 
     @property
     def max_length(self):
@@ -102,8 +111,8 @@ def batch_size(self):
     def device(self):
         return self._device
 
-    def tok_decode(self, tokens):
-        decoded = self.tokenizer.decode(tokens)
+    def tok_decode(self, tokens, **kwargs):
+        decoded = self.tokenizer.decode(tokens, **kwargs)
         return decoded
 
     def tok_encode(self, string: str, **kwargs):
@@ -115,9 +124,6 @@ def tok_encode(self, string: str, **kwargs):
             tokens = [self.tokenizer.bos_id] + tokens
         return tokens
 
-    def _model_generate(self, context, max_length, eos_token_id):
-        raise Exception("unimplemented")
-
 
 class LMEvalInputRecorder(TransformerEvalWrapper):
     def __init__(

torchao/_models/llama/eval.py

Lines changed: 40 additions & 0 deletions
@@ -237,6 +237,46 @@ def run_evaluation(
             quantize_(
                 model, codebook_weight_only(dtype=torch.uint4, scale_block_size=64)
             )
+        elif quantization.startswith("awq-uintx"):
+            from torchao._models._eval import TransformerEvalWrapper
+            from torchao.utils import TORCH_VERSION_AT_LEAST_2_3
+
+            if not TORCH_VERSION_AT_LEAST_2_3:
+                print("Awq requires torch2.3+")
+                exit()
+            from torchao.prototype.awq import (
+                AWQObservedLinear,
+                awq_uintx,
+                insert_awq_observer_,
+            )
+
+            quant_dtype = quantization.split("-")[1]
+            group_size = int(quantization.split("-")[2])
+            quant_dtype = getattr(torch, quant_dtype, torch.uint8)
+            model = model.to(device)
+            # get calibration data
+            insert_awq_observer_(
+                model, 1, 256, quant_dtype=quant_dtype, group_size=group_size
+            )
+            TransformerEvalWrapper(
+                model=model.to(device),
+                tokenizer=tokenizer,
+                max_seq_length=256,
+                input_prep_func=prepare_inputs_for_model,
+                device=device,
+            ).run_eval(
+                tasks=["wikitext"],
+                limit=1,
+            )
+            is_observed_linear = lambda m, fqn: isinstance(m, AWQObservedLinear)
+            use_hqq = "hqq" in quantization
+            quantize_(
+                model,
+                awq_uintx(
+                    quant_dtype=quant_dtype, group_size=group_size, use_hqq=use_hqq
+                ),
+                is_observed_linear,
+            )
 
     if compile:
         model = torch.compile(model, mode="max-autotune", fullgraph=True)

torchao/core/config.py

Lines changed: 1 addition & 0 deletions
@@ -191,6 +191,7 @@ def config_to_dict(config: AOBaseConfig) -> Dict[str, Any]:
     "torchao.prototype.quantization",
     "torchao.prototype.mx_formats",
     "torchao.dtypes",
+    "torchao.prototype.awq",
 }
 
