@@ -21,6 +21,7 @@
     apply_dynamic_quant,
     apply_weight_only_int8_quant,
     change_linear_weights_to_dqtensors,
+    change_linear_weights_to_woqtensors,
     _replace_with_custom_fn_if_matches_filter,
 )
 from torchao.quantization.quant_primitives import (
@@ -42,6 +43,7 @@
 )
 from torchao.quantization.subclass import (
     DynamicallyQuantizedLinearWeight,
+    WeightOnlyQuantizedLinearWeight
 )
 from torchao.quantization.utils import (
     apply_logging_hook,
@@ -51,6 +53,10 @@
     LoggingTensorMode,
 )
 from torch.ao.quantization.quantize_fx import convert_to_reference_fx, prepare_fx
+from torchao.quantization.weight_only import (
+    WeightOnlyInt8QuantLinear
+)
+
 
 torch.manual_seed(0)
 
@@ -782,84 +788,62 @@ def test_qlinear_per_channel_numerics_cuda(self):
 
 
 class TestSubclass(unittest.TestCase):
-    def test_dq_lin_weight_subclass_aot(self):
-        m, k, n = 32, 64, 32
-        x = torch.randn(m, k, device="cuda", dtype=torch.float32)
-        lin = torch.nn.Linear(k, n, device="cuda")
-
-        import copy
-
-        linq = DynamicallyPerAxisQuantizedLinear.from_float(copy.deepcopy(lin))
-
-        ref_f = lin(x)
-        ref_q = linq(x)
-
-        print(SQNR(ref_f, ref_q), "float to dq")
-
-        lin.weight = torch.nn.Parameter(
-            DynamicallyQuantizedLinearWeight.from_float(lin.weight), requires_grad=False
-        )
-        test = lin(x)
-        print(SQNR(ref_f, test), "float to dq class")
-        print(SQNR(ref_q, test), "dq to dq class")
-        assert SQNR(ref_f, test) > 35
-        assert SQNR(ref_q, test) > 35
-
-        lin_comp = torch.compile(lin, backend="aot_eager")
-        linq_comp = torch.compile(linq, backend="aot_eager")
-        test_comp = lin_comp(x)
-        ref_q_comp = linq_comp(x)
-        print(SQNR(ref_f, test_comp), "float to dq class compiled")
-        print(SQNR(ref_q_comp, test_comp), "dq compiled to dq class compiled")
-        assert SQNR(ref_f, test_comp) > 35
-        assert SQNR(ref_q_comp, test_comp) > 35
-
-    def test_dq_lin_weight_subclass_max_autotune(self):
-        m, k, n = 32, 64, 32
-        x = torch.randn(m, k, device="cuda", dtype=torch.float32)
-        lin = torch.nn.Linear(k, n, device="cuda")
-
-        import copy
-
-        linq = DynamicallyPerAxisQuantizedLinear.from_float(copy.deepcopy(lin))
-
-        ref_f = lin(x)
-        ref_q = linq(x)
+    def _test_lin_weight_subclass_impl(self,
+        test_subclass,
+        min_sqnr=35,
+        test_dtypes=[torch.float32, torch.float16, torch.bfloat16],
+        test_shape=[32, 64, 32]
+    ):
+        for test_dtype in test_dtypes:
+            m, k, n = test_shape
+            x = torch.randn(m, k, device="cuda", dtype=test_dtype)
+            lin = torch.nn.Linear(k, n, device="cuda").to(test_dtype)
+            ref_f = lin(x)
+
+            lin.weight = torch.nn.Parameter(
+                test_subclass.from_float(lin.weight), requires_grad=False
+            )
+            test = lin(x)
+            self.assertGreater(SQNR(ref_f, test), min_sqnr, f"{test_subclass.__name__} failed, no compile, dtype={test_dtype}, (m, k, n)={test_shape}")
+            lin_comp = torch.compile(lin, mode='max-autotune')
+            test_comp = lin_comp(x)
+            self.assertGreater(SQNR(ref_f, test_comp), min_sqnr, f"{test_subclass.__name__} failed at compile with dtype={test_dtype}, (m, k, n)={test_shape}")
 
-        print(SQNR(ref_f, ref_q), "float to dq")
+    def test_int8_dynamic_quant_subclass(self):
+        self._test_lin_weight_subclass_impl(DynamicallyQuantizedLinearWeight, 35)
 
-        lin.weight = torch.nn.Parameter(
-            DynamicallyQuantizedLinearWeight.from_float(lin.weight), requires_grad=False
-        )
-        test = lin(x)
-        print(SQNR(ref_f, test), "float to dq class")
-        print(SQNR(ref_q, test), "dq to dq class")
-        assert SQNR(ref_f, test) > 35
-        assert SQNR(ref_q, test) > 35
-
-        lin_comp = torch.compile(lin, mode="max-autotune")
-        linq_comp = torch.compile(linq, mode="max-autotune")
-
-        test_comp = lin_comp(x)
-        ref_q_comp = linq_comp(x)
-        print(SQNR(ref_f, test_comp), "float to dq class compiled")
-        print(SQNR(ref_q_comp, test_comp), "dq compiled to dq class compiled")
-        assert SQNR(ref_f, test_comp) > 35
-        assert SQNR(ref_q_comp, test_comp) > 35
+    def test_int8_weight_only_quant_subclass(self):
+        self._test_lin_weight_subclass_impl(WeightOnlyQuantizedLinearWeight, 40)
 
     @torch.no_grad()
-    def test_dq_lin_weight_subclass_max_autotune_api(self):
-        m, k, n = 32, 64, 32
-        x = torch.randn(m, k, device="cuda", dtype=torch.float32)
-
-        mod = nn.Sequential(
-            nn.Linear(k, n, device="cuda"), nn.ReLU(), nn.Linear(n, n, device="cuda")
-        )
-        change_linear_weights_to_dqtensors(mod)
-        mod_qc = torch.compile(mod, mode="max-autotune")
-        mod_qc(x)
-        mod_qc(x)
-
+    def _test_lin_weight_subclass_api_impl(
+        self,
+        api,
+        min_sqnr=35,
+        test_dtypes=[torch.float32, torch.float16, torch.bfloat16],
+        test_shape=[32, 64, 32]
+    ):
+        for test_dtype in test_dtypes:
+            m, k, n = test_shape
+            x = torch.randn(m, k, device="cuda", dtype=test_dtype)
+            mod = nn.Sequential(
+                nn.Linear(k, n, device="cuda"), nn.ReLU(), nn.Linear(n, n, device="cuda")
+            ).to(test_dtype)
+            ref_f = mod(x)
+            api(mod)
+            test = mod(x)
+            self.assertGreater(SQNR(ref_f, test), min_sqnr, f"{api.__name__} failed, no compile dtype={test_dtype}, (m, k, n)={test_shape}")
+
+            mod_qc = torch.compile(mod, mode="max-autotune")
+            test_comp = mod_qc(x)
+            self.assertGreater(SQNR(ref_f, test_comp), min_sqnr, f"{api.__name__} failed when compiled with dtype={test_dtype}, (m, k, n)={test_shape}")
+
+
+    def test_int8_dynamic_quant_subclass_api(self):
+        self._test_lin_weight_subclass_api_impl(change_linear_weights_to_dqtensors, 35)
+
+    def test_int8_weight_only_quant_subclass_api(self):
+        self._test_lin_weight_subclass_api_impl(change_linear_weights_to_woqtensors, 40)
 
 class TestDynamicQuant(unittest.TestCase):
     def test_dynamic_quant(self):
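For reference, a minimal sketch of how the two module-level entry points exercised by the new subclass API tests might be applied outside the test harness. This is illustrative and not part of the diff; it assumes the helpers are importable from torchao.quantization.quant_api, and the toy model and shapes are made up.

import torch
import torch.nn as nn
# Assumed import path; these names appear in the imports patched above.
from torchao.quantization.quant_api import (
    change_linear_weights_to_dqtensors,
    change_linear_weights_to_woqtensors,
)

# Illustrative toy model; any module containing nn.Linear layers works.
mod = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 32)).cuda()
x = torch.randn(16, 64, device="cuda")

# Swap nn.Linear weights in place for quantized tensor subclasses:
#   change_linear_weights_to_dqtensors  -> int8 dynamically quantized weights
#   change_linear_weights_to_woqtensors -> int8 weight-only quantized weights
change_linear_weights_to_woqtensors(mod)

# The swapped module still runs eagerly and can be compiled, as in the tests above.
mod_qc = torch.compile(mod, mode="max-autotune")
out = mod_qc(x)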