Commit 27bf5bf

Merge pull request #9 from pytorch-labs/gh/HDCharles/1/base
Documentation Updates
2 parents bf3659d + a312372 commit 27bf5bf

File tree: 9 files changed (+173 -87 lines)

README.md

Lines changed: 69 additions & 6 deletions
@@ -29,29 +29,92 @@ torchao 0.0.1 <install dir>
 
 Relevant APIs can be found in torchao.quantization.quant_api
 
+Note: While these techniques are designed to improve model performance, in some cases the opposite can occur.
+This is because quantization adds overhead to the model that is hopefully made up for by faster matmuls (dynamic quantization) or faster weight loading (weight-only quantization). If your matmuls are small enough, or your non-quantized performance isn't bottlenecked by weight load time, these techniques may reduce performance.
+
+### A8W8 Dynamic Quantization
+
+Similar to the weight-only API below, the `apply_dynamic_quant` function swaps all
+linear modules to dynamically quantized linear modules.
+
+Example
+
+```
+
+# some user model and example input
+...
+
+# convert linear modules to quantized linear modules
+quant_api.apply_dynamic_quant(model)
+
+# compile the model to improve performance
+...
+```
+
+This technique works best when the `torch._inductor.config.force_fuse_int_mm_with_mul` option is enabled. This allows fusion of the int8*int8 -> int32 matmul and the subsequent mul op, thereby avoiding materialization of the int32 intermediate tensor.
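For reference, a minimal end-to-end sketch of the recipe above, with the elided model and input filled in by toy placeholders (the one-layer model and the explicit `torch._inductor.config` import are illustrative assumptions, not part of this commit):

```
import torch
import torch._inductor.config
from torchao.quantization import quant_api

# allow inductor to fuse the int8 matmul with the subsequent mul ops
torch._inductor.config.force_fuse_int_mm_with_mul = True

# toy stand-ins for "some user model and example input"
model = torch.nn.Sequential(torch.nn.Linear(32, 64)).cuda().to(torch.bfloat16)
input = torch.randn(32, 32, dtype=torch.bfloat16, device='cuda')

# convert linear modules to dynamically quantized linear modules
quant_api.apply_dynamic_quant(model)

# compile the model to improve performance (keep the compiled module)
model = torch.compile(model, mode='max-autotune')
model(input)
```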
+
+### A16W8 WeightOnly Quantization
+
+The `apply_weight_only_int8_quant` function swaps all
+linear modules to weight-only quantized linear modules.
+
 Example
 
 ```
 import torch
 from torchao.quantization import quant_api
 
-# some user model
+# some user model and example input
 model = torch.nn.Sequential(torch.nn.Linear(32, 64)).cuda().to(torch.bfloat16)
-# some example input
 input = torch.randn(32,32, dtype=torch.bfloat16, device='cuda')
 
 # convert linear modules to quantized linear modules
-# insert quantization method/api of choice
 quant_api.apply_weight_only_int8_quant(model)
-# quant_api.apply_dynamic_quant(model)
-# quant_api.change_linear_weights_to_dqtensors(model)
 
 # compile the model to improve performance
 torch.compile(model, mode='max-autotune')
 model(input)
 ```
 
-### A16W8 WeightOnly Quantization
+This technique works best when the `torch._inductor.config.use_mixed_mm` option is enabled. This avoids dequantizing the weight tensor before the matmul, instead fusing the dequantization into the matmul, thereby avoiding materialization of a large floating point weight tensor.
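As a small sketch, the flag can be flipped before compiling the example above (assuming `torch._inductor.config` needs an explicit import, as it usually does):

```
import torch._inductor.config

# fuse weight dequantization into the matmul instead of
# materializing a full floating point weight tensor
torch._inductor.config.use_mixed_mm = True
```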
+
+## Other APIs
+
+### A8W8 Dynamic Quantization by subclasses
+
+You can use [tensor subclasses](https://pytorch.org/docs/stable/notes/extending.html#subclassing-torch-tensor) to do dynamic quantization with the `change_linear_weights_to_dqtensors` function, using the exact same quantization as above. This avoids modifying the model graph and can be more composable with
+other techniques.
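A minimal usage sketch for this API, mirroring the weight-only example above (the toy model and input are placeholders):

```
import torch
from torchao.quantization import quant_api

model = torch.nn.Sequential(torch.nn.Linear(32, 64)).cuda().to(torch.bfloat16)
input = torch.randn(32, 32, dtype=torch.bfloat16, device='cuda')

# swap each linear weight for a DynamicallyQuantizedLinearWeight tensor subclass;
# the nn.Linear modules themselves are left untouched
quant_api.change_linear_weights_to_dqtensors(model)

model = torch.compile(model, mode='max-autotune')
model(input)
```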
+
+### A8W8 Dynamic Quantization with Smoothquant
+
+We've also implemented a version of [smoothquant](https://arxiv.org/abs/2211.10438) with the same GEMM format as above.
+Because it requires calibration, the API is slightly more complicated.
+
+Example
+
+```
+import torch
+from torchao.quantization.smoothquant import swap_linear_with_smooth_fq_linear, smooth_fq_linear_to_inference
+
+# some user model
+model = get_model()
+
+# convert linear modules to smoothquant
+# linear modules in calibration mode
+swap_linear_with_smooth_fq_linear(model)
+
+# calibration
+for i in range(calibration_amount):
+    input = get_input()
+    model(input)
+
+# set it to inference mode
+smooth_fq_linear_to_inference(model)
+
+# compile the model to improve performance
+torch.compile(model, mode='max-autotune')
+model(input)
+```
 
 ## License
 
test/test.py

Lines changed: 2 additions & 2 deletions
@@ -21,6 +21,7 @@
     apply_dynamic_quant,
     apply_weight_only_int8_quant,
     change_linear_weights_to_dqtensors,
+    _replace_with_custom_fn_if_matches_filter,
 )
 from torchao.quantization.quant_primitives import (
     dequantize_per_channel,
@@ -35,7 +36,6 @@
 
 from torchao.quantization.smoothquant import (
     get_scale,
-    replace_with_custom_fn_if_matches_filter,
     smooth_fq_linear_to_inference,
     SmoothFakeDynamicallyQuantizedLinear,
     swap_linear_with_smooth_fq_linear,
@@ -284,7 +284,7 @@ def test_selective_torch_compile(self):
         x = torch.randn(4, 4)
         y_ref = m(x)
 
-        replace_with_custom_fn_if_matches_filter(
+        _replace_with_custom_fn_if_matches_filter(
             m,
             lambda mod: torch.compile(mod),
             lambda mod, fqn: isinstance(mod, nn.Linear) and fqn != "1.0",

torchao/quantization/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@
 
 __all__ = [
     "DynamicallyPerAxisQuantizedLinear",
-    "replace_with_custom_fn_if_matches_filter",
     "apply_weight_only_int8_quant",
     "apply_dynamic_quant",
     "change_linear_weights_to_dqtensors",

torchao/quantization/dynamic_quant.py

Lines changed: 12 additions & 28 deletions
@@ -16,56 +16,45 @@
 
 class DynamicallyPerAxisQuantizedLinear(torch.nn.Linear):
     """
-    This class is a replacement for `torch.nn.Linear`, implementing dynamic quantization on
-    the input across all axes except for the last axis.
+    This class is a replacement for `torch.nn.Linear`. It implements a
+    quantized matmul using int8 dynamic symmetric per-token activation,
+    and int8 symmetric per-channel weight quantization
     """
 
     def __init__(
         self,
         in_features: int,
         out_features: int,
         bias: bool = True,
-        use_fused_int_mm=False,
     ) -> None:
         super().__init__(in_features, out_features, bias)
-        self.use_fused_int_mm = use_fused_int_mm
-        # note: enabling use_fused_int_mm = True has best perf when additionally setting
-        # torch._inductor.config.force_fuse_int_mm_with_mul = True
 
     def forward(self, X: torch.Tensor) -> torch.Tensor:
         """
-        Performs the forward pass of the quantized linear layer.
-
-        This method applies dynamic quantization to the input tensor across all axes except
-        the last axis using the `quant_int8_dynamic_per_token_linear` function.
+        Performs the forward pass of the quantized linear layer which consists
+        of int8 dynamic symmetric per-token activation and int8 symmetric per-channel weight
+        quantization
 
         Args:
-            X (torch.Tensor): The input tensor to the quantized linear layer.
+            X (torch.Tensor): The input floating point tensor to the quantized linear layer.
 
         Returns:
-            torch.Tensor: The output tensor after the quantized matmul and rescale.
+            torch.Tensor: The output floating point tensor after the quantized matmul and rescale.
 
         """
-        # The following line mimics the behavior of SmoothFakeDynamicallyQuantizedLinear
-        if not self.use_fused_int_mm:
-            X = X / self.fake_rescale
-        # somehow the inductor fusion that occurs for most transformer models
-        # when this module has an additional div op is faster than when it doesn't
-        # have it although the memory usage is slightly higher. fake_rescale is scalar 1
-        # so it doesn't affect accuracy
+
         Y = quant_int8_dynamic_per_token_linear(
             X, self.W_int_repr_t, self.W_scales, self.bias, X.dtype
         )
         return Y
 
     @classmethod
     def from_float(
-        cls, mod: torch.nn.Linear, use_fused_int_mm=False
+        cls, mod: torch.nn.Linear
     ) -> "DynamicallyPerAxisQuantizedLinear":
         """
-        Converts a `mod` of class `torch.nn.Linear` to the dynamically quantized version of it.
-
-        Note: this class does not require calibration.
+        Converts a `mod` of class `torch.nn.Linear` to the
+        `DynamicallyPerAxisQuantizedLinear` class
 
         Args:
             mod (torch.nn.Linear): The original `torch.nn.Linear` module to convert.
@@ -81,7 +70,6 @@ def from_float(
             fake_in_features,
             fake_out_features,
             bias=mod.bias is not None,
-            use_fused_int_mm=use_fused_int_mm,
         )
         new_mod.in_features = mod.in_features
         new_mod.out_features = mod.out_features
@@ -91,10 +79,6 @@ def from_float(
         new_mod.register_buffer("W_int_repr_t", W_int_repr.contiguous().t())
         new_mod.W_scales = nn.Parameter(W_scales)
         new_mod.bias = mod.bias
-        if not use_fused_int_mm:
-            new_mod.fake_rescale = torch.tensor(
-                [1.0], dtype=mod.weight.dtype, device=mod.weight.device
-            )
         del new_mod.weight
 
         device_to_use = next(mod.parameters()).device
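For context on the simplified signature, a small usage sketch of the swapped module (assuming the class is exported from `torchao.quantization` as its `__all__` suggests; the layer size and input shape are placeholders):

```
import torch
from torchao.quantization import DynamicallyPerAxisQuantizedLinear

# swap a single float linear layer for its dynamically quantized counterpart
lin = torch.nn.Linear(32, 64).cuda().to(torch.bfloat16)
qlin = DynamicallyPerAxisQuantizedLinear.from_float(lin)

x = torch.randn(8, 32, dtype=torch.bfloat16, device='cuda')
# int8 per-token activation x int8 per-channel weight matmul, rescaled back to bf16
y = qlin(x)
```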

torchao/quantization/quant_api.py

Lines changed: 28 additions & 18 deletions
@@ -5,10 +5,14 @@
 # LICENSE file in the root directory of this source tree.
 
 """
-Quantization API stuff which is not specific to SmoothQuant
+Quantization APIs
 
-Note: this is throwaway code for fast results on Blueberry, this is not
-intended to be the actual long term quantization API for server GPUs.
+Generally these APIs can be applied directly to any model
+with Linear modules to obtain quantized linear ops. The intended
+usage involves applying torch.compile to the model afterwards
+both because primitives were designed based on the fusions that
+come along with it and because that is how we access the intended quantized
+and mixed GEMM kernels
 """
 
 import torch
@@ -23,14 +27,13 @@
 )
 
 __all__ = [
-    "replace_with_custom_fn_if_matches_filter",
     "apply_weight_only_int8_quant",
     "apply_dynamic_quant",
     "change_linear_weights_to_dqtensors",
 ]
 
 
-def replace_with_custom_fn_if_matches_filter(
+def _replace_with_custom_fn_if_matches_filter(
     model, replacement_fn, filter_fn, cur_fqn=""
 ) -> None:
     """
@@ -47,34 +50,41 @@ def replace_with_custom_fn_if_matches_filter(
             new_child = replacement_fn(child)
             setattr(model, name, new_child)
         else:
-            replace_with_custom_fn_if_matches_filter(
+            _replace_with_custom_fn_if_matches_filter(
                 child, replacement_fn, filter_fn, new_fqn
             )
-
-
 def apply_weight_only_int8_quant(model):
-    replace_with_custom_fn_if_matches_filter(
+    """
+    Applies weight-only symmetric per-channel int8 quantization to all linear layers
+    in the given model using module swaps.
+    """
+    _replace_with_custom_fn_if_matches_filter(
         model,
         WeightOnlyInt8QuantLinear.from_float,
         lambda mod, fqn: isinstance(mod, torch.nn.Linear),
     )
-
-
-def apply_dynamic_quant(model, use_fused_int_mm=0):
-    replace_with_custom_fn_if_matches_filter(
+def apply_dynamic_quant(model):
+    """
+    Applies dynamic symmetric per-token activation and per-channel weight
+    quantization to all linear layers in the given model using
+    module swaps.
+    """
+    _replace_with_custom_fn_if_matches_filter(
         model,
-        lambda mod: DynamicallyPerAxisQuantizedLinear.from_float(mod, use_fused_int_mm),
+        lambda mod: DynamicallyPerAxisQuantizedLinear.from_float(mod),
         lambda mod, fqn: isinstance(mod, torch.nn.Linear),
     )
-
-
 def change_linear_weights_to_dqtensors(model):
+    """
+    Converts all linear weight tensors to the `DynamicallyQuantizedLinearWeight`
+    Tensor subclass, effectively applying the same form of quantization
+    as apply_dynamic_quant while not modifying the linear modules.
+    """
     def insert_subclass(lin):
         lin.weight = torch.nn.Parameter(
             DynamicallyQuantizedLinearWeight.from_float(lin.weight), requires_grad=False
         )
         return lin
-
-    replace_with_custom_fn_if_matches_filter(
+    _replace_with_custom_fn_if_matches_filter(
        model, insert_subclass, lambda mod, fqn: isinstance(mod, torch.nn.Linear)
     )
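Although now private, the renamed helper is what the module-swap APIs above are built on. A sketch of the selective-replacement pattern, loosely following the usage in test/test.py (the small `nn.Sequential` and the compile-every-Linear filter are illustrative placeholders):

```
import torch
import torch.nn as nn
from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter

m = nn.Sequential(nn.Linear(4, 4), nn.Sequential(nn.Linear(4, 4), nn.ReLU()))

# compile each nn.Linear in place; the filter also receives the fully
# qualified name, so replacements can be limited to specific submodules
_replace_with_custom_fn_if_matches_filter(
    m,
    lambda mod: torch.compile(mod),
    lambda mod, fqn: isinstance(mod, nn.Linear),
)
y = m(torch.randn(4, 4))
```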

torchao/quantization/quant_primitives.py

Lines changed: 3 additions & 16 deletions
@@ -303,14 +303,13 @@ def quant_int8_dynamic_per_token_linear(
     w_vals_int8_t,
     w_scales,
     bias,
-    out_dtype=torch.float32,
-    use_fused_int_mm=0,
+    out_dtype,
 ):
     # like F.linear, but with int8 dynamic quantization of activation,
     # and a quantized weight
     x_vals_int8, x_scales = quantize_activation_per_token_absmax(x)
     mm_out = quant_int8_per_token_matmul(
-        x_vals_int8, x_scales, w_vals_int8_t, w_scales, out_dtype, use_fused_int_mm
+        x_vals_int8, x_scales, w_vals_int8_t, w_scales, out_dtype
     )
     if bias is not None:
         mm_out += bias
@@ -323,7 +322,6 @@ def quant_int8_per_token_matmul(
     w_vals_int8_t,
     w_scales,
     output_dtype=torch.float32,
-    use_fused_int_mm=0,
 ):
     # Quantized matmul of int8 operands that accumulates to int32 and returns
     # output_dtype. For now, this is written for approximate numerical
@@ -355,18 +353,6 @@ def quant_int8_per_token_matmul(
     #
 
     tmp = x_vals_int8.reshape(-1, x_vals_int8.shape[-1])
-    # these branches use external triton fused_int_mm kernel's which fuse either 1 or 2 mul operations
-    if use_fused_int_mm == 2:
-        y = torch.ops.custom_int_mm.int_mm_dequant(
-            tmp, w_vals_int8_t, x_scales.view(-1, 1), w_scales, output_dtype
-        ).reshape(*x_vals_int8.shape[:-1], -1)
-        return y
-    elif use_fused_int_mm == 1:
-        y = torch.ops.custom_int_mm.int_mm_one_mul(
-            tmp, w_vals_int8_t, x_scales.view(-1, 1), output_dtype
-        ).reshape(*x_vals_int8.shape[:-1], -1)
-        y = y * w_scales
-        return y.to(output_dtype)
     y_dot_int32 = safe_int_mm(tmp, w_vals_int8_t)
 
     #
@@ -381,6 +367,7 @@ def quant_int8_per_token_matmul(
         torch.float,
         torch.bfloat16,
     ], f"x_scales needs to be a torch.float32 or torch.bfloat16 but got {x_scales.dtype}"
+
     y = (y_dot_int32 * x_scales.view(-1, 1) * w_scales).reshape(
         *x_vals_int8.shape[:-1], -1
     )
