Commit 6fec56f

Update on "Add update_quantized_cache op"
Why?
- Functionalization produces a ton of copies.
- Without such custom in-place ops, mutable buffer support results in giant copies at the end.
- Making in-place ops work properly will likely take longer, and there is no clear safe path.

Differential Revision: [D62301838](https://our.internmc.facebook.com/intern/diff/D62301838/)

[ghstack-poisoned]
2 parents ede4406 + d1dcdc6
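To make the motivation concrete, here is a minimal sketch of the difference between a functional cache update, which materializes a copy of the entire cache on every decode step, and an in-place update that only writes the new tokens. The helper names are illustrative; this is not the actual `update_quantized_cache` op or its signature.

    import torch

    def update_cache_functional(k_cache: torch.Tensor, k_val: torch.Tensor, start_pos: int) -> torch.Tensor:
        # Functional form: allocates and returns a tensor the size of the whole
        # cache; after functionalization this becomes a giant copy back into the
        # mutable buffer on every step.
        new_cache = k_cache.clone()
        new_cache[:, start_pos : start_pos + k_val.size(1)] = k_val
        return new_cache

    def update_cache_inplace(k_cache: torch.Tensor, k_val: torch.Tensor, start_pos: int) -> None:
        # In-place form: only the slice holding the new tokens is written,
        # which is what a custom in-place op preserves through export.
        k_cache.narrow(1, start_pos, k_val.size(1)).copy_(k_val)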

4 files changed: +46 -84 lines


examples/models/llama2/export_llama_lib.py

Lines changed: 7 additions & 36 deletions
@@ -214,7 +214,7 @@ def build_args_parser() -> argparse.ArgumentParser:
         "--quantize_kv_cache",
         default=False,
         action="store_true",
-        help="Whether or not to export a model using quantized kv cache",
+        help="Whether or not to export a model using int8 per token quantized kv cache",
     )
     parser.add_argument(
         "--num_sharding",
@@ -455,41 +455,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
     else:
         dtype_override = None
 
-    # source transforms
-    transforms = []
-    if args.quantization_mode:
-        modelname = f"{modelname}_q"
-        transforms.append(
-            get_quant_weight_transform(args, dtype_override, verbose_export())
-        )
-
-    if args.embedding_quantize:
-        modelname = f"{modelname}_e"
-        transforms.append(get_quant_embedding_transform(args))
-
-    if args.expand_rope_table:
-        transforms.append(materialze_broadcast_of_rope_freq_cis)
-
-    if args.use_sdpa_with_kv_cache:
-        transforms.append(replace_sdpa_with_custom_op)
-
-    if args.quantize_kv_cache:
-        assert (
-            args.use_kv_cache and not args.use_sdpa_with_kv_cache
-        ), "quantize_kv_cache requires use_kv_cache=True and use_sdpa_with_kv_cache=False"
-        transforms.append(replace_kv_cache_with_quantized_kv_cache)
-
-    if args.use_kv_cache:
-        if args.qnn:
-            transforms.append(replace_kv_cache_with_simple_kv_cache)
-            transforms.append(replace_sdpa_with_flex_sdpa)
-            transforms.append(replace_causal_mask)
-
-        elif args.coreml or args.mps:
-            # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition
-            # to get free perf gain.
-            transforms.append(replace_sdpa_with_simple_sdpa)
-            transforms.append(replace_causal_mask)
     return (
         _load_llama_model(
             modelname=modelname,
@@ -850,6 +815,12 @@ def _get_source_transforms(  # noqa
     if args.use_sdpa_with_kv_cache:
         transforms.append(replace_sdpa_with_custom_op)
 
+    if args.quantize_kv_cache:
+        assert (
+            args.use_kv_cache and not args.use_sdpa_with_kv_cache
+        ), "quantize_kv_cache requires use_kv_cache=True and use_sdpa_with_kv_cache=False"
+        transforms.append(replace_kv_cache_with_quantized_kv_cache)
+
     if args.use_kv_cache:
         if args.qnn:
             # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils`
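The net effect of this change is that the quantized-KV-cache rewrite now lives alongside the other source transforms in `_get_source_transforms` rather than being applied ad hoc in `_prepare_for_llama_export`. Each entry in `transforms` is a module-to-module callable applied to the eager model before export. As a rough, hypothetical sketch of what such a "replace X with Y" transform tends to look like (the real `replace_kv_cache_with_quantized_kv_cache` lives in source_transformation/quantized_kv_cache.py and differs in detail):

    import torch.nn as nn

    def replace_submodules(model: nn.Module, old_cls: type, make_new) -> nn.Module:
        # Walk the module tree and swap every instance of old_cls with the module
        # returned by make_new(old_module); transforms compose because each one
        # returns the (mutated) model.
        for name, child in model.named_children():
            if isinstance(child, old_cls):
                setattr(model, name, make_new(child))
            else:
                replace_submodules(child, old_cls, make_new)
        return model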

examples/models/llama2/source_transformation/quantized_kv_cache.py

Lines changed: 29 additions & 36 deletions
@@ -23,8 +23,8 @@
 class QuantizedCacheType(Enum):
     AffineSymmetric = 0
     AffineAsymmetric = 1
-    AffineSymmetricGroupWise = 1
-    AffineAsymmetricGroupWise = 2
+    AffineSymmetricGroupWise = 2
+    AffineAsymmetricGroupWise = 3
 
 
 class QuantizedKVCache(nn.Module):
@@ -58,8 +58,12 @@ def __init__(
         else:
             cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)
             scale_shape = (max_batch_size, max_seq_length, n_heads, 1)
-        self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=torch.int8))
-        self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=torch.int8))
+        self.register_buffer(
+            "k_cache", torch.zeros(cache_shape, dtype=self.quantized_cache_dtype)
+        )
+        self.register_buffer(
+            "v_cache", torch.zeros(cache_shape, dtype=self.quantized_cache_dtype)
+        )
         self.register_buffer(
             "k_cache_scales", torch.ones(scale_shape, dtype=torch.double)
         )
@@ -74,43 +78,32 @@ def __init__(
             "v_cache_zero_points", torch.ones(scale_shape, dtype=torch.int64)
         )
 
-    def update(self, input_pos, k_val, v_val):
-        # quantize current k_val and store it in the cache
-        k_scales, k_zero_points = (
+    def _quantize(self, value):
+        scales, zero_points = (
             torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default(
-                k_val, torch.int8  # no other value is supported by this op anyway
+                value, self.quantized_cache_dtype
             )
         )
-        quantized_k_val = torch.ops.quantized_decomposed.quantize_per_token(
-            k_val,
-            k_scales,
-            k_zero_points,
-            torch.iinfo(torch.int8).min,
-            torch.iinfo(torch.int8).max,
-            torch.int8,
+        quantized_value = torch.ops.quantized_decomposed.quantize_per_token(
+            value,
+            scales,
+            zero_points,
+            torch.iinfo(self.quantized_cache_dtype).min,
+            torch.iinfo(self.quantized_cache_dtype).max,
+            self.quantized_cache_dtype,
         )
+        return quantized_value, scales, zero_points
 
-        v_scales, v_zero_points = (
-            torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric(
-                v_val, torch.int8
-            )
-        )
-        quantized_v_val = torch.ops.quantized_decomposed.quantize_per_token(
-            v_val,
-            v_scales,
-            v_zero_points,
-            torch.iinfo(torch.int8).min,
-            torch.iinfo(torch.int8).max,
-            torch.int8,
-        )
+    def update(self, input_pos, k_val, v_val):
+        # quantize current k_val and store it in the cache
+        quantized_k_val, k_scales, k_zero_points = self._quantize(k_val)
+
+        quantized_v_val, v_scales, v_zero_points = self._quantize(v_val)
 
         if self.enable_dynamic_shape:
             start_pos = input_pos[0].item()
             torch._check_is_size(start_pos)
-            if self.is_transposed:
-                dim_to_slice = 2
-            else:
-                dim_to_slice = 1
+            dim_to_slice = 2 if self.is_transposed else 1
             torch._check(start_pos < self.k_cache.size(dim_to_slice))
             seq_length = k_val.size(dim_to_slice)
             narrowed_k = self.k_cache.narrow(dim_to_slice, start_pos, seq_length)
@@ -154,17 +147,17 @@ def update(self, input_pos, k_val, v_val):
             self.k_cache,
             self.k_cache_scales,
             self.k_cache_zero_points,
-            torch.iinfo(torch.int8).min,
-            torch.iinfo(torch.int8).max,
+            torch.iinfo(self.quantized_cache_dtype).min,
+            torch.iinfo(self.quantized_cache_dtype).max,
             self.quantized_cache_dtype,
             self.cache_fp_type,
         )
         v_out = torch.ops.quantized_decomposed.dequantize_per_token(
             self.v_cache,
             self.v_cache_scales,
             self.v_cache_zero_points,
-            torch.iinfo(torch.int8).min,
-            torch.iinfo(torch.int8).max,
+            torch.iinfo(self.quantized_cache_dtype).min,
+            torch.iinfo(self.quantized_cache_dtype).max,
             self.quantized_cache_dtype,
             self.cache_fp_type,
         )
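The refactor folds the duplicated K and V quantization into a single `_quantize` helper and threads `self.quantized_cache_dtype` through instead of hard-coding `torch.int8`. A standalone sketch of the same per-token asymmetric round trip, assuming the `quantized_decomposed` ops are registered (e.g. by importing `torch.ao.quantization.fx._decomposed`):

    import torch
    from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401  (registers the ops)

    def per_token_roundtrip(value: torch.Tensor, dtype: torch.dtype = torch.int8):
        # One (scale, zero_point) pair per token; scales come back as float64
        # and zero_points as int64, matching the buffer dtypes registered above.
        scales, zero_points = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric(
            value, dtype
        )
        quantized = torch.ops.quantized_decomposed.quantize_per_token(
            value, scales, zero_points, torch.iinfo(dtype).min, torch.iinfo(dtype).max, dtype
        )
        dequantized = torch.ops.quantized_decomposed.dequantize_per_token(
            quantized, scales, zero_points, torch.iinfo(dtype).min, torch.iinfo(dtype).max, dtype, value.dtype
        )
        return quantized, scales, zero_points, dequantized

    # e.g. q, s, zp, dq = per_token_roundtrip(torch.randn(1, 4, 8, 16))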

kernels/quantized/cpu/op_dequantize.cpp

Lines changed: 10 additions & 8 deletions
@@ -394,17 +394,19 @@ Tensor& dequantize_per_token_out(
   for (size_t i = 0; i < input.dim() - 1; i++) {
     num_channels *= input.size(i);
   }
-  // This unfortunate change is needed because we compile op_quantize for aten
-  // mode as well
+  // This unfortunate change is needed because we compile op_quantize for aten
+  // mode as well
+  std::array<exec_aten::SizesType, 2> input_sizes;
+  input_sizes[0] = static_cast<exec_aten::SizesType>(num_channels);
+  input_sizes[1] =
+      static_cast<exec_aten::SizesType>(input.size(input.dim() - 1));
 #ifdef USE_ATEN_LIB
-  const std::array<int64_t, 2> sizes = {{num_channels, input.dim() - 1}};
   Tensor reshaped_input = at::from_blob(
-      input.mutable_data_ptr(), sizes, at::TensorOptions(input.scalar_type()));
+      input.mutable_data_ptr(),
+      input_sizes,
+      at::TensorOptions(input.scalar_type()));
 #else
   std::array<exec_aten::DimOrderType, 2> input_dim_order{0, 1};
-  std::array<exec_aten::SizesType, 2> input_sizes;
-  input_sizes[0] = num_channels;
-  input_sizes[1] = input.size(input.dim() - 1);
   std::array<exec_aten::StridesType, 2> input_strides;
   dim_order_to_stride_nocheck(
       input_sizes.data(), input_dim_order.data(), 2, input_strides.data());
@@ -428,7 +430,7 @@ Tensor& dequantize_per_token_out(
       reshaped_input,
       scale,
       zero_points,
-      0,
+      0, /* axis */
       quant_min,
       quant_max,
       dtype,
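The kernel change computes `input_sizes = {num_channels, input.size(input.dim() - 1)}` once, before the `#ifdef`, so both the ATen and the portable builds view the input as the same 2-D (tokens x channels) tensor before dequantizing; it also fixes the old ATen-only path, which built the shape from `input.dim() - 1` rather than the size of the last dimension. In Python terms the equivalent flattening is roughly (an illustrative sketch, not the kernel):

    import math
    import torch

    def view_as_tokens_by_channels(x: torch.Tensor) -> torch.Tensor:
        # Collapse every leading dim into one "token" axis so that per-token
        # scales and zero-points index rows of a 2-D view.
        num_tokens = math.prod(x.shape[:-1])
        return x.reshape(num_tokens, x.shape[-1])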

kernels/quantized/test/test_quant_dequant_per_token.py

Lines changed: 0 additions & 4 deletions
@@ -9,14 +9,10 @@
 import unittest
 
 import torch
-from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib
 
 
 class QuantizePerTokenTest(unittest.TestCase):
 
-    def setUp(self):
-        pass
-
     def test_quantize_per_token(self):
         input_tensor = torch.tensor(
             [[-0.5, 0.3, 1.2], [0.1, -0.8, 2.1], [-5, 1, 2]], dtype=torch.float32
