
Commit 3468f0c

Update on " [ExecuTorch][BE] Split kv cache and SDPA for better code sharing"
Summary: Why? We have coupled SDPA with kv cache for a while. Initially this was done as we implemented sdpa_with_kv_cache custom op to reduce multiple copy overheads from kv cache update. (This could have been done by having separate custom kv cache update and custom sdpa op. Recent changes enabled this.) As a result of SDPA module owning kv cache, we get a) non-composable implementation and b) harder to reuse model definition and components from repos like tune. Output of this is that we have multiple definition of the same model, llama, lying around in ET, TorchChat and Tune. This diff and subsequent ones will try to move in the direction where custom kv cache and custom sdpa become decoupled and composable, making it more module-swap friendly with tune's model definition. How. Earlier PRs decoupled kv cache update from sdpa. So now 1. Decouple SDPA nn.Module from KV cache. 2. Standardize on KVCache and SDPA interface. That is KVCache and SDPA both operate on q, k, v in [B, # heads, seq_len, head_dim] formatted tensors. 3. 2 will introduce multiple tranposes when KVCache and SDPA are replaced by custom modules, but we will write graph pass to undo those. Test Plan: Existing tests. Make sure perf doesnt regress Differential Revision: [D67914054](https://our.internmc.facebook.com/intern/diff/D67914054) [ghstack-poisoned]
2 parents 3a6b545 + 148354d
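For context on where the refactor is headed, below is a minimal sketch of decoupled, standardized KVCache and SDPA modules. The names SimpleKVCache and SimpleSDPA are illustrative placeholders rather than ExecuTorch's actual module definitions; the point is only that both operate on [B, n_heads, seq_len, head_dim] tensors and neither owns the other.

# Illustrative sketch only: placeholder modules showing the standardized interface
# described in the summary; q, k, v are [B, n_heads, seq_len, head_dim] tensors.
import torch
import torch.nn as nn
import torch.nn.functional as F


class SimpleKVCache(nn.Module):
    def __init__(self, max_batch_size, n_heads, max_seq_length, head_dim):
        super().__init__()
        cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)
        self.register_buffer("k_cache", torch.zeros(cache_shape))
        self.register_buffer("v_cache", torch.zeros(cache_shape))

    def update(self, input_pos, k_val, v_val):
        # Write the new slices at the given positions along the sequence dimension
        # and return the full caches for attention over all positions so far.
        self.k_cache[:, :, input_pos] = k_val
        self.v_cache[:, :, input_pos] = v_val
        return self.k_cache, self.v_cache


class SimpleSDPA(nn.Module):
    def forward(self, q, k, v, mask=None):
        # Pure attention: no cache ownership, so it can be swapped independently.
        return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

An attention block then composes the two: it calls the cache's update() first and runs SDPA over the returned full k/v, which is what makes either module independently replaceable by a custom or quantized variant.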

File tree: 2 files changed, +31 -19 lines changed

.ci/scripts/test_llama.sh

Lines changed: 2 additions & 0 deletions
@@ -112,6 +112,8 @@ fi

 if [[ "${MODE}" =~ .*quantize_kv.* ]]; then
   QUANTIZE_KV_CACHE=ON
+  # quantize_kv cache transform uses custom kv cache update op
+  CUSTOM=ON
 else
   QUANTIZE_KV_CACHE=OFF
 fi
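The new CUSTOM=ON line encodes a dependency rather than a preference: the quantized KV cache transform's custom path calls torch.ops.llama.update_cache, which only resolves once the ExecuTorch custom-op library has been built and loaded. A small Python check of that dependency (a sketch, not taken from the repo):

import torch


def custom_update_cache_available() -> bool:
    # Resolving the op succeeds only if the llama custom-op library has been loaded;
    # otherwise the attribute lookup on the op namespace fails.
    try:
        _ = torch.ops.llama.update_cache
        return True
    except (AttributeError, RuntimeError):
        return False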

examples/models/llama/source_transformation/quantized_kv_cache.py

Lines changed: 29 additions & 19 deletions
@@ -37,6 +37,7 @@ def __init__(
         n_heads,
         head_dim,
         cache_type: QuantizedCacheType = QuantizedCacheType.AffineSymmetric,
+        use_custom_update_cache_op: bool = False,
     ):
         super().__init__()
         if cache_type not in (
@@ -48,6 +49,7 @@ def __init__(
         )

         # For now supporting int8 only
+        self.use_custom_update_cache_op = use_custom_update_cache_op
         self.quantized_cache_dtype = torch.int8
         self.cache_fp_type = torch.float32
         cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)
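A hedged construction example for the new flag follows. The leading max_batch_size and max_seq_length parameters are not visible in this hunk and are assumed from the cache_shape line, keyword arguments keep the sketch independent of their exact order, and the import path simply mirrors the file location.

from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
    QuantizedCacheType,
    QuantizedKVCache,
)

# Sketch only: the non-custom path keeps the cache usable without the custom op library.
cache = QuantizedKVCache(
    max_batch_size=1,
    max_seq_length=2048,
    n_heads=32,
    head_dim=128,
    cache_type=QuantizedCacheType.AffineAsymmetric,
    use_custom_update_cache_op=False,
)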
@@ -103,24 +105,25 @@ def update(self, input_pos, k_val, v_val):

         quantized_v_val, v_scales, v_zero_points = self._quantize(v_val)

-        # Right now using custom ops on this path.
-        # In future we can update custom op to handle transposed cache
-        # as well.
-        # Note that we may have to revert this change if other ET
-        # backends such as QNN want to use quantized cache, with dynamic shape,
-        # instead of quantizing on their own.
-        # But until this opting for code simplicity
-        start_pos = input_pos[0].item()
-        _ = torch.ops.llama.update_cache(quantized_k_val, self.k_cache, start_pos)
-        _ = torch.ops.llama.update_cache(k_scales, self.k_cache_scales, start_pos)
-        _ = torch.ops.llama.update_cache(
-            k_zero_points, self.k_cache_zero_points, start_pos
-        )
-        _ = torch.ops.llama.update_cache(quantized_v_val, self.v_cache, start_pos)
-        _ = torch.ops.llama.update_cache(v_scales, self.v_cache_scales, start_pos)
-        _ = torch.ops.llama.update_cache(
-            v_zero_points, self.v_cache_zero_points, start_pos
-        )
+        if self.use_custom_update_cache_op:
+            start_pos = input_pos[0].item()
+            _ = torch.ops.llama.update_cache(quantized_k_val, self.k_cache, start_pos)
+            _ = torch.ops.llama.update_cache(k_scales, self.k_cache_scales, start_pos)
+            _ = torch.ops.llama.update_cache(
+                k_zero_points, self.k_cache_zero_points, start_pos
+            )
+            _ = torch.ops.llama.update_cache(quantized_v_val, self.v_cache, start_pos)
+            _ = torch.ops.llama.update_cache(v_scales, self.v_cache_scales, start_pos)
+            _ = torch.ops.llama.update_cache(
+                v_zero_points, self.v_cache_zero_points, start_pos
+            )
+        else:
+            self.k_cache[:, :, input_pos] = quantized_k_val
+            self.k_cache_scales[:, :, input_pos] = k_scales
+            self.k_cache_zero_points[:, :, input_pos] = k_zero_points
+            self.v_cache[:, :, input_pos] = quantized_v_val
+            self.v_cache_scales[:, :, input_pos] = v_scales
+            self.v_cache_zero_points[:, :, input_pos] = v_zero_points

         k_out = torch.ops.quantized_decomposed.dequantize_per_token(
             self.k_cache,
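For readers unfamiliar with the custom op, each torch.ops.llama.update_cache call above amounts to an in-place write of the freshly quantized slice (or its scales/zero points) into the persistent cache buffer at start_pos along the sequence dimension; the else branch exists so the quantized cache still works when the custom op is not linked. A plain-PyTorch stand-in for the op's effect, assuming the [batch, seq_len, n_heads, head_dim] cache layout from the __init__ hunk:

import torch


def update_cache_reference(value: torch.Tensor, cache: torch.Tensor, start_pos: int) -> None:
    # value: [B, seq_len, H, D]; cache: [B, max_seq_len, H, D]. In-place copy, no new allocation.
    seq_len = value.size(1)
    cache.narrow(1, start_pos, seq_len).copy_(value)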
@@ -148,7 +151,12 @@ def update(self, input_pos, k_val, v_val):
         return k_out.transpose(1, 2), v_out.transpose(1, 2)

     @classmethod
-    def from_float(cls, kv_cache, cache_type: QuantizedCacheType):
+    def from_float(
+        cls,
+        kv_cache,
+        cache_type: QuantizedCacheType,
+        use_custom_update_cache_op: bool = False,
+    ):
         max_batch_size, n_heads, max_seq_length, head_dim = kv_cache.k_cache.shape
         if isinstance(kv_cache, CustomKVCache):
             # If replacing custom kv cache, then the shape is [B, S, H, D]
@@ -159,6 +167,7 @@ def from_float(cls, kv_cache, cache_type: QuantizedCacheType):
             n_heads,
             head_dim,
             cache_type,
+            use_custom_update_cache_op,
         )
@@ -199,6 +208,10 @@ def replace_kv_cache_with_quantized_kv_cache(module):
                 module,
                 name,
-                QuantizedKVCache.from_float(child, QuantizedCacheType.AffineAsymmetric),
+                QuantizedKVCache.from_float(
+                    child,
+                    QuantizedCacheType.AffineAsymmetric,
+                    use_custom_update_cache_op=True,
+                ),
             )
         else:
             replace_kv_cache_with_quantized_kv_cache(child)
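Finally, a hedged usage sketch tying the new parameter through both entry points. Here existing_kv_cache and model are placeholders, the import path mirrors the file location, and the transform is assumed to mutate the module tree in place, since the recursion above does not use a return value.

from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
    QuantizedCacheType,
    QuantizedKVCache,
    replace_kv_cache_with_quantized_kv_cache,
)

# 1) Convert a single float cache, forwarding the new flag explicitly.
quantized_cache = QuantizedKVCache.from_float(
    existing_kv_cache,  # placeholder: an existing KVCache or CustomKVCache instance
    QuantizedCacheType.AffineAsymmetric,
    use_custom_update_cache_op=True,
)

# 2) Or apply the transform to a whole model: every KVCache/CustomKVCache child is
#    swapped for a QuantizedKVCache that uses the custom update op.
replace_kv_cache_with_quantized_kv_cache(model)  # placeholder: model is an nn.Module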
