
Commit d28f61a

[Executorch] Add quantized kv cache to oss ci
Pull Request resolved: #6997

Fixes to make sure the quantized KV cache works in OSS.

ghstack-source-id: 254774011
@exported-using-ghexport

Differential Revision: [D66269487](https://our.internmc.facebook.com/intern/diff/D66269487/)
1 parent d4874e8 commit d28f61a

6 files changed (+27, -3 lines)
.ci/scripts/test_llama.sh

Lines changed: 9 additions & 0 deletions
@@ -70,6 +70,12 @@ else
   COREML=OFF
 fi

+if [[ "${MODE}" =~ .*quantize_kv.* ]]; then
+  QUANTIZE_KV_CACHE=ON
+else
+  QUANTIZE_KV_CACHE=OFF
+fi
+
 echo "COREML option ${COREML}"

 if [[ "${MODE}" =~ .*qnn.* ]]; then
@@ -205,6 +211,9 @@ fi
 if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
 fi
+if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --quantize_kv_cache"
+fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
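
In plain terms, the new shell logic turns the QUANTIZE_KV_CACHE switch on for any CI mode string containing "quantize_kv", and that switch in turn appends --quantize_kv_cache to the arguments passed to examples.models.llama.export_llama. Below is a minimal Python rendering of that logic, purely for illustration; the function name and the base arguments are placeholders, not part of the script.

import re

def build_export_args(mode: str, base_args: str = "-kv") -> str:
    # Mirrors `[[ "${MODE}" =~ .*quantize_kv.* ]]` in .ci/scripts/test_llama.sh.
    quantize_kv_cache = "ON" if re.search(r"quantize_kv", mode) else "OFF"
    export_args = base_args
    if quantize_kv_cache == "ON":
        # Same flag the script now appends before invoking export_llama.
        export_args += " --quantize_kv_cache"
    return export_args

# The new CI matrix entry "xnnpack+custom+quantize_kv" would therefore export with:
print(build_export_args("xnnpack+custom+quantize_kv"))  # "-kv --quantize_kv_cache"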

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        mode: [portable, xnnpack+custom, xnnpack+custom+qe]
+        mode: [portable, xnnpack+custom, xnnpack+custom+qe,xnnpack+custom+quantize_kv,xnnpack+quantize_kv]
         include:
           - dtype: bf16
             mode: portable
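
The two added matrix entries create CI jobs that exercise the quantized KV cache path, with and without the custom-ops variant. A quick sanity check of which modes flip the new switch in test_llama.sh (illustrative only):

modes = [
    "portable",
    "xnnpack+custom",
    "xnnpack+custom+qe",
    "xnnpack+custom+quantize_kv",
    "xnnpack+quantize_kv",
]

# Mirrors the `[[ "${MODE}" =~ .*quantize_kv.* ]]` check added above.
for mode in modes:
    print(f"{mode:30s} QUANTIZE_KV_CACHE={'ON' if 'quantize_kv' in mode else 'OFF'}")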

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        mode: [portable, xnnpack+kv+custom, mps, coreml]
+        mode: [portable, xnnpack+kv+custom, mps, coreml, xnnpack+custom+quantize_kv]
         include:
           - dtype: bf16
             mode: portable

examples/models/llama/source_transformation/quantized_kv_cache.py

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,8 @@
 import logging
 from enum import Enum

+import executorch.extension.llm.custom_ops  # noqa: F401
+
 import torch
 import torch.nn as nn
 from executorch.examples.models.llama.llama_transformer import KVCache
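
The added import is there purely for its side effect: importing executorch.extension.llm.custom_ops registers the custom kernels with PyTorch's operator registry, so the quantized KV cache can resolve them in an OSS build where nothing else has loaded them yet (hence the noqa on the seemingly unused import). Here is a self-contained sketch of that "import for registration" pattern, using a made-up namespace rather than the real ExecuTorch ops:

import torch
from torch.library import Library, impl

# Defining and implementing an op at import time makes torch.ops.<ns>.<op>
# resolvable for any later caller; the namespace and op name here are invented.
demo_lib = Library("demo_ops", "DEF")
demo_lib.define("update_cache(Tensor value, Tensor cache) -> Tensor")

@impl(demo_lib, "update_cache", "CompositeExplicitAutograd")
def update_cache(value: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
    # Trivial stand-in body; the real custom ops update the cache in place.
    return cache + value

# After the registration above (i.e., after the import), this call resolves:
out = torch.ops.demo_ops.update_cache(torch.ones(4), torch.zeros(4))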

examples/models/llama/source_transformation/sdpa.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ def forward(

         k_cache = self.kv_cache.k_cache
         v_cache = self.kv_cache.v_cache
-        if isinstance(self.kv_cache, QuantizedKVCache):
+        if hasattr(self.kv_cache, "quantized_cache_dtype"):
             # updated quantize cache, scale and zero points
             # returns dequantized kv cache
             # Not most optimal. Optimizations to follow next
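
Switching from isinstance to hasattr makes the SDPA transform duck-typed: any cache object that exposes a quantized_cache_dtype attribute is treated as a quantized cache, without requiring the QuantizedKVCache class itself to be importable (or to be the exact same class object) in the environment doing the transform. A small sketch of the check, with an invented stand-in class:

import torch

class FakeQuantizedKVCache(torch.nn.Module):
    # Stand-in, not the real ExecuTorch class; only the attribute matters.
    def __init__(self):
        super().__init__()
        self.quantized_cache_dtype = torch.int8

def uses_quantized_cache(kv_cache) -> bool:
    # Equivalent in spirit to the replaced isinstance() check, but robust to
    # import-path and class-identity differences across builds.
    return hasattr(kv_cache, "quantized_cache_dtype")

assert uses_quantized_cache(FakeQuantizedKVCache())
assert not uses_quantized_cache(torch.nn.Identity())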

exir/passes/_quant_patterns_and_replacements.py

Lines changed: 13 additions & 0 deletions
@@ -192,6 +192,19 @@ def embedding_byte_dtype_out_meta(
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
 )

+# TODO: move these registrations to pytorch core
+quantized_decomposed_lib.define(
+    "quantize_per_token.out(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)",
+)
+
+quantized_decomposed_lib.define(
+    "dequantize_per_token.out(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype, *, Tensor(a!) out) -> Tensor(a!)",
+)
+
+quantized_decomposed_lib.define(
+    "choose_qparams_per_token_asymmetric.out(Tensor input, ScalarType dtype, *, Tensor(a!) scale_out, Tensor(b!) zero_point_out) -> (Tensor(a!), Tensor(b!))",
+)
+

 @impl(quantized_decomposed_lib, "embedding_2bit", "CompositeExplicitAutograd")
 def embedding_2bit(
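
These define calls add out-variant schemas for the per-token quantization ops, which memory-planned ExecuTorch graphs need when the quantized KV cache is enabled. As a sketch of how such a pairing can work, here is the same define/impl pattern on an invented namespace, with the out variant simply writing the functional result into the caller's buffer; this is illustrative only, and the real out-variant kernels may be registered elsewhere (e.g., in C++).

import torch
from torch.library import Library, impl

demo_lib = Library("demo_quant", "DEF")

# Functional variant plus its out variant, mirroring quantize_per_token(.out).
demo_lib.define(
    "quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype) -> Tensor"
)
demo_lib.define(
    "quantize_per_token.out(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)"
)

@impl(demo_lib, "quantize_per_token", "CompositeExplicitAutograd")
def quantize_per_token(input, scales, zero_points, quant_min, quant_max, dtype):
    # Simplified per-token affine quantization: one (scale, zero_point) per token row.
    q = torch.round(input / scales + zero_points)
    return torch.clamp(q, quant_min, quant_max).to(dtype)

@impl(demo_lib, "quantize_per_token.out", "CompositeExplicitAutograd")
def quantize_per_token_out(input, scales, zero_points, quant_min, quant_max, dtype, out=None):
    # The out variant fills the caller-provided buffer, which is what the
    # memory planner expects from .out ops.
    return out.copy_(
        torch.ops.demo_quant.quantize_per_token(
            input, scales, zero_points, quant_min, quant_max, dtype
        )
    )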
