
Commit 957eccd

[Executorch] Add quantized kv cache to oss ci
Pull Request resolved: #6997

Fixes to make sure quantized kv cache works in oss

ghstack-source-id: 255034795
@exported-using-ghexport

Differential Revision: [D66269487](https://our.internmc.facebook.com/intern/diff/D66269487/)
1 parent 681bc31 commit 957eccd

File tree

7 files changed: +39 / -5 lines

.ci/scripts/test_llama.sh

Lines changed: 9 additions & 0 deletions
@@ -100,6 +100,12 @@ else
   COREML=OFF
 fi
 
+if [[ "${MODE}" =~ .*quantize_kv.* ]]; then
+  QUANTIZE_KV_CACHE=ON
+else
+  QUANTIZE_KV_CACHE=OFF
+fi
+
 echo "COREML option ${COREML}"
 
 if [[ "${MODE}" =~ .*qnn.* ]]; then
@@ -235,6 +241,9 @@ fi
 if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
 fi
+if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --quantize_kv_cache"
+fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        mode: [portable, xnnpack+custom, xnnpack+custom+qe]
+        mode: [portable, xnnpack+custom, xnnpack+custom+qe,xnnpack+custom+quantize_kv,xnnpack+quantize_kv]
         include:
           - dtype: bf16
             mode: portable

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        mode: [portable, xnnpack+kv+custom, mps, coreml]
+        mode: [portable, xnnpack+kv+custom, mps, coreml, xnnpack+custom+quantize_kv]
         include:
           - dtype: bf16
             mode: portable

examples/models/llama/source_transformation/quantized_kv_cache.py

Lines changed: 21 additions & 0 deletions
@@ -10,9 +10,30 @@
 import torch
 import torch.nn as nn
 from executorch.examples.models.llama.llama_transformer import KVCache
+
+from executorch.extension.llm.custom_ops import custom_ops  # noqa: F401
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
 
 
+try:
+    op = torch.ops.quantized_decomposed.quantize_per_token.out
+    assert op is not None
+except:
+    import executorch
+    import glob
+
+    executorch_package_path = executorch.__path__[0]
+    libs = list(
+        glob.glob(
+            f"{executorch_package_path}/**/libquantized_ops_aot_lib.*", recursive=True
+        )
+    )
+    assert len(libs) == 1, f"Expected 1 library but got {len(libs)}"
+    logging.info(f"Loading custom ops library: {libs[0]}")
+    torch.ops.load_library(libs[0])
+    op = torch.ops.quantized_decomposed.quantize_per_token.out
+    assert op is not None
+
 """
 Heavily "inspired" by AO's implementation of the same in torchao/_models/llama/model.py
 """

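For context, the ops probed above are the per-token quantization primitives the quantized KV cache builds on. A minimal round-trip sketch, assuming the functional variants registered by torch.ao's quantized_decomposed library (signatures taken from that upstream library and may differ across versions; the tensor shape is purely illustrative):

import torch
from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401

# Hypothetical (batch, heads, seq_len, head_dim) key tensor.
k = torch.randn(1, 8, 4, 16)

# One (scale, zero_point) pair per token, chosen asymmetrically for int8.
scales, zero_points = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric(
    k, torch.int8
)
k_q = torch.ops.quantized_decomposed.quantize_per_token(
    k, scales, zero_points, -128, 127, torch.int8
)
k_dq = torch.ops.quantized_decomposed.dequantize_per_token(
    k_q, scales, zero_points, -128, 127, torch.int8, k.dtype
)
# k_dq approximates k up to per-token quantization error.

The CMakeLists.txt change further down registers the .out overloads of these same ops in quantized_ops_aot_lib, which is the library the fallback loader above globs for and loads in an OSS build.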
examples/models/llama/source_transformation/sdpa.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ def forward(
 
         k_cache = self.kv_cache.k_cache
         v_cache = self.kv_cache.v_cache
-        if isinstance(self.kv_cache, QuantizedKVCache):
+        if hasattr(self.kv_cache, "quantized_cache_dtype"):
             # updated quantize cache, scale and zero points
             # returns dequantized kv cache
             # Not most optimal. Optimizations to follow next

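The switch from isinstance to hasattr above lets sdpa.py dispatch on quantized caches without importing QuantizedKVCache, whose module (per the change above) may need to load the quantized-ops AOT library at import time. A small self-contained sketch of that duck-typing pattern, using hypothetical stand-in classes:

import torch
import torch.nn as nn


class PlainKVCache(nn.Module):
    """Stand-in for the regular fp32/fp16 KV cache."""


class FakeQuantizedKVCache(nn.Module):
    """Stand-in for QuantizedKVCache; exposes the marker attribute checked above."""

    def __init__(self):
        super().__init__()
        self.quantized_cache_dtype = torch.int8


def needs_dequant(cache: nn.Module) -> bool:
    # Attribute check instead of isinstance: no QuantizedKVCache import required.
    return hasattr(cache, "quantized_cache_dtype")


assert not needs_dequant(PlainKVCache())
assert needs_dequant(FakeQuantizedKVCache())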
extension/llm/custom_ops/custom_ops.py

Lines changed: 3 additions & 2 deletions
@@ -17,14 +17,15 @@
 
 from torch.library import impl
 
-# TODO rename this file to custom_ops_meta_registration.py
 try:
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None
     op2 = torch.ops.llama.fast_hadamard_transform.default
     assert op2 is not None
 except:
-    libs = list(Path(__file__).parent.resolve().glob("libcustom_ops_aot_lib.*"))
+    path = Path(__file__).parent.resolve()
+    logging.info(f"Looking for libcustom_ops_aot_lib.so in {path}")
+    libs = list(path.glob("libcustom_ops_aot_lib.*"))
     assert len(libs) == 1, f"Expected 1 library but got {len(libs)}"
     logging.info(f"Loading custom ops library: {libs[0]}")
     torch.ops.load_library(libs[0])

kernels/quantized/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -60,14 +60,17 @@ if(NOT CMAKE_GENERATOR STREQUAL "Xcode"
   set(_quantized_aot_ops
       "quantized_decomposed::add.out"
       "quantized_decomposed::choose_qparams.Tensor_out"
+      "quantized_decomposed::choose_qparams_per_token_asymmetric.out"
       "quantized_decomposed::dequantize_per_channel.out"
       "quantized_decomposed::dequantize_per_tensor.out"
       "quantized_decomposed::dequantize_per_tensor.Tensor_out"
+      "quantized_decomposed::dequantize_per_token.out"
       "quantized_decomposed::mixed_linear.out"
       "quantized_decomposed::mixed_mm.out"
       "quantized_decomposed::quantize_per_channel.out"
       "quantized_decomposed::quantize_per_tensor.out"
       "quantized_decomposed::quantize_per_tensor.Tensor_out"
+      "quantized_decomposed::quantize_per_token.out"
   )
   gen_selected_ops(
     LIB_NAME "quantized_ops_aot_lib" ROOT_OPS ${_quantized_aot_ops}
