Commit a943088
Update on "[Executorch] Add quantized kv cache to oss ci"
Fixes to make sure quantized kv cache works in oss Differential Revision: [D66269487](https://our.internmc.facebook.com/intern/diff/D66269487/) [ghstack-poisoned]
2 parents 128e461 + ba7d02e commit a943088

3 files changed: +13 -15 lines


examples/models/llama/source_transformation/quantized_kv_cache.py

Lines changed: 13 additions & 1 deletion
@@ -4,17 +4,29 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+from pathlib import Path
 import logging
 from enum import Enum

-import executorch.extension.llm.custom_ops  # noqa: F401
+from executorch.extension.llm.custom_ops import custom_ops  # noqa: F401

 import torch
 import torch.nn as nn
 from executorch.examples.models.llama.llama_transformer import KVCache
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401


+try:
+    op = torch.ops.quantized_decomposed.quantize_per_token
+    assert op is not None
+except:
+    libs = list(Path(__file__).parent.resolve().glob("libquantized_ops_aot_lib.*"))
+    assert len(libs) == 1, f"Expected 1 library but got {len(libs)}"
+    logging.info(f"Loading custom ops library: {libs[0]}")
+    torch.ops.load_library(libs[0])
+    op = torch.ops.quantized_decomposed.quantize_per_token
+    assert op is not None
+
 """
 Heavily "inspired" by AO's implementation of the same in torchao/_models/llama/model.py
 """

exir/passes/_quant_patterns_and_replacements.py

Lines changed: 0 additions & 13 deletions
@@ -192,19 +192,6 @@ def embedding_byte_dtype_out_meta(
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
 )

-# TODO: move these registrations to pytorch core
-quantized_decomposed_lib.define(
-    "quantize_per_token.out(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)",
-)
-
-quantized_decomposed_lib.define(
-    "dequantize_per_token.out(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype, *, Tensor(a!) out) -> Tensor(a!)",
-)
-
-quantized_decomposed_lib.define(
-    "choose_qparams_per_token_asymmetric.out(Tensor input, ScalarType dtype, *, Tensor(a!) scale_out, Tensor(b!) zero_point_out) -> (Tensor(a!), Tensor(b!))",
-)
-

 @impl(quantized_decomposed_lib, "embedding_2bit", "CompositeExplicitAutograd")
 def embedding_2bit(
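
The three .out schemas removed here carried a TODO to move them to PyTorch core; once they are defined elsewhere, keeping a second define in exir would break, since torch.library rejects defining the same operator twice. A minimal sketch of that behavior, using a throwaway demo_ns namespace that is purely illustrative and not part of this commit:

# Hypothetical illustration (not from this commit): a second define() of the
# same operator on one library raises, which is why the duplicated .out
# registrations above had to go once the schemas lived upstream.
from torch.library import Library

lib = Library("demo_ns", "DEF")  # throwaway namespace for this sketch
schema = "quantize_per_token.out(Tensor input, Tensor scales, *, Tensor(a!) out) -> Tensor(a!)"
lib.define(schema)
try:
    lib.define(schema)  # duplicate registration of the same schema
except RuntimeError as err:
    print(f"duplicate registration rejected: {err}")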

extension/llm/custom_ops/custom_ops.py

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@

 from torch.library import impl

-# TODO rename this file to custom_ops_meta_registration.py
 try:
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None
