
Commit c984a6e (2 parents: 98f223b + b199a32)

Update on "[Executorch] Add quantized kv cache to oss ci"

Fixes to make sure the quantized KV cache works in OSS.

Differential Revision: [D66269487](https://our.internmc.facebook.com/intern/diff/D66269487/)

[ghstack-poisoned]

File tree

1 file changed: 3 additions, 2 deletions

examples/models/llama/source_transformation/quantized_kv_cache.py (3 additions, 2 deletions)

@@ -11,6 +11,8 @@
 import torch.nn as nn
 from executorch.examples.models.llama.llama_transformer import KVCache
 
+# This is needed to ensure that custom ops are registered
+from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
 
 
@@ -233,9 +235,8 @@ def from_float(cls, kv_cache, cache_type: QuantizedCacheType):
 
 
 def replace_kv_cache_with_quantized_kv_cache(module):
-    # This is needed to ensure that custom ops are registered
-    from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
     from executorch.extension.llm.custom_ops import custom_ops  # noqa: F401
+
     logging.warning(
         "Replacing KVCache with QuantizedKVCache. This modifies the model in place."
     )
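The change hoists the `portable_lib` import to module level so that its side effect, registering custom ops, happens as soon as the file is imported rather than only when `replace_kv_cache_with_quantized_kv_cache` is called. A minimal sketch of this "import for side effects" registration pattern is below; the names (`OP_REGISTRY`, `register_op`, `quantized_kv_update`) are hypothetical illustrations, not ExecuTorch APIs.

```python
# Hypothetical sketch of side-effect op registration. In ExecuTorch, importing
# `portable_lib` populates the op registry; a module-level import guarantees
# this runs before any code that looks the ops up.

OP_REGISTRY = {}


def register_op(name):
    """Decorator that records a function in the global registry."""
    def wrap(fn):
        OP_REGISTRY[name] = fn
        return fn
    return wrap


# Registration happens at import/definition time, not at call time.
@register_op("quantized_kv_update")
def quantized_kv_update(cache, new_vals):
    cache.extend(new_vals)
    return cache


def lookup(name):
    # Raises KeyError if the registering module was never imported first.
    return OP_REGISTRY[name]


cache = []
lookup("quantized_kv_update")(cache, [1, 2])
print(cache)  # [1, 2]
```

If the import stays inside one function, any other code path that exports or runs the quantized cache before that function executes would fail the lookup, which is why moving it to the top of the module is the safer choice.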

0 commit comments