Update on "[Executorch] Add quantized kv cache to oss ci"

kimishpatel · kimishpatel · commit 643086cb5361 · 2024-12-04T16:56:31.000-08:00
Fixes to make sure the quantized KV cache works in OSS. Differential Revision: [D66269487](https://our.internmc.facebook.com/intern/diff/D66269487/) [ghstack-poisoned]
diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py
@@ -22,8 +22,6 @@
 
     import executorch
 
-    from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
-
     # Ideally package is installed in only one location but usage of
     # PYATHONPATH can result in multiple locations.
     # ATM this is mainly used in CI for qnn runner. Will need to revisit this
@@ -247,8 +245,6 @@ def from_float(cls, kv_cache, cache_type: QuantizedCacheType):
 
 
 def replace_kv_cache_with_quantized_kv_cache(module):
-    # This is needed to ensure that custom ops are registered
-    from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
     from executorch.extension.llm.custom_ops import custom_ops  # noqa: F401
 
     logging.warning(
diff --git a/extension/llm/custom_ops/custom_ops.py b/extension/llm/custom_ops/custom_ops.py
@@ -26,6 +26,9 @@
 
     import executorch
 
+    # This is needed to ensure that custom ops are registered
+    from executorch.extension.pybindings import portable_lib  # noqa # usort: skip
+
     # Ideally package is installed in only one location but usage of
     # PYATHONPATH can result in multiple locations.
     # ATM this is mainly used in CI for qnn runner. Will need to revisit this