Update base for Update on " [ExecuTorch][BE] Split kv cache and SDPA for better code sharing"

kimishpatel · kimishpatel · commit 14e7cdca0f69 · 2025-01-16T10:16:43.000-08:00
Summary: Why? We have coupled SDPA with kv cache for a while. Initially this was done as we implemented the sdpa_with_kv_cache custom op to reduce multiple copy overheads from kv cache update. (This could have been done by having separate custom kv cache update and custom sdpa ops. Recent changes enabled this.) As a result of the SDPA module owning the kv cache, we get a) a non-composable implementation and b) model definitions and components that are harder to reuse from repos like tune. The outcome is that we have multiple definitions of the same model, llama, lying around in ET, TorchChat and Tune. This diff and subsequent ones will try to move in the direction where custom kv cache and custom sdpa become decoupled and composable, making it more module-swap friendly with tune's model definition. How? Earlier PRs decoupled kv cache update from sdpa. So now: 1. Decouple SDPA nn.Module from KV cache. 2. Standardize on KVCache and SDPA interface. That is, KVCache and SDPA both operate on q, k, v in [B, # heads, seq_len, head_dim] formatted tensors. 3. Step 2 will introduce multiple transposes when KVCache and SDPA are replaced by custom modules, but we will write a graph pass to undo those. Test Plan: Existing tests. Make sure perf doesn't regress. Differential Revision: [D67914054](https://our.internmc.facebook.com/intern/diff/D67914054) [ghstack-poisoned]
diff --git a/.lintrunner.toml b/.lintrunner.toml
@@ -294,6 +294,7 @@ include_patterns = [
     'build/**/*.py',
     'codegen/**/*.py',
     # 'devtools/**/*.py',
+    'devtools/visualization/**/*.py',
     'docs/**/*.py',
     # 'examples/**/*.py',
     # 'exir/**/*.py',
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
@@ -122,16 +122,6 @@ def dump_intermediate_artifacts_to(
         self.path_for_intermediates = output_path
         return self
 
-    def set_input_order(
-        self, input_order: Optional[str] = None
-    ) -> "ArmCompileSpecBuilder":
-        """
-        Reorder the inputs coming in. This may be required when inputs > 1.
-        And while using the U55/U85 CompileSpec.
-        """
-        self.input_order = input_order
-        return self
-
     def build(self) -> List[CompileSpec]:
         """
         Generate a list of compile spec objects from the builder
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
@@ -85,33 +85,28 @@ def get_tosa_compile_spec_unbuilt(
 
 def get_u55_compile_spec(
     custom_path=None,
-    reorder_inputs=None,
 ) -> list[CompileSpec]:
     """
     Default compile spec for Ethos-U55 tests.
     """
     return get_u55_compile_spec_unbuilt(
         custom_path=custom_path,
-        reorder_inputs=reorder_inputs,
     ).build()
 
 
 def get_u85_compile_spec(
     custom_path=None,
-    reorder_inputs=None,
 ) -> list[CompileSpec]:
     """
     Default compile spec for Ethos-U85 tests.
     """
     return get_u85_compile_spec_unbuilt(
         custom_path=custom_path,
-        reorder_inputs=reorder_inputs,
     ).build()
 
 
 def get_u55_compile_spec_unbuilt(
     custom_path=None,
-    reorder_inputs=None,
 ) -> ArmCompileSpecBuilder:
     """Get the ArmCompileSpecBuilder for the Ethos-U55 tests, to modify
     the compile spec before calling .build() to finalize it.
@@ -128,14 +123,12 @@ def get_u55_compile_spec_unbuilt(
             extra_flags="--debug-force-regor --output-format=raw",
         )
         .dump_intermediate_artifacts_to(artifact_path)
-        .set_input_order(reorder_inputs)
     )
     return compile_spec
 
 
 def get_u85_compile_spec_unbuilt(
     custom_path=None,
-    reorder_inputs=None,
 ) -> list[CompileSpec]:
     """Get the ArmCompileSpecBuilder for the Ethos-U85 tests, to modify
     the compile spec before calling .build() to finalize it.
@@ -150,7 +143,6 @@ def get_u85_compile_spec_unbuilt(
             extra_flags="--output-format=raw",
         )
         .dump_intermediate_artifacts_to(artifact_path)
-        .set_input_order(reorder_inputs)
     )
     return compile_spec
 
diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh
@@ -96,12 +96,12 @@ test_run_ethosu_fvp() { # End to End model tests
     # Ethos-U55
     echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55"
     examples/arm/run.sh --target=ethos-u55-128 --model_name=mv2
-    examples/arm/run.sh --target=ethos-u55-128 --model_name=lstm --reorder_inputs=1,0,2
+    examples/arm/run.sh --target=ethos-u55-128 --model_name=lstm
 
     # Ethos-U85
     echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85"
     examples/arm/run.sh --target=ethos-u85-128 --model_name=mv2
-    examples/arm/run.sh --target=ethos-u85-128 --model_name=lstm --reorder_inputs=1,0,2
+    examples/arm/run.sh --target=ethos-u85-128 --model_name=lstm
     }
 
 ${TEST_SUITE}
diff --git a/devtools/install_requirements.sh b/devtools/install_requirements.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Conflict: this requires numpy<2 whereas ExecuTorch core requires numpy>=2
+# Follow https://github.com/google-ai-edge/model-explorer/issues/277 for potential
+# resolution
+pip install ai-edge-model-explorer>=0.1.16
diff --git a/devtools/visualization/visualization_utils.py b/devtools/visualization/visualization_utils.py
@@ -8,9 +8,16 @@
 import time
 
 from executorch.exir import EdgeProgramManager, ExecutorchProgramManager
-from model_explorer import config, consts, visualize_from_config  # type: ignore
 from torch.export.exported_program import ExportedProgram
 
+try:
+    from model_explorer import config, consts, visualize_from_config  # type: ignore
+except ImportError:
+    print(
+        "Error: 'model_explorer' is not installed. Install using devtools/install_requirement.sh"
+    )
+    raise
+
 
 class SingletonModelExplorerServer:
     """Singleton context manager for starting a model-explorer server.
diff --git a/devtools/visualization/visualization_utils_test.py b/devtools/visualization/visualization_utils_test.py
@@ -17,7 +17,14 @@
     visualize,
 )
 from executorch.exir import ExportedProgram
-from model_explorer.config import ModelExplorerConfig  # type: ignore
+
+try:
+    from model_explorer.config import ModelExplorerConfig  # type: ignore
+except ImportError:
+    print(
+        "Error: 'model_explorer' is not installed. Install using devtools/install_requirement.sh"
+    )
+    raise
 
 
 @pytest.fixture
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
@@ -259,34 +259,25 @@ def get_calibration_data(
 def get_compile_spec(
     target: str,
     intermediates: Optional[str] = None,
-    reorder_inputs: Optional[str] = None,
     system_config: Optional[str] = None,
     memory_mode: Optional[str] = None,
 ) -> list[CompileSpec]:
     spec_builder = None
     if target == "TOSA":
         spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI")
     elif "ethos-u55" in target:
-        spec_builder = (
-            ArmCompileSpecBuilder()
-            .ethosu_compile_spec(
-                target,
-                system_config=system_config,
-                memory_mode=memory_mode,
-                extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate",
-            )
-            .set_input_order(reorder_inputs)
+        spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec(
+            target,
+            system_config=system_config,
+            memory_mode=memory_mode,
+            extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate",
         )
     elif "ethos-u85" in target:
-        spec_builder = (
-            ArmCompileSpecBuilder()
-            .ethosu_compile_spec(
-                target,
-                system_config=system_config,
-                memory_mode=memory_mode,
-                extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate",
-            )
-            .set_input_order(reorder_inputs)
+        spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec(
+            target,
+            system_config=system_config,
+            memory_mode=memory_mode,
+            extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate",
         )
 
     if intermediates is not None:
@@ -429,14 +420,6 @@ def get_args():
         required=False,
         help="Location for outputs, if not the default of cwd.",
     )
-    parser.add_argument(
-        "-r",
-        "--reorder_inputs",
-        type=str,
-        required=False,
-        default=None,
-        help="Provide the order of the inputs. This can be required when inputs > 1.",
-    )
     parser.add_argument(
         "--system_config",
         required=False,
@@ -519,7 +502,6 @@ def get_args():
         compile_spec = get_compile_spec(
             args.target,
             args.intermediates,
-            args.reorder_inputs,
             args.system_config,
             args.memory_mode,
         )
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
@@ -29,7 +29,6 @@ build_with_etdump=false
 build_type="Release"
 extra_build_flags=""
 build_only=false
-reorder_inputs=""
 system_config=""
 memory_mode=""
 
@@ -46,7 +45,6 @@ help() {
     echo "  --extra_build_flags                    Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none "
     echo "  --build_only                           Only build, don't run FVP"
     echo "  --scratch-dir=<FOLDER>                 Path to your Ethos-U scrach dir if you not using default"
-    echo "  --reorder_inputs=<FLAGS>               Reorder the inputs. This can be required when inputs > 1."
     echo "  --system_config=<CONFIG>               System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets."
     echo "                                            NOTE: If given, this option must match the given target. This option also sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt."
     echo "  --memory_mode=<MODE>                   Memory mode to select from the Vela configuration file (see vela.ini), e.g. Shared_Sram/Sram_Only. Default: 'Shared_Sram' for Ethos-U55 targets, 'Sram_Only' for Ethos-U85 targets"
@@ -66,7 +64,6 @@ for arg in "$@"; do
       --extra_build_flags=*) extra_build_flags="${arg#*=}";;
       --build_only) build_only=true ;;
       --scratch-dir=*) root_dir="${arg#*=}";;
-      --reorder_inputs=*) reorder_inputs="${arg#*=}";;
       --system_config=*) system_config="${arg#*=}";;
       --memory_mode=*) memory_mode="${arg#*=}";;
       *)
@@ -151,7 +148,7 @@ function generate_pte_file() {
     # We are using the aot_lib from build_quantization_aot_lib below
     SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT})
 
-    local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}"
+    local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --output ${output_folder} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}"
     echo "CALL ${ARM_AOT_CMD}" >&2
     ${ARM_AOT_CMD} 1>&2
 
@@ -372,7 +369,6 @@ if [[ -z "$model_name" ]]; then
 else
     test_model=( "$model_name" )
     model_compiler_flags=( "$aot_arm_compiler_flags" )
-    reorder_inputs=( "$reorder_inputs" )
 fi
 
 # loop over running the AoT flow and executing the model on device
diff --git a/install_requirements.py b/install_requirements.py
@@ -170,7 +170,6 @@ def python_is_compatible():
     "tomli",  # Imported by extract_sources.py when using python < 3.11.
     "wheel",  # For building the pip package archive.
     "zstd",  # Imported by resolve_buck.py.
-    "ai-edge-model-explorer>=0.1.16",  # For visualizing ExportedPrograms
 ]
 
 # Assemble the list of requirements to actually install.
diff --git a/pytest.ini b/pytest.ini
@@ -14,6 +14,7 @@ addopts =
     # explicitly list out tests that are running successfully in oss
     examples/models/test
     devtools/
+    --ignore=devtools/visualization/visualization_utils_test.py
     # examples
     examples/models/llama/tests
     examples/models/llama3_2_vision/preprocess

Original file line number	Diff line number	Diff line change
`@@ -170,7 +170,6 @@ def python_is_compatible():`
`170`	`170`	`"tomli", # Imported by extract_sources.py when using python < 3.11.`
`171`	`171`	`"wheel", # For building the pip package archive.`
`172`	`172`	`"zstd", # Imported by resolve_buck.py.`
`173`		`- "ai-edge-model-explorer>=0.1.16", # For visualizing ExportedPrograms`
`174`	`173`	`]`
`175`	`174`
`176`	`175`	`# Assemble the list of requirements to actually install.`