diff --git a/.ci/scripts/setup-windows.ps1 b/.ci/scripts/setup-windows.ps1 index 20d29e4f558..329e81b3cf0 100644 --- a/.ci/scripts/setup-windows.ps1 +++ b/.ci/scripts/setup-windows.ps1 @@ -1,5 +1,5 @@ param ( - [string]$editable = $false + [string]$editable = "false" ) conda create --yes --quiet -n et python=3.12 diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index c0910b47826..3c9ac598f8f 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -set -euo pipefail +set -euxo pipefail # ------------------------- # Args / flags diff --git a/.ci/scripts/unittest-windows.ps1 b/.ci/scripts/unittest-windows.ps1 index 65ed303051b..6f1365bc3fc 100644 --- a/.ci/scripts/unittest-windows.ps1 +++ b/.ci/scripts/unittest-windows.ps1 @@ -1,15 +1,38 @@ param ( - [string]$editable = $false + [string]$buildMode = "Release" ) Set-PSDebug -Trace 1 $ErrorActionPreference = 'Stop' $PSNativeCommandUseErrorActionPreference = $true -# Run pytest with coverage -# pytest -n auto --cov=./ --cov-report=xml -pytest -v --full-trace -c pytest-windows.ini +# Run native unit tests (via ctest) +New-Item -Path "test-build" -ItemType Directory +cd "test-build" + +cmake .. --preset windows -B . -DEXECUTORCH_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=$buildMode if ($LASTEXITCODE -ne 0) { - Write-Host "Pytest invocation was unsuccessful. Exit code: $LASTEXITCODE." + Write-Host "CMake configuration was unsuccessful. Exit code: $LASTEXITCODE." + exit $LASTEXITCODE +} + +cmake --build . -j8 --config $buildMode --verbose +if ($LASTEXITCODE -ne 0) { + Write-Host "CMake build was unsuccessful. Exit code: $LASTEXITCODE." exit $LASTEXITCODE } + +ctest -j8 . --build-config $buildMode --output-on-failure -E "method_test|tensor_parser_test" +if ($LASTEXITCODE -ne 0) { + Write-Host "CTest run was unsuccessful. Exit code: $LASTEXITCODE." + exit $LASTEXITCODE +} + +cd .. + +# Run pytest +pytest -v -c pytest-windows.ini +if ($LASTEXITCODE -ne 0) { + Write-Host "Pytest invocation was unsuccessful. Exit code: $LASTEXITCODE." + exit $LASTEXITCODE +} \ No newline at end of file diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index a619b33dd2e..587f2cf5e5a 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -69,7 +69,15 @@ jobs: \$ErrorActionPreference = 'Stop' \$PSNativeCommandUseErrorActionPreference = \$true - .ci/scripts/setup-windows.ps1 + .ci/scripts/setup-windows.ps1 -editable "${{ inputs.editable }}" + if (\$LASTEXITCODE -ne 0) { + Write-Host "Setup failed. Exit code: \$LASTEXITCODE." + exit \$LASTEXITCODE + } - powershell .ci/scripts/unittest-windows.ps1 -editable "${{ inputs.editable }}" + .ci/scripts/unittest-windows.ps1 -buildMode "${{ inputs.build-mode }}" + if (\$LASTEXITCODE -ne 0) { + Write-Host "Unit tests failed. Exit code: \$LASTEXITCODE." 
+ exit \$LASTEXITCODE + } }" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 815e106ae1e..d8c551e8982 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -779,6 +779,7 @@ jobs: contents: read strategy: fail-fast: false + if: false # TODO Re-enable after fixing timeouts (#14314) with: runner: linux.2xlarge docker-image: ci-image:executorch-ubuntu-22.04-gcc9 diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index f5c5161e0cc..975a8ebbb30 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -1032,5 +1032,5 @@ jobs: .ci/scripts/setup-windows.ps1 - powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }} + .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }} }" diff --git a/CMakeLists.txt b/CMakeLists.txt index 2664b4491c9..fc427d517a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,9 +143,13 @@ endif() # -ffunction-sections -fdata-sections: breaks function and data into sections so # they can be properly gc'd. -s: strip symbol. -set(CMAKE_CXX_FLAGS_RELEASE - "-ffunction-sections -fdata-sections ${CMAKE_CXX_FLAGS_RELEASE}" -) +if(WIN32) + set(CMAKE_CXX_FLAGS_RELEASE "/Gy /Gw ${CMAKE_CXX_FLAGS_RELEASE}") +else() + set(CMAKE_CXX_FLAGS_RELEASE + "-ffunction-sections -fdata-sections ${CMAKE_CXX_FLAGS_RELEASE}" + ) +endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS index b00e8057df6..a78ab252739 100644 --- a/backends/arm/TARGETS +++ b/backends/arm/TARGETS @@ -22,8 +22,7 @@ runtime.python_library( "common/debug.py", ], deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer", + "fbsource//third-party/tosa_tools:serializer", "//caffe2:torch", "//executorch/exir:lib", ], @@ -37,10 +36,8 @@ runtime.python_library( deps = [ "fbsource//third-party/pypi/flatbuffers:flatbuffers", "fbsource//third-party/pypi/ml-dtypes:ml-dtypes", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa", + "fbsource//third-party/tosa_tools:serializer", + "fbsource//third-party/tosa_tools:tosa", ":process_node", "//executorch/exir/backend:compile_spec_schema", "//executorch/backends/arm/operators:lib", @@ -83,8 +80,7 @@ runtime.python_library( name = "process_node", srcs = ["process_node.py"], deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa", + "fbsource//third-party/tosa_tools:tosa", "//executorch/backends/arm/operators:node_visitor", "//executorch/backends/arm/tosa:mapping", "//executorch/backends/arm/tosa:quant_utils", diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index c47a5c58f49..90f9dcb8324 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -25,6 +25,8 @@ # per-io structs to simplify runtime use. 
def vela_bin_pack_io(prefix, data): vela_input_shapes = data[prefix + "_shape"] + # Vela input/output shape is fixed to 6D + vela_io_shape_dims = 6 ios = struct.pack(" bytes: """ Static helper method to do the compilation of the TOSA flatbuffer representation to a target specific binary stream. """ - compile_flags = [] - for spec in compile_spec: - if spec.key == "compile_flags": - compile_flags.append(spec.value.decode()) + compile_flags = compile_spec.compiler_flags if len(compile_flags) == 0: # Not testing for compile_flags correctness here, just that they are @@ -64,10 +62,11 @@ def _compile_tosa_flatbuffer( @staticmethod def preprocess( edge_program: ExportedProgram, - compile_spec: List[CompileSpec], + compile_specs: List[CompileSpec], ) -> PreprocessResult: logger.info(f"{EthosUBackend.__name__} preprocess") + compile_spec = EthosUCompileSpec.from_list(compile_specs) # deduce TOSA compile_spec from Ethos-U compile spec. We get a new # compile spec list, containing only elements relevant for the # TOSABackend. @@ -77,7 +76,7 @@ def preprocess( # ('All backend implementation are final...'), so use composition instead. # preprocess returns the serialized TOSA flatbuffer in .processed_bytes, # which can be passed on to next compilation step. - tosa_preprocess = TOSABackend.preprocess(edge_program, tosa_compile_spec) + tosa_preprocess = TOSABackend._preprocess(edge_program, tosa_compile_spec) binary = EthosUBackend._compile_tosa_flatbuffer( tosa_preprocess.processed_bytes, compile_spec diff --git a/backends/arm/operators/TARGETS b/backends/arm/operators/TARGETS index 2c255b3c17a..afe1c4dd22c 100644 --- a/backends/arm/operators/TARGETS +++ b/backends/arm/operators/TARGETS @@ -20,8 +20,7 @@ runtime.python_library( name = "ops", srcs = glob(["op_*.py", "ops_*.py"]), deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa", + "fbsource//third-party/tosa_tools:tosa", ":node_visitor", ":operator_validation_utils", "//executorch/backends/arm/tosa:mapping", diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py index 625293d66e0..ec76eb5517f 100644 --- a/backends/arm/operators/op_abs.py +++ b/backends/arm/operators/op_abs.py @@ -73,7 +73,9 @@ def define_node( abs_output = output # Do the INT32 Abs - tosa_graph.addOperator( + self._serialize_operator( + node, + tosa_graph, ts.TosaOp.Op().ABS, [ rescaled_inputs[0].name, diff --git a/backends/arm/operators/op_rescale.py b/backends/arm/operators/op_rescale.py index d7be2be737c..d331ebc80d5 100644 --- a/backends/arm/operators/op_rescale.py +++ b/backends/arm/operators/op_rescale.py @@ -46,13 +46,20 @@ def define_node( input_zp = cast(int, node.args[3]) output_zp = cast(int, node.args[4]) - if input_dtype != map_dtype(torch.int8, self.tosa_spec) and input_zp != 0: + if ( + input_dtype + not in [ + map_dtype(torch.int8, self.tosa_spec), + map_dtype(torch.int16, self.tosa_spec), + ] + and input_zp != 0 + ): raise ValueError( - f"If input dtype is not int8, input_zp must be 0. Got input_dtype{input_dtype=}, {input_zp=}" + f"If input dtype is not int8 or int16, input_zp must be 0. Got input_dtype{input_dtype=}, {input_zp=}" ) - if output_dtype != torch.int8 and output_zp != 0: + if output_dtype not in [torch.int8, torch.int16] and output_zp != 0: raise ValueError( - f"If output dtype is not int8, output_zp must be 0. 
Got {ts.DTypeNames[output_dtype]}, {output_zp=}" + f"If output dtype is not int8 or int16, output_zp must be 0. Got {ts.DTypeNames[output_dtype]}, {output_zp=}" ) build_rescale( diff --git a/backends/arm/operators/op_sum.py b/backends/arm/operators/op_sum.py index 0bd152a8b8c..00676d9f9b3 100644 --- a/backends/arm/operators/op_sum.py +++ b/backends/arm/operators/op_sum.py @@ -67,7 +67,9 @@ def define_node( dtype=ts.DType.INT32, ) - tosa_graph.addOperator( + self._serialize_operator( + node, + tosa_graph, ts.TosaOp.Op().REDUCE_SUM, [rescaled_inputs[0].name], [intermediate.name], @@ -111,7 +113,9 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.ReduceSumAttribute(tensor.dim_order.index(dim)) - tosa_graph.addOperator( + self._serialize_operator( + node, + tosa_graph, ts.TosaOp.Op().REDUCE_SUM, [tensor.name], [output.name], diff --git a/backends/arm/requirements-arm-ethos-u.txt b/backends/arm/requirements-arm-ethos-u.txt index 5fad9d2fe94..a26fb014234 100644 --- a/backends/arm/requirements-arm-ethos-u.txt +++ b/backends/arm/requirements-arm-ethos-u.txt @@ -3,4 +3,4 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -ethos-u-vela @ git+https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela@d37febc1715edf0d236c2ff555739a8a9aadcf9a +ethos-u-vela == 4.4.0 diff --git a/backends/arm/requirements-arm-tosa.txt b/backends/arm/requirements-arm-tosa.txt index 4b7a3ec0273..0f9c2f702a4 100644 --- a/backends/arm/requirements-arm-tosa.txt +++ b/backends/arm/requirements-arm-tosa.txt @@ -5,5 +5,7 @@ ml_dtypes == 0.5.1 flatbuffers == 24.3.25 +tosa-adapter-model-explorer == 0.0.1 +ai-edge-model-explorer >= 0.1.16 tosa-tools @ git+https://git.gitlab.arm.com/tosa/tosa-reference-model.git@v2025.07.0 diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index bff5ff69284..8f63569eece 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -383,8 +383,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { *tensor_count = *tensor_count * tensor.size(i); } - // The VelaIO type has a shape of fixed size 4 - for (int i = 0; i < 4; i++) { + // The VelaIO type has a shape of fixed size 6 + for (int i = 0; i < shapeDim; i++) { *io_count = *io_count * io->shape[i]; } } diff --git a/backends/arm/runtime/VelaBinStream.h b/backends/arm/runtime/VelaBinStream.h index 7a7ea9b6266..7f6606200b3 100644 --- a/backends/arm/runtime/VelaBinStream.h +++ b/backends/arm/runtime/VelaBinStream.h @@ -34,9 +34,11 @@ typedef struct { char data[]; // block.name specific format data } VelaBinBlock; +constexpr int shapeDim = 6; // Number of dimensions in VelaIO + // A Vela input or output descriptor in the binary stream typedef struct { - int shape[4]; // Up to 4D shape of input or output + int shape[shapeDim]; // Shape of input or output int elem_size; // Element sizeof in bytes int offset; // Offset in bytes within SRAM working data int region; // Scratch region this belongs to diff --git a/backends/arm/scripts/build_executor_runner.sh b/backends/arm/scripts/build_executor_runner.sh index a05287ac4bf..104e3d02a25 100755 --- a/backends/arm/scripts/build_executor_runner.sh +++ b/backends/arm/scripts/build_executor_runner.sh @@ -44,7 +44,7 @@ help() { echo " --memory_mode= Vela memory mode, used for setting the Timing Adapter parameters of the Corstone platforms." 
echo " Valid values are Shared_Sram(for Ethos-U55, Ethos-U65, Ethos-85), Sram_Only(for Ethos-U55, Ethos-U65, Ethos-U85) or Dedicated_Sram(for Ethos-U65, Ethos-U85)." echo " Default: Shared_Sram for the Ethos-U55 and Sram_Only for the Ethos-U85" - echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + echo " --etdump Adds Devtools etdump support to track timing and output, etdump area will be base64 encoded in the log" echo " --extra_build_flags= Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " echo " --output= Output folder Default: /_.pte" echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" @@ -161,7 +161,7 @@ if [ "$bundleio" = true ] ; then fi if [ "$build_with_etdump" = true ] ; then - build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " + build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON -DET_DUMP_INTERMEDIATE_OUTPUTS=ON " fi echo "Building with BundleIO/etdump/extra flags: ${build_bundleio_flags} ${build_with_etdump_flags} ${extra_build_flags}" diff --git a/backends/arm/scripts/mlsdk_utils.sh b/backends/arm/scripts/mlsdk_utils.sh index f62b9f6d4f0..7a7d2585e52 100755 --- a/backends/arm/scripts/mlsdk_utils.sh +++ b/backends/arm/scripts/mlsdk_utils.sh @@ -7,7 +7,7 @@ set -euo pipefail mlsdk_manifest_url="https://github.com/arm/ai-ml-sdk-manifest.git" -mlsdk_manifest_tag="dev-snapshot-2025-09-12" +mlsdk_manifest_tag="refs/tags/dev-snapshot-2025-09-12" script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh index 769b2e30282..0f76d0496de 100755 --- a/backends/arm/scripts/run_fvp.sh +++ b/backends/arm/scripts/run_fvp.sh @@ -21,6 +21,7 @@ elf_file="" data_file="" target="ethos-u55-128" timeout="600" +etrecord_file="" help() { echo "Usage: $(basename $0) [options]" @@ -29,6 +30,7 @@ help() { echo " --data=@
Place a file in memory at this address, useful to emulate a PTE flashed into memory instead as part of the code." echo " --target= Target to build and run for Default: ${target}" echo " --timeout= Maximum target runtime, used to detect hanging, might need to be higer on large models Default: ${timeout}" + echo " --etrecord= If ETDump is used you can supply a ETRecord file matching the PTE" exit 0 } @@ -39,6 +41,7 @@ for arg in "$@"; do --data=*) data_file="--data ${arg#*=}";; --target=*) target="${arg#*=}";; --timeout=*) timeout="${arg#*=}";; + --etrecord=*) etrecord_file="${arg#*=}";; *) ;; esac @@ -115,15 +118,23 @@ echo "Checking for a etdump in log" ! grep "#\[RUN THIS\]" ${log_file} >/dev/null if [ $? != 0 ]; then echo "Found ETDump in log!" + devtools_extra_args="" echo "#!/bin/sh" > etdump_script.sh sed -n '/^#\[RUN THIS\]$/,/^#\[END\]$/p' ${log_file} >> etdump_script.sh # You can run etdump_script.sh if you do # $ chmod a+x etdump_script.sh # $ ./etdump_script.sh # But lets not trust the script as a bad patch would run bad code on your machine - grep ">etdump.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 >etdump.base64 - base64 -d etdump.base64 >etdump.bin - python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles + grep ">etdump.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 | base64 -d >etdump.bin + ! grep ">debug_buffer.bin" etdump_script.sh >/dev/null + if [ $? != 0 ]; then + grep ">debug_buffer.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 | base64 -d >debug_buffer.bin + devtools_extra_args="${devtools_extra_args} --debug_buffer_path debug_buffer.bin" + fi + if [[ ${etrecord_file} != "" ]]; then + devtools_extra_args="${devtools_extra_args} --etrecord_path ${etrecord_file}" + fi + python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin ${devtools_extra_args} --source_time_scale cycles --target_time_scale cycles fi echo "Checking for problems in log:" diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS index 8ffad640d5a..ec35b63f8f6 100644 --- a/backends/arm/test/TARGETS +++ b/backends/arm/test/TARGETS @@ -40,8 +40,17 @@ runtime.python_library( ) runtime.python_library( - name = "arm_tester", - srcs = glob(["tester/*.py"]), + name = "arm_tester_serialize", + srcs = ["tester/serialize.py"], + deps = [ + "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/devtools/backend_debug:delegation_info", + ] +) + +runtime.python_library( + name = "arm_tester_lib", + srcs = glob(["tester/*.py"], exclude = ["tester/serialize.py"]), deps = [ ":common", "//executorch/backends/xnnpack/test/tester:tester", @@ -55,4 +64,13 @@ runtime.python_library( ] ) + +runtime.python_library( + name = "arm_tester", + deps = [ + "//executorch/backends/arm/test:arm_tester_lib", + "//executorch/backends/arm/test:arm_tester_serialize", + ] +) + define_arm_tests() diff --git a/backends/arm/test/misc/test_tosa_spec.py b/backends/arm/test/misc/test_tosa_spec.py index 968512f54c6..190c50f4aa1 100644 --- a/backends/arm/test/misc/test_tosa_spec.py +++ b/backends/arm/test/misc/test_tosa_spec.py @@ -5,13 +5,8 @@ import unittest -from executorch.backends.arm.tosa.specification import ( - get_tosa_spec, - Tosa_1_00, - TosaSpecification, -) +from executorch.backends.arm.tosa.specification import Tosa_1_00, TosaSpecification -from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized # type: ignore[import-untyped] test_valid_strings = [ @@ -43,14 
+38,6 @@ "TOSA-1.0.0+BF16+fft+int4+cf+INT", ] -test_compile_specs = [ - ([CompileSpec("tosa_spec", "TOSA-1.0.0+INT".encode())],), -] - -test_compile_specs_no_version = [ - ([CompileSpec("other_key", "some_value".encode())],), -] - class TestTosaSpecification(unittest.TestCase): """Tests the TOSA specification class""" @@ -74,19 +61,6 @@ def test_invalid_version_strings(self, version_string: str): assert tosa_spec is None - @parameterized.expand(test_compile_specs) # type: ignore[misc] - def test_create_from_compilespec(self, compile_specs: list[CompileSpec]): - tosa_spec = get_tosa_spec(compile_specs) - assert isinstance(tosa_spec, TosaSpecification) - - @parameterized.expand(test_compile_specs_no_version) # type: ignore[misc] - def test_create_from_invalid_compilespec(self, compile_specs: list[CompileSpec]): - tosa_spec = None - with self.assertRaises(ValueError): - tosa_spec = get_tosa_spec(compile_specs) - - assert tosa_spec is None - @parameterized.expand(test_valid_strings) def test_correct_string_representation(self, version_string: str): tosa_spec = TosaSpecification.create_from_string(version_string) diff --git a/backends/arm/test/ops/test_logical.py b/backends/arm/test/ops/test_logical.py index bb7c5773342..2b160ce7b50 100644 --- a/backends/arm/test/ops/test_logical.py +++ b/backends/arm/test/ops/test_logical.py @@ -86,9 +86,6 @@ def forward(self, tensor: torch.Tensor): ################# -xfails = {"rand_rank4": "MLBEDSW-11031: Output diff on u85 bool transpose."} - - @common.parametrize("test_data", And().test_data) def test_logical_and_tosa_FP(test_data: input_t2): pipeline = TosaPipelineFP[input_t2]( @@ -132,7 +129,7 @@ def test_logical_and_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", And().test_data, xfails=xfails) +@common.parametrize("test_data", And().test_data) @common.XfailIfNoCorstone320 def test_logical_and_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -226,7 +223,7 @@ def test_logical_xor_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Xor().test_data, xfails=xfails) +@common.parametrize("test_data", Xor().test_data) @common.XfailIfNoCorstone320 def test_logical_xor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -320,7 +317,7 @@ def test_logical_or_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Or().test_data, xfails=xfails) +@common.parametrize("test_data", Or().test_data) @common.XfailIfNoCorstone320 def test_logical_or_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -414,7 +411,7 @@ def test_logical_not_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Not().test_data, xfails=xfails) +@common.parametrize("test_data", Not().test_data) @common.XfailIfNoCorstone320 def test_logical_not_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index 791069aa4b0..c4a68caabac 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -64,13 +64,7 @@ def test_log_softmax_tosa_INT(test_data): pipeline.run() -@common.parametrize( - "test_data", - LogSoftmax.test_data, - xfails={ - "randn_neg_dim": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55." 
- }, -) +@common.parametrize("test_data", LogSoftmax.test_data) @common.XfailIfNoCorstone300() def test_log_softmax_u55_INT(test_data): data, dim = test_data() diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index dc258f20ec4..6b4455fc702 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -61,13 +61,7 @@ def test_softmax_tosa_INT(test_data): pipeline.run() -@common.parametrize( - "test_data", - Softmax.test_data, - { - "randn_neg_dim": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55." - }, -) +@common.parametrize("test_data", Softmax.test_data) @common.XfailIfNoCorstone300 def test_softmax_u55_INT(test_data): data, dim = test_data() diff --git a/backends/arm/test/ops/test_squeeze.py b/backends/arm/test/ops/test_squeeze.py index 5c9f031deec..0de51673496 100644 --- a/backends/arm/test/ops/test_squeeze.py +++ b/backends/arm/test/ops/test_squeeze.py @@ -29,6 +29,7 @@ class SqueezeDim(torch.nn.Module): "squeeze3d_dim_neg_2": lambda: (torch.randn(1, 1, 5), -2), "squeeze4d_dim_pos_3": lambda: (torch.randn(1, 2, 3, 1), 3), "squeeze4d_dim_neg_2": lambda: (torch.randn(1, 5, 1, 5), -2), + "squeeze5d_dim_neg_2": lambda: (torch.randn(1, 1, 5, 1, 5), -2), } def forward(self, x: torch.Tensor, dim: int): @@ -40,6 +41,7 @@ class SqueezeDims(torch.nn.Module): "squeeze3d_dims_0_1": lambda: (torch.randn(1, 1, 5), (0, 1)), "squeeze4d_dims_0_neg_1": lambda: (torch.randn(1, 5, 5, 1), (0, -1)), "squeeze4d_dims_0_neg_2": lambda: (torch.randn(1, 5, 1, 5), (0, -2)), + "squeeze5d_dims_0_neg_2": lambda: (torch.randn(1, 1, 5, 1, 5), (0, -2)), } def forward(self, x: torch.Tensor, dims: tuple[int]): @@ -51,6 +53,7 @@ class Squeeze(torch.nn.Module): "squeeze3d": lambda: (torch.randn(1, 1, 5),), "squeeze4d_dims": lambda: (torch.randn(1, 5, 5, 1),), "squeeze3d_dims_mix": lambda: (torch.randn(1, 5, 1, 5),), + "squeeze4d_dims_mix": lambda: (torch.randn(1, 1, 5, 1, 5),), } def forward(self, x: torch.Tensor): diff --git a/backends/arm/test/ops/test_tanh.py b/backends/arm/test/ops/test_tanh.py index 0e74618fd2f..f3f4df31d0e 100644 --- a/backends/arm/test/ops/test_tanh.py +++ b/backends/arm/test/ops/test_tanh.py @@ -70,25 +70,27 @@ def test_tanh_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_tanh_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Tanh(), (test_data(),), aten_op, exir_ops=[], - run_on_fvp=False, + run_on_fvp=True, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_tanh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Tanh(), (test_data(),), aten_op, exir_ops=[], - run_on_fvp=False, + run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_unflatten.py b/backends/arm/test/ops/test_unflatten.py index 95c68b2940d..7f98ababd65 100644 --- a/backends/arm/test/ops/test_unflatten.py +++ b/backends/arm/test/ops/test_unflatten.py @@ -9,6 +9,8 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, TosaPipelineFP, TosaPipelineINT, VgfPipeline, @@ -30,8 +32,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return torch.unflatten(x, self.dim, self.sizes) test_data: dict[str, test_data_t] = { - "randn_4d": (lambda: (Unflatten(1, (2, 2)), (torch.randn(3, 4, 5, 1),))), - "rand_3d": (lambda: 
(Unflatten(1, (-1, 2)), (torch.rand(3, 4, 4),))), + "rand_3d_batch3": (lambda: (Unflatten(1, (-1, 2)), (torch.rand(3, 4, 4),))), + "rand_3d_batch1": (lambda: (Unflatten(1, (-1, 2)), (torch.rand(1, 4, 4),))), + "randn_4d_dim1": (lambda: (Unflatten(1, (2, 2)), (torch.randn(3, 4, 5, 1),))), + "randn_4d_dim3": (lambda: (Unflatten(3, (2, 2)), (torch.randn(1, 1, 5, 4),))), } @@ -49,7 +53,33 @@ def test_unflatten_int_tosa_FP(test_data: test_data_t): @common.parametrize("test_data", Unflatten.test_data) def test_unflatten_int_tosa_INT(test_data: test_data_t): module, inputs = test_data() - pipeline = TosaPipelineINT[input_t]( + pipeline = TosaPipelineINT[input_t](module, inputs, Unflatten.aten_op) + pipeline.run() + + +xfails = { + "rand_3d_batch3": "Batch size > 1 currently not supported for FVP tests", + "randn_4d_dim1": "Batch size > 1 currently not supported for FVP tests", +} + + +@common.parametrize("test_data", Unflatten.test_data, xfails=xfails, strict=False) +@common.XfailIfNoCorstone300 +def test_unflatten_int_u55_INT(test_data: test_data_t): + module, inputs = test_data() + pipeline = EthosU55PipelineINT[input_t]( + module, + inputs, + Unflatten.aten_op, + ) + pipeline.run() + + +@common.parametrize("test_data", Unflatten.test_data, xfails=xfails, strict=False) +@common.XfailIfNoCorstone320 +def test_unflatten_int_u85_INT(test_data: test_data_t): + module, inputs = test_data() + pipeline = EthosU85PipelineINT[input_t]( module, inputs, Unflatten.aten_op, diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py index 54e1b0dd0ce..9da1a352ebb 100644 --- a/backends/arm/test/ops/test_unsqueeze.py +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -25,7 +25,7 @@ class Unsqueeze(torch.nn.Module): - shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 4), (5, 4, 3)] + shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 4), (5, 4, 3), (1, 5, 4, 3)] test_parameters = {} for n in shapes: test_parameters[f"rand_{n}"] = (torch.randn(n),) diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index fb0ba54436e..ed942c07aa1 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -51,6 +51,10 @@ class View(torch.nn.Module): "rand_4d_4_3": lambda: (torch.rand(5, 10, 1, 1), (1, 25, 2)), "rand_4d_4_2": lambda: (torch.rand(2, 50, 1, 1), (1, 100)), "rand_4d_2_4_same": lambda: (torch.rand(2, 3, 2, 3), (2, 3, 3, 2)), + "rand_4d_5d": lambda: (torch.rand(1, 3, 4, 5), (1, 1, 4, 5, -1)), + "rand_5d_5d": lambda: (torch.rand(1, 1, 4, 5, 6), (1, 1, 4, -1, 6)), + "rand_5d_3d": lambda: (torch.rand(1, 1, 4, 5, 6), (2, 3, -1)), + "rand_3d_5d": lambda: (torch.rand(4, 5, 6), (1, 1, 2, -1, 3)), } rank_product_too_large = { diff --git a/backends/arm/test/passes/test_rescale_pass.py b/backends/arm/test/passes/test_rescale_pass.py index 0959a0eaa25..3baa03fde65 100644 --- a/backends/arm/test/passes/test_rescale_pass.py +++ b/backends/arm/test/passes/test_rescale_pass.py @@ -172,14 +172,7 @@ def test_quantized_rescale_tosa_bi(test_data: tuple[torch.Tensor, torch.Tensor]) pipeline.run() -u55_xfails = { - "ones": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", - "randn_ones": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", - "randn_large": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", -} - - -@common.parametrize("test_data", RescaleNetwork.test_data, xfails=u55_xfails) 
+@common.parametrize("test_data", RescaleNetwork.test_data) @common.XfailIfNoCorstone300 def test_quantized_rescale_u55(test_data: tuple[torch.Tensor, torch.Tensor]): """Tests a model with many ops that requires rescales. As more ops are quantized to int32 and diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 62bc5aef57a..f240855cdf4 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -1,4 +1,5 @@ # load("//caffe2/test/fb:defs.bzl", "define_tests") +load("@fbsource//tools/build_defs:fbsource_utils.bzl", "is_fbcode") load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") load("@bazel_skylib//lib:paths.bzl", "paths") @@ -59,7 +60,7 @@ def define_arm_tests(): "//executorch/kernels/quantized:custom_ops_generated_lib", ], deps = [ - "//executorch/backends/arm/test:arm_tester", + "//executorch/backends/arm/test/tester/fb:arm_tester_fb" if is_fbcode else "//executorch/backends/arm/test:arm_tester", "//executorch/backends/arm/test:conftest", "//executorch/backends/arm:ethosu", "//executorch/backends/arm/tosa:compile_spec", diff --git a/backends/arm/test/test_model.py b/backends/arm/test/test_model.py index 8833b7050e7..c336d67ad51 100755 --- a/backends/arm/test/test_model.py +++ b/backends/arm/test/test_model.py @@ -184,7 +184,7 @@ def build_ethosu_runtime( "--build_type=Release", f"--system_config={system_config}", f"--memory_mode={memory_mode}", - f"--extra_build_flags=-DET_DUMP_OUTPUT=OFF {extra_flags}", + f"--extra_build_flags=-DET_LOG_DUMP_OUTPUT=OFF {extra_flags}", f"--output={elf_build_path}", ] ) diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 284d4d6d1c4..bb249644c47 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -7,7 +7,6 @@ import logging -import os from collections import Counter from pprint import pformat from typing import ( @@ -42,10 +41,7 @@ ) from executorch.backends.arm.test.runner_utils import ( dbg_tosa_fb_to_json, - get_elf_path, get_output_quantization_params, - get_target_board, - run_target, TosaReferenceModelDispatch, ) @@ -53,6 +49,7 @@ dump_error_output, print_error_diffs, ) +from executorch.backends.arm.test.tester.serialize import Serialize from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec from executorch.backends.arm.tosa.mapping import extract_tensor_meta @@ -90,7 +87,6 @@ from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec from torch.fx import Graph -from torch.utils._pytree import tree_flatten logger = logging.getLogger(__name__) @@ -179,43 +175,6 @@ def run( ) -class Serialize(tester.Serialize): - def __init__(self, compile_spec: ArmCompileSpec, timeout): - super().__init__() - self.timeout = timeout - self.executorch_program_manager: ExecutorchProgramManager | None - self.compile_spec = compile_spec - - def run(self, artifact: ExecutorchProgramManager, inputs=None) -> None: - super().run(artifact, inputs) - # Keep the entire ExecutorchProgramManager for execution. - self.executorch_program_manager = artifact - - def run_artifact(self, inputs): - if self.executorch_program_manager is None: - raise RuntimeError( - "Tried running artifact from Serialize stage without running the stage." 
- ) - inputs_flattened, _ = tree_flatten(inputs) - intermediate_path = self.compile_spec.get_intermediate_path() - target_board = get_target_board(self.compile_spec) - elf_path = get_elf_path(target_board) - - if not os.path.exists(elf_path): - raise FileNotFoundError( - f"Did not find build arm_executor_runner in path {elf_path}, run setup_testing.sh?" - ) - - return run_target( - self.executorch_program_manager, - inputs_flattened, - intermediate_path, - target_board, - elf_path, - self.timeout, - ) - - class ToExecutorch(tester.ToExecutorch): def run_artifact(self, inputs): with TosaReferenceModelDispatch(): @@ -303,7 +262,7 @@ def __init__( Args: model (torch.nn.Module): The model to test example_inputs (Tuple[torch.Tensor]): Example inputs to the model - compile_spec (List[CompileSpec]): The compile spec to use + compile_spec (ArmCompileSpec): The compile spec to use """ self.transform_passes = transform_passes @@ -419,7 +378,11 @@ def serialize( self, serialize_stage: Optional[Serialize] = None, timeout: int = 480 ): if serialize_stage is None: - serialize_stage = Serialize(self.compile_spec, timeout) + serialize_stage = Serialize( + compile_spec=self.compile_spec, + module=self.original_module, + timeout=timeout, + ) assert ( self.compile_spec.get_intermediate_path() is not None ), "Can't dump serialized file when compile specs do not contain an artifact path." diff --git a/backends/arm/test/tester/serialize.py b/backends/arm/test/tester/serialize.py new file mode 100644 index 00000000000..f0fd246b3a6 --- /dev/null +++ b/backends/arm/test/tester/serialize.py @@ -0,0 +1,75 @@ +# Copyright 2024-2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +from typing import Optional + +import executorch.backends.xnnpack.test.tester.tester as tester + +import torch.fx + +from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec + +from executorch.backends.arm.test.runner_utils import ( + get_elf_path, + get_target_board, + run_target, +) + +from executorch.exir import ExecutorchProgramManager +from torch.utils._pytree import tree_flatten + + +logger = logging.getLogger(__name__) + + +class Serialize(tester.Serialize): + def __init__( + self, + compile_spec: ArmCompileSpec, + module: Optional[torch.nn.Module], + timeout: int = 120, + ): + """ + Args: + compile_spec: CompileSpecs to be used for serialization. + module: Original Module to be used for serialization. Optional - can be used for reference output generation. + timeout: Timeout for fvp. Default is 120 seconds. + """ + super().__init__() + self.module = module + self.timeout = timeout + self.executorch_program_manager: ExecutorchProgramManager | None + self.compile_spec = compile_spec + + def run(self, artifact: ExecutorchProgramManager, inputs=None) -> None: + super().run(artifact, inputs) + # Keep the entire ExecutorchProgramManager for execution. + self.executorch_program_manager = artifact + + def run_artifact(self, inputs): + if self.executorch_program_manager is None: + raise RuntimeError( + "Tried running artifact from Serialize stage without running the stage." 
+ ) + inputs_flattened, _ = tree_flatten(inputs) + intermediate_path = self.compile_spec.get_intermediate_path() + target_board = get_target_board(self.compile_spec) + elf_path = get_elf_path(target_board) + + if not os.path.exists(elf_path): + raise FileNotFoundError( + f"Did not find build arm_executor_runner in path {elf_path}, run setup_testing.sh?" + ) + + return run_target( + self.executorch_program_manager, + inputs_flattened, + intermediate_path, + target_board, + elf_path, + self.timeout, + ) diff --git a/backends/arm/tosa/TARGETS b/backends/arm/tosa/TARGETS index df32689bc3e..51919025591 100644 --- a/backends/arm/tosa/TARGETS +++ b/backends/arm/tosa/TARGETS @@ -6,8 +6,7 @@ runtime.python_library( "mapping.py", ], deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer", + "fbsource//third-party/tosa_tools:serializer", "//caffe2:torch", ":specification", ], @@ -19,10 +18,8 @@ runtime.python_library( ], deps = [ "fbsource//third-party/pypi/numpy:numpy", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa", + "fbsource//third-party/tosa_tools:serializer", + "fbsource//third-party/tosa_tools:tosa", "//executorch/backends/arm:constants", ":mapping", "//executorch/exir/dialects:lib", @@ -44,7 +41,6 @@ runtime.python_library( "utils.py", ], deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", ":quant_utils", "//executorch/backends/arm/operators:node_visitor", ], diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py index 08b0d55aaeb..afae6f8163f 100644 --- a/backends/arm/tosa/backend.py +++ b/backends/arm/tosa/backend.py @@ -24,7 +24,7 @@ process_output, process_placeholder, ) -from executorch.backends.arm.tosa.specification import get_tosa_spec +from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult from executorch.exir.backend.compile_spec_schema import CompileSpec from torch.export.exported_program import ExportedProgram @@ -80,38 +80,24 @@ class TOSABackend(BackendDetails): """ @staticmethod - def preprocess( # noqa: C901 + def preprocess(edge_program: ExportedProgram, compile_specs: List[CompileSpec]): + return TOSABackend._preprocess( + edge_program, TosaCompileSpec.from_list(compile_specs) + ) + + @staticmethod + def _preprocess( # noqa: C901 edge_program: ExportedProgram, - compile_spec: List[CompileSpec], + compile_spec: TosaCompileSpec, ) -> PreprocessResult: # if a debug/test build capture output files from TOSA stage - artifact_path = None - output_format = "" - compile_flags = [] - dump_debug_info = None - for spec in compile_spec: - if spec.key == "debug_artifact_path": - artifact_path = spec.value.decode() - if spec.key == "output_format": - output_format = spec.value.decode() - if spec.key == "compile_flags": - compile_flags.append(spec.value.decode()) - if spec.key == "dump_debug_info": - dump_debug_info = spec.value.decode() - - # Check that the output format is set correctly in the compile spec - if output_format != "tosa": - raise ValueError(f'Invalid output format {output_format}, must be 
"tosa"') + artifact_path = compile_spec.get_intermediate_path() + tosa_spec = compile_spec.tosa_spec + dump_debug_info = compile_spec.tosa_debug_mode # Assign to every node external id node_2_id = _annotate_external_ids(edge_program.graph) - tosa_spec = get_tosa_spec(compile_spec) - if tosa_spec is None: - raise ValueError( - "TOSA backend needs a TOSA version specified in the CompileSpec" - ) - logger.info(f"Converting ExportedProgram to TOSA: {tosa_spec}") # Converted output for this subgraph, serializer needs path early as it emits @@ -132,7 +118,7 @@ def preprocess( # noqa: C901 debug_hook = None if dump_debug_info is not None: - debug_hook = DebugHook(ArmCompileSpec.DebugMode[dump_debug_info]) + debug_hook = DebugHook(dump_debug_info) # TODO: Fix the need to lazily import this. from executorch.backends.arm.operators.node_visitor import get_node_visitors @@ -204,8 +190,8 @@ def _sort_key(t: Node) -> int: @staticmethod def filter_tosa_compile_specs( - compile_spec: List[CompileSpec], - ) -> List[CompileSpec]: + compile_spec: ArmCompileSpec, + ) -> TosaCompileSpec: """ Filter out the CompileSpec elements relevant for the TOSA backend. This is needed to compose a backend targetting hardware IP with the @@ -214,17 +200,9 @@ def filter_tosa_compile_specs( flatbuffer can then be consumed by the backend targetting specific hardware. """ - tosa_compile_spec = [] - tosa_compile_spec.append(CompileSpec("output_format", "tosa".encode())) - - # Copy everything that's TOSA generic - tosa_backend_compile_spec_keys = [ - "tosa_spec", - "debug_artifact_path", - ] - for spec in compile_spec: - if spec.key in tosa_backend_compile_spec_keys: - tosa_compile_spec.append(CompileSpec(spec.key, spec.value)) - - return tosa_compile_spec + new_compile_spec = TosaCompileSpec.__new__(TosaCompileSpec) + new_compile_spec._set_compile_specs( + compile_spec.tosa_spec, [], compile_spec.get_intermediate_path() + ) + return new_compile_spec diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py index ab381470968..3e512847109 100644 --- a/backends/arm/tosa/partitioner.py +++ b/backends/arm/tosa/partitioner.py @@ -65,6 +65,7 @@ def __init__( self.delegation_spec = DelegationSpec( TOSABackend.__name__, compile_spec.to_list() ) + self.tosa_spec = compile_spec.tosa_spec self.additional_checks = additional_checks self.tosa_spec = compile_spec.tosa_spec @@ -75,13 +76,13 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: # no logger.info("TOSAPartitioner::partition") partition_tags: dict[str, DelegationSpec] = {} - tosa_spec = self.tosa_spec - - logger.info(f"Partitioning for {self.delegation_spec.backend_id}: {tosa_spec}") + logger.info( + f"Partitioning for {self.delegation_spec.backend_id}: {self.tosa_spec}" + ) reporter = WhyNoPartitionReporter() operator_support = tosa_support_factory( - tosa_spec, exported_program, reporter, self.additional_checks + self.tosa_spec, exported_program, reporter, self.additional_checks ) capability_partitioner = CapabilityBasedPartitioner( exported_program.graph_module, @@ -131,7 +132,7 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: break continue - if tosa_spec.support_float(): + if self.tosa_spec.support_float(): continue if is_partitioned(node): @@ -163,7 +164,7 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: ) tag_constant_data(exported_program) - logger.info(f"The following nodes were rejected for {tosa_spec}:") + logger.info(f"The following nodes were rejected for {self.tosa_spec}:") logger.info("\n" + 
reporter.get_table_report()) logger.info("(Placeholders and outputs are not included in this list)") return PartitionResult( @@ -213,8 +214,7 @@ def filter_fn(node: torch.fx.Node) -> bool: torch.ops.aten.logit.default, ] + ops_to_not_decompose_if_quant_op - tosa_spec = self.tosa_spec - if not tosa_spec.is_U55_subset: + if not self.tosa_spec.is_U55_subset: # Tosa operator "RESIZE" is not supported on U55. Since upsample_bilinear2d # and upsample_nearest2d decompose into that it will not be possible to # delegate those operators on U55. If we have said here to not decompose diff --git a/backends/arm/tosa/specification.py b/backends/arm/tosa/specification.py index 92b68955cdd..b372cd5a636 100644 --- a/backends/arm/tosa/specification.py +++ b/backends/arm/tosa/specification.py @@ -15,10 +15,6 @@ import re from typing import List -from executorch.exir.backend.compile_spec_schema import ( # type: ignore[import-not-found] - CompileSpec, -) - from packaging.version import Version @@ -199,10 +195,3 @@ def get_context_spec() -> TosaSpecification: return TosaLoweringContext.tosa_spec_var.get() except LookupError: raise RuntimeError("Function must be executed within a TosaLoweringContext") - - -def get_tosa_spec(compile_spec: List[CompileSpec]) -> TosaSpecification: - for spec in compile_spec: - if spec.key == "tosa_spec": - return TosaSpecification.create_from_string(spec.value.decode()) - raise ValueError("Could not find TOSA version in CompileSpec") diff --git a/backends/arm/vgf/backend.py b/backends/arm/vgf/backend.py index 7c408748529..3f65456bf8b 100644 --- a/backends/arm/vgf/backend.py +++ b/backends/arm/vgf/backend.py @@ -22,6 +22,7 @@ arm_get_first_delegation_tag, TOSABackend, ) +from executorch.backends.arm.vgf.compile_spec import VgfCompileSpec from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult from executorch.exir.backend.compile_spec_schema import CompileSpec from torch.export.exported_program import ExportedProgram @@ -40,21 +41,15 @@ class VgfBackend(BackendDetails): @staticmethod def _compile_tosa_flatbuffer( tosa_flatbuffer: bytes, - compile_spec: List[CompileSpec], + compile_spec: VgfCompileSpec, tag_name: str = "", ) -> bytes: """ Static helper method to do the compilation of the TOSA flatbuffer representation to a target specific binary stream. """ - compile_flags = [] - artifact_path = None - for spec in compile_spec: - if spec.key == "compile_flags": - compile_flags.append(spec.value.decode()) - if spec.key == "debug_artifact_path": - artifact_path = spec.value.decode() - + compile_flags = compile_spec.compiler_flags + artifact_path = compile_spec.get_intermediate_path() # Pass on the TOSA flatbuffer to the vgf compiler. binary = vgf_compile(tosa_flatbuffer, compile_flags, artifact_path, tag_name) return binary @@ -62,10 +57,11 @@ def _compile_tosa_flatbuffer( @staticmethod def preprocess( edge_program: ExportedProgram, - compile_spec: List[CompileSpec], + compile_specs: List[CompileSpec], ) -> PreprocessResult: logger.info(f"{VgfBackend.__name__} preprocess") + compile_spec = VgfCompileSpec.from_list(compile_specs) # deduce TOSA compile_spec from VGF compile spec. We get a new # compile spec list, containing only elements relevant for the # TOSABackend. @@ -75,7 +71,7 @@ def preprocess( # ('All backend implementation are final...'), so use composition instead. # preprocess returns the serialized TOSA flatbuffer in .processed_bytes, # which can be passed on to next compilation step. 
- tosa_preprocess = TOSABackend.preprocess(edge_program, tosa_compile_spec) + tosa_preprocess = TOSABackend._preprocess(edge_program, tosa_compile_spec) tag_name = arm_get_first_delegation_tag(edge_program.graph_module) diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 27f9c00f4ac..d547a1ed555 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -344,6 +344,7 @@ python_unittest( typing = True, deps = [ ":ops_registrations", + ":typing_stubs", ":type_dispatch", "//caffe2:torch", "//executorch/backends/cadence/aot:graph_builder", diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index 196480931e0..c8e7d6cb3fc 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -359,6 +359,26 @@ - arg_meta: null kernel_name: impl::reference::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out +- func: cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out + - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index cf4c5a8fffb..1b62c215ab6 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -370,6 +370,26 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out +- func: cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out + - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index 35b4cbf3902..efb22a9e7d6 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -169,6 +169,30 @@ lib.define( "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) +lib.define( + "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" +) lib.define( "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) @@ -2153,6 +2177,150 @@ def roi_align_box_processor_meta( return rois.new_empty((rois.shape[0], 80), dtype=torch.uint8) +@register_fake("cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + assert ( + input.dtype == torch.int8 + and weight.dtype == torch.int8 + and bias.dtype == torch.int32 + ) + out_channels, _, kernel_size = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + False, + ) + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + assert ( + input.dtype == torch.uint8 + and weight.dtype == torch.uint8 + and bias.dtype == torch.int32 + ) + out_channels, _, kernel_size = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + False, + ) + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + assert ( + input.dtype == torch.int8 + and weight.dtype == torch.int8 + and bias.dtype == torch.int32 + ) + out_channels, kernel_size, _ = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + True, + ) + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, 
+ out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + assert ( + input.dtype == torch.uint8 + and weight.dtype == torch.uint8 + and bias.dtype == torch.int32 + ) + out_channels, kernel_size, _ = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + True, + ) + return input.new_empty(output_size, dtype=input.dtype) + + @register_fake("cadence::_softmax_f32_f32") def softmax_f32_f32_meta( self: torch.Tensor, diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py index 729056ea2c8..8f106a815ac 100644 --- a/backends/cadence/aot/quantizer/fusion_pass.py +++ b/backends/cadence/aot/quantizer/fusion_pass.py @@ -15,7 +15,11 @@ BmmPattern, CatPattern, Conv1dPattern, + Conv1dReluPattern0, + Conv1dReluPattern1, Conv2dPattern, + Conv2dReluPattern0, + Conv2dReluPattern1, LayerNormPattern, LinearPattern, MatmulPattern, @@ -23,6 +27,7 @@ ReluPattern1, ) from executorch.backends.cadence.aot.quantizer.utils import ( + check_out_zero_point_is_min_range, create_zero_bias_int32, find_sequential_partitions_aten, get_conv_args, @@ -41,6 +46,13 @@ # Use this part for patterns with multiple aten ops ReluPatterns = (ReluPattern0, ReluPattern1) +ConvPatterns = (Conv1dPattern, Conv2dPattern) +ConvReluPatterns = ( + Conv1dReluPattern0, + Conv1dReluPattern1, + Conv2dReluPattern0, + Conv2dReluPattern1, +) def get_args_and_kwargs_add( @@ -432,12 +444,12 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 other_inputs = [node.args[idx] for node, idx in anchors.others] # The node is the first index of the list and first of the tuple - op_node = anchors.output[0][0] + anchor_output_node = anchors.output[0][0] - assert len(op_node.users) == 1 - quant_node = list(op_node.users.keys())[0] + assert len(anchor_output_node.users) == 1 + quant_node = list(anchor_output_node.users.keys())[0] - with graph_module.graph.inserting_after(op_node): + with graph_module.graph.inserting_after(anchor_output_node): args = tuple( inputs_inputs + weights_inputs + other_inputs + bias_inputs ) @@ -451,9 +463,29 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 ) elif isinstance(pattern, CatPattern): args, kwargs = get_args_and_kwargs_cat( - inputs_inputs, other_inputs, op_node + inputs_inputs, other_inputs, anchor_output_node + ) + elif isinstance(pattern, ConvReluPatterns): + # For ConvReLU, we are fusing Conv+ReLU + # This means that the op we want to get + # the replacement args and kwargs for is the + # *conv* op, which is the anchor input, NOT + # the anchor output (which is the ReLU) + check_out_zero_point_is_min_range( + quant_node.args[2], quant_node.args[5] + ) + anchor_input_node = anchors.inputs[0][0] + args, kwargs = get_args_and_kwargs_conv( + graph_module, + inputs_inputs, + dequants_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + quant_node, + anchor_input_node, ) - elif isinstance(pattern, (Conv1dPattern, Conv2dPattern)): + elif isinstance(pattern, ConvPatterns): args, kwargs = get_args_and_kwargs_conv( graph_module, inputs_inputs, @@ -462,7 +494,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 dequants_weights, bias_inputs, quant_node, - op_node, + anchor_output_node, ) elif isinstance(pattern, LinearPattern): args, kwargs = get_args_and_kwargs_linear( diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 
74987f8b38d..b653be27e8f 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -417,3 +417,71 @@ def partition_types(self) -> List[OpOverload]: class ReluPattern1(ReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.relu_.default] + + +# This is a base class for Conv+ReLU fusion, since it can be used with two different relu aten ops +class ConvReluBasePattern(QuantizationPattern): + @abstractmethod + def partition_types(self) -> List[OpOverload]: + pass + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # The first node should be conv, the second should be relu + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + conv_node = fused_partition[0].nodes[-1] # Second to last node + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + relu_node = fused_partition[1].nodes[-1] # Last node + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (conv_node.args[0], conv_node), + (conv_node.args[1], conv_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + # Keep bias empty if not supplied + bias = [] + if len(conv_node.args) > 2 and conv_node.args[2] is not None: + bias = [(conv_node, 2, bias_qspec)] + + return PartitionAnchors( + inputs=[(conv_node, 0)], + weights=[(conv_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(relu_node,)], # Output is from the relu node + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv_nchw.default + + +# Conv1d + regular relu op fusion +class Conv1dReluPattern0(ConvReluBasePattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv1d.default, torch.ops.aten.relu.default] + + +# Conv1d + alternate relu op fusion +class Conv1dReluPattern1(ConvReluBasePattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv1d.default, torch.ops.aten.relu_.default] + + +# Conv2d + regular relu op fusion +class Conv2dReluPattern0(ConvReluBasePattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv2d.default, torch.ops.aten.relu.default] + + +# Conv2d + alternate relu op fusion +class Conv2dReluPattern1(ConvReluBasePattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv2d.default, torch.ops.aten.relu_.default] diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index 8c78ac87e58..cce7c207a6b 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -16,7 +16,11 @@ BmmPattern, CatPattern, Conv1dPattern, + Conv1dReluPattern0, + Conv1dReluPattern1, Conv2dPattern, + Conv2dReluPattern0, + Conv2dReluPattern1, LayerNormPattern, LinearPattern, MatmulPattern, @@ -260,3 +264,22 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8W8)) quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8W8)) super().__init__(quantizers) + + +class CadenceFusedConvReluQuantizer(CadenceQuantizer): + """ + Quantizer using fused conv+relu patterns, and including add and cat + """ + + def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: + if quantizers is None: + quantizers = [] + # Order 
matters here, perform the "fused" patterns first + quantizers.append(CadenceAtenQuantizer(Conv1dReluPattern0(), qconfig_A8W8sym)) + quantizers.append(CadenceAtenQuantizer(Conv1dReluPattern1(), qconfig_A8W8sym)) + quantizers.append(CadenceAtenQuantizer(Conv2dReluPattern0(), qconfig_A8W8sym)) + quantizers.append(CadenceAtenQuantizer(Conv2dReluPattern1(), qconfig_A8W8sym)) + quantizers = quantizers + get_cadence_default_quantizers() + quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8W8)) + quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8W8)) + super().__init__(quantizers) diff --git a/backends/cadence/aot/quantizer/utils.py b/backends/cadence/aot/quantizer/utils.py index beacd1b9e86..68fc6740cb4 100644 --- a/backends/cadence/aot/quantizer/utils.py +++ b/backends/cadence/aot/quantizer/utils.py @@ -234,3 +234,19 @@ def find_sequential_partitions_aten( if _partitions_sequential(candidate): fused_partitions.append(candidate) return fused_partitions + + +def check_out_zero_point_is_min_range( + out_zero_point: int, + out_dtype: torch.dtype, +) -> bool: + """ + Checks if the out_zero_point is the minimum range of the quant type. + """ + if out_dtype == torch.int8: + return out_zero_point == -128 + elif out_dtype == torch.int16: + return out_zero_point == -32768 + elif out_dtype == torch.uint8 or torch.uint16: + return out_zero_point == 0 + return False diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py index 40ae6d23085..2a53c2dde7a 100644 --- a/backends/cadence/aot/ref_implementations.py +++ b/backends/cadence/aot/ref_implementations.py @@ -127,14 +127,14 @@ def dequantize_per_tensor( return (input_tensor - zero_point).to(dtype) * scale -@impl(m, "quantized_add") -def quantized_add( +@impl(m, "quantized_add.per_tensor") +def quantized_add_per_tensor( X: torch.Tensor, - X_scale: torch.Tensor, - X_zero_point: torch.Tensor, + X_scale: float, + X_zero_point: int, Y: torch.Tensor, - Y_scale: torch.Tensor, - Y_zero_point: torch.Tensor, + Y_scale: float, + Y_zero_point: int, out_scale: float, out_zero_point: int, ) -> torch.Tensor: @@ -149,17 +149,17 @@ def quantized_add( out = (X_scale(X - X_zero_point) + Y_scale(Y - Y_zero_point)) / out_scale + out_zero_point Args: - - X (Tensor): The first operand - - X_scale (Tensor): The ratio between the sizes of X's floating point and quantized + - X: The first operand + - X_scale: The ratio between the sizes of X's floating point and quantized ranges - - X_zero_point (Tensor): The quantized mapping of zero for X - - Y (Tensor): The second operand - - Y_scale (Tensor): The ratio between the sizes of Y's floating point and quantized + - X_zero_point: The quantized mapping of zero for X + - Y: The second operand + - Y_scale: The ratio between the sizes of Y's floating point and quantized ranges - - Y_zero_point (Tensor): The quantized mapping of zero for Y - - out_scale (float): The ratio between the sizes of the output's floating point and + - Y_zero_point: The quantized mapping of zero for Y + - out_scale: The ratio between the sizes of the output's floating point and quantized ranges - - out_zero_point (int): The quantized mapping of zero for the output + - out_zero_point: The quantized mapping of zero for the output """ supported_dtypes = [torch.int8, torch.uint8] if X.dtype != Y.dtype: @@ -193,13 +193,55 @@ def quantized_add( ) +@impl(m, "quantized_add_asym8sxasym8s_asym8s.per_tensor") +def quantized_add_asym8sxasym8s_asym8s_per_tensor( + X: torch.Tensor, + X_scale: float, + 
X_zero_point: int, + Y: torch.Tensor, + Y_scale: float, + Y_zero_point: int, + out_scale: float, + out_zero_point: int, +) -> torch.Tensor: + if X.dtype != torch.int8: + raise ValueError("X dtype must be torch.int8") + if Y.dtype != torch.int8: + raise ValueError("Y dtype must be torch.int8") + + return quantized_add_per_tensor( + X, X_scale, X_zero_point, Y, Y_scale, Y_zero_point, out_scale, out_zero_point + ) + + +@impl(m, "quantized_add_asym8uxasym8u_asym8u.per_tensor") +def quantized_add_asym8uxasym8u_asym8u_per_tensor( + X: torch.Tensor, + X_scale: float, + X_zero_point: int, + Y: torch.Tensor, + Y_scale: float, + Y_zero_point: int, + out_scale: float, + out_zero_point: int, +) -> torch.Tensor: + if X.dtype != torch.uint8: + raise ValueError("X dtype must be torch.int8") + if Y.dtype != torch.uint8: + raise ValueError("Y dtype must be torch.int8") + + return quantized_add_per_tensor( + X, X_scale, X_zero_point, Y, Y_scale, Y_zero_point, out_scale, out_zero_point + ) + + def quantized_linear_common( src: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, in_zero_point: int, weight_zero_point: torch.Tensor | int, - out_multiplier: torch.Tensor | int, + out_multiplier: int, out_shift: int, out_zero_point: int, ) -> torch.Tensor: @@ -287,34 +329,30 @@ def variant( assert isinstance(weight_zero_point, int) assert isinstance(out_multiplier, int) assert isinstance(out_shift, int) - return quantized_linear_common( - src, - weight, - bias, - in_zero_point, - weight_zero_point, - out_multiplier, - out_shift, - out_zero_point, - ) + _out_shift = out_shift + _out_multiplier = out_multiplier else: assert isinstance(out_shift, torch.Tensor) + assert isinstance(out_multiplier, torch.Tensor) if out_shift.numel() != 1: raise ValueError("out_shift must be a scalar") if out_shift.dtype != torch.int64: raise ValueError("out_shift must be an int64") - return quantized_linear_common( - src, - weight, - bias, - in_zero_point, - weight_zero_point, - out_multiplier, - int(out_shift.item()), - out_zero_point, - ) + _out_shift = int(out_shift.item()) + _out_multiplier = int(out_multiplier[0].item()) + + return quantized_linear_common( + src, + weight, + bias, + in_zero_point, + weight_zero_point, + _out_multiplier, + _out_shift, + out_zero_point, + ) return variant @@ -361,6 +399,112 @@ def quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor() -> torch.Tensor: def quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor() -> torch.Tensor: ... +@impl(m, "quantized_matmul") +def quantized_matmul( + X: torch.Tensor, + X_zero_point: int, + Y: torch.Tensor, + Y_zero_point: int, + bias: torch.Tensor | None, + out_multiplier: int, + out_shift: int, + out_zero_point: int, + transposed: bool = False, +) -> torch.Tensor: + """ + Quantized matmul operation. 
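+
+ Reference semantics (as implemented below via quantized_linear_common):
+ the operands, offset by their zero points, are matrix-multiplied and the
+ int32 accumulator is requantized with out_multiplier/out_shift and offset
+ by out_zero_point. When `transposed` is False, Y is transposed here because
+ the underlying linear helper expects a pre-transposed weight; bias must be
+ None or all zeros, since it is unused in the out variant.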
+ + Args: + - X (Tensor): The activations tensor + - X_zero_point (int): The quantized mapping of zero for the input + - Y (Tensor): The weight tensor + - Y_zero_point (int): The quantized mapping of zero for the weight + - bias (Tensor): The bias tensor + - out_multiplier (int): The multiplier used to scale the output + - out_shift (int): The shift used to scale the output + - out_zero_point (int): The quantized mapping of zero for the output + - transposed (bool): Whether to transpose the weight tensor + """ + if bias is not None and not torch.all(bias == 0): + raise ValueError("bias must be None or all zeros since unused in out variant") + + # Looks weird, but quantized linear assumes weights are pre-transposed, + # hence we transpose only if `transposed` is False. + if not transposed: + Y = Y.T + + return quantized_linear_common( + X, + Y, + bias or torch.zeros(1, dtype=torch.int32), + X_zero_point, + Y_zero_point, + out_multiplier, + out_shift, + out_zero_point, + ) + + +@impl(m, "quantized_matmul_asym8sxasym8s_asym8s") +def quantized_matmul_asym8sxasym8s_asym8s( + X: torch.Tensor, + X_zero_point: int, + Y: torch.Tensor, + Y_zero_point: int, + bias: torch.Tensor | None, + out_multiplier: int, + out_shift: int, + out_zero_point: int, + transposed: bool = False, +) -> torch.Tensor: + if X.dtype != torch.int8: + raise ValueError("X dtype must be torch.int8") + if Y.dtype != torch.int8: + raise ValueError("Y dtype must be torch.int8") + + return quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + ) + + +@impl(m, "quantized_matmul_asym8uxasym8u_asym8u") +def quantized_matmul_asym8uxasym8u_asym8u( + X: torch.Tensor, + X_zero_point: int, + Y: torch.Tensor, + Y_zero_point: int, + bias: torch.Tensor | None, + out_multiplier: int, + out_shift: int, + out_zero_point: int, + transposed: bool = False, +) -> torch.Tensor: + if X.dtype != torch.uint8: + raise ValueError("X dtype must be torch.uint8") + if Y.dtype != torch.uint8: + raise ValueError("Y dtype must be torch.uint8") + + return quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + ) + + @impl(m, "quantized_layer_norm.per_tensor") def quantized_layer_norm_per_tensor( input_tensor: torch.Tensor, @@ -613,6 +757,7 @@ def quantized_conv_variant( layout: str, input_dtype: torch.dtype, weight_dtype: torch.dtype, + is_1d: bool = False, ) -> Callable[[Callable[..., torch.Tensor]], Callable[..., torch.Tensor]]: """Create a quantized conv variant with type checking.""" @@ -644,6 +789,14 @@ def variant( bias.dtype == torch.int32 ), f"Expected bias dtype int32, got {bias.dtype}" + if is_1d: + assert ( + len(input_tensor.shape) == 3 + ), f"1D convolution requires 3D input tensor, got {len(input_tensor.shape)}D" + assert ( + len(weight.shape) == 3 + ), f"1D convolution requires 3D weight tensor, got {len(weight.shape)}D" + # Call the appropriate base function match layout: case "nchw": @@ -748,6 +901,26 @@ def quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor() -> torch.Tens def quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +@impl(m, "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor") +@quantized_conv_variant("nchw", torch.int8, torch.int8, is_1d=True) +def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... 
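+
+ # The quantized_conv1d_* per-tensor variants reuse the same reference conv
+ # implementation; is_1d=True only adds the 3-D shape checks. As an
+ # illustration (assuming the usual conv1d output-length formula
+ # L_out = (L + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1,
+ # and groups == 1, which is what the type-dispatch pass routes here):
+ #   nchw: input (N, C_in, L), weight (C_out, C_in, K) -> output (N, C_out, L_out)
+ #   nhwc: input (N, L, C_in), weight (C_out, K, C_in) -> output (N, L_out, C_out)
+ #   e.g. x: int8 (1, 3, 8), w: int8 (16, 3, 3), b: int32 (16,), stride=[1, 1],
+ #   padding=[0, 0], dilation=[1, 1]  ->  out: int8 (1, 16, 6)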
+ + +@impl(m, "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor") +@quantized_conv_variant("nchw", torch.uint8, torch.uint8, is_1d=True) +def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... + + +@impl(m, "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor") +@quantized_conv_variant("nhwc", torch.int8, torch.int8, is_1d=True) +def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... + + +@impl(m, "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor") +@quantized_conv_variant("nhwc", torch.uint8, torch.uint8, is_1d=True) +def quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... + + def quantized_relu_common( X: torch.Tensor, X_zero_point: torch.Tensor | int, diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py index 04b3e8e75ba..30b30e085dc 100644 --- a/backends/cadence/aot/tests/test_ref_implementations.py +++ b/backends/cadence/aot/tests/test_ref_implementations.py @@ -100,7 +100,7 @@ def test_dequantize_per_tensor( [ # Only these types need to be tested as per ET_FORALL_JARVIS_QUANTIZED_TYPES in # on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/operators.h - ("int16", 5, 0.8, 4, 5, 0.8, 4, 0.8, 4, 6, torch.int8), + ("int8", 5, 0.8, 4, 5, 0.8, 4, 0.8, 4, 6, torch.int8), ("uint8", 5, 0.8, 4, 5, 0.8, 4, 0.8, 4, 6, torch.uint8), ] ) @@ -122,13 +122,34 @@ def test_quantized_add( Y_tensor = torch.tensor([Y], dtype=dtype) expected_output = torch.tensor([expected_value], dtype=dtype) + quantized_add = ( + torch.ops.cadence.quantized_add_asym8sxasym8s_asym8s.per_tensor + if dtype == torch.int8 + else torch.ops.cadence.quantized_add_asym8uxasym8u_asym8u.per_tensor + ) + output = quantized_add( + X_tensor, + X_scale, + X_zero_point, + Y_tensor, + Y_scale, + Y_zero_point, + out_scale, + out_zero_point, + ) + + self.assertTrue( + torch.equal(output, expected_output), + f"Values don't match in {name}: got {output}, expected {expected_output}", + ) + output = torch.ops.cadence.quantized_add( X_tensor, - torch.tensor(X_scale), - torch.tensor(X_zero_point, dtype=dtype), + X_scale, + X_zero_point, Y_tensor, - torch.tensor(Y_scale), - torch.tensor(Y_zero_point, dtype=dtype), + Y_scale, + Y_zero_point, out_scale, out_zero_point, ) @@ -156,6 +177,8 @@ def test_quantized_add( 0, # out_zero_point torch.tensor([[-2]], dtype=dtype), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ( (False, torch.int8), @@ -179,6 +202,8 @@ def test_quantized_add( 0, # out_zero_point torch.tensor([[-10, -30]], dtype=dtype), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ( (False, torch.int8), @@ -204,6 +229,8 @@ def test_quantized_add( [[[-2, -8, -14], [-6, -28, -50]]], dtype=dtype ), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ( (False, torch.int8), @@ -227,6 +254,8 @@ def test_quantized_add( 1, # out_zero_point torch.tensor([[-15, 25]], dtype=dtype), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ( (False, torch.int8), @@ -250,6 +279,8 @@ def test_quantized_add( 1, # out_zero_point torch.tensor([[-23, 17]], dtype=dtype), # expected_output False, + False, + False, ) for dtype in (torch.int8, torch.uint8) ], @@ -271,9 +302,34 @@ def test_quantized_add( 1, # out_zero_point torch.tensor([[-7, 13]], dtype=dtype), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ((False, torch.int8), (True, torch.int8)) ], + *[ + ( + 
torch.Size([1, 2]), # src_shape: 1 sample, 2 input features + torch.Size( + [2, 2] + ), # weight_shape: 2 output features, 2 input features + 2, # in_zero_point + torch.tensor([1, 1], dtype=dtype), # weight_zero_point + torch.tensor( + [268435456], dtype=torch.int32 + ), # out_multiplier (0.125 * 2^31) + torch.tensor( + [1], dtype=torch.int64 + ), # out_shift (shift=1, doubles the scale) + 1, # out_zero_point + torch.tensor([[-7, 17]], dtype=dtype), # expected_output + per_tensor, + matmul, + transposed_matmul, + ) + for (matmul, transposed_matmul) in ((True, False), (True, True)) + for (per_tensor, dtype) in ((True, torch.int8), (True, torch.uint8)) + ], ] ) def test_quantized_linear( @@ -287,7 +343,12 @@ def test_quantized_linear( out_zero_point: int, expected_output: torch.Tensor, per_tensor: bool, + matmul: bool, + transposed_matmul: bool, ) -> None: + if not per_tensor and matmul: + self.skipTest("Only per_tensor supported for matmul") + src = ( torch.arange(np.prod(src_shape)) .reshape(src_shape) @@ -298,7 +359,9 @@ def test_quantized_linear( .reshape(weight_shape) .to(expected_output.dtype) ) - bias = torch.arange(weight_shape[0]).to(torch.int32) + if matmul and not transposed_matmul: + weight = weight.T + if per_tensor: weight_zero_point = weight_zero_point[0] out_multiplier = out_multiplier[0] @@ -307,20 +370,34 @@ def test_quantized_linear( if per_tensor: match expected_output.dtype: case torch.int8: - linear_ops = ( - torch.ops.cadence.quantized_linear_asym8sxasym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_fully_connected_asym8sxasym8s_asym8s.per_tensor, - ) + if matmul: + linear_ops = ( + # Doesn't have per tensor name, but it is per tensor + torch.ops.cadence.quantized_matmul_asym8sxasym8s_asym8s, + ) + else: + linear_ops = ( + torch.ops.cadence.quantized_linear_asym8sxasym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_fully_connected_asym8sxasym8s_asym8s.per_tensor, + ) case torch.uint8: - linear_ops = ( - torch.ops.cadence.quantized_linear_asym8uxasym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_fully_connected_asym8uxasym8u_asym8u.per_tensor, - ) + if matmul: + linear_ops = ( + torch.ops.cadence.quantized_matmul_asym8uxasym8u_asym8u, + ) + else: + linear_ops = ( + torch.ops.cadence.quantized_linear_asym8uxasym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_fully_connected_asym8uxasym8u_asym8u.per_tensor, + ) case _: - linear_ops = ( - torch.ops.cadence.quantized_linear.per_tensor, - torch.ops.cadence.quantized_fully_connected.per_tensor, - ) + if matmul: + linear_ops = (torch.ops.cadence.quantized_matmul,) + else: + linear_ops = ( + torch.ops.cadence.quantized_linear.per_tensor, + torch.ops.cadence.quantized_fully_connected.per_tensor, + ) else: linear_ops = ( torch.ops.cadence.quantized_linear, @@ -328,17 +405,40 @@ def test_quantized_linear( ) for linear_op in linear_ops: - output = linear_op( - src, - weight, - bias, - in_zero_point, - weight_zero_point, - out_multiplier, - out_shift, - out_zero_point, - typing.cast(torch.Tensor, None), + # Get the function name for linear_op for debugging + op_name = ( + linear_op.__name__ if hasattr(linear_op, "__name__") else str(linear_op) ) + if matmul: + assert "quantized_matmul" in op_name + output = linear_op( + src, + in_zero_point, + weight, + weight_zero_point, + None, + out_multiplier, + out_shift, + out_zero_point, + transposed_matmul, + ) + else: + assert ( + "quantized_linear" in op_name + or "quantized_fully_connected" in op_name + ) + bias = torch.arange(weight_shape[0]).to(torch.int32) + output = 
linear_op( + src, + weight, + bias, + in_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + typing.cast(torch.Tensor, None), + ) self.assertTrue(output.dtype == expected_output.dtype, "Dtype mismatch") diff --git a/backends/cadence/aot/tests/test_type_dispatch_passes.py b/backends/cadence/aot/tests/test_type_dispatch_passes.py index 52904aecb41..4ae10ea83dd 100644 --- a/backends/cadence/aot/tests/test_type_dispatch_passes.py +++ b/backends/cadence/aot/tests/test_type_dispatch_passes.py @@ -13,41 +13,36 @@ from executorch.backends.cadence.aot.graph_builder import single_op_builder from executorch.backends.cadence.aot.pass_utils import count_node from executorch.backends.cadence.aot.type_dispatch import CompileTimeTypeDispatchPass +from executorch.backends.cadence.aot.typing_stubs import expand from executorch.exir.dialects._ops import ops as exir_ops from torch.fx.passes.infra.pass_base import PassResult class TestTypeDispatchPasses(unittest.TestCase): - def test_int8_dispatch_quantized_fully_connected(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant""" - x = torch.randint(-128, 127, (1, 3), dtype=torch.int8) - w = torch.randint(-128, 127, (4, 3), dtype=torch.int8) - b = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_fully_connected.per_tensor, - args=(x, w, b, 0, 0, 1, 0, 0, None), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_fully_connected.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_fully_connected_asym8sxasym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_fully_connected(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant""" - x = torch.randint(0, 255, (1, 3), dtype=torch.uint8) - w = torch.randint(0, 255, (4, 3), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_fully_connected_asym8uxasym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_fully_connected( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_fully_connected dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (1, 3), dtype=dtype) + w = torch.randint(min_val, max_val, (4, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), @@ -61,45 +56,33 @@ def test_uint8_dispatch_quantized_fully_connected(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_fully_connected.per_tensor), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_fully_connected_asym8uxasym8u_asym8u.per_tensor, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_linear(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_linear""" - x = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - w = torch.randint(-128, 127, (4, 
3), dtype=torch.int8) - b = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_linear.per_tensor, - args=(x, w, b, 0, 0, 1, 0, 0, None), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_linear.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_linear_asym8sxasym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_quantized_linear_dispatch(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_linear""" - x = torch.randint(0, 255, (2, 3), dtype=torch.uint8) - w = torch.randint(0, 255, (4, 3), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_linear_asym8uxasym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_linear( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_linear dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (2, 3), dtype=dtype) + w = torch.randint(min_val, max_val, (4, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), @@ -113,14 +96,8 @@ def test_uint8_quantized_linear_dispatch(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_linear.per_tensor), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_linear_asym8uxasym8u_asym8u.per_tensor, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) def test_mixed_types_error(self) -> None: """Test mixed int8/uint8 inputs should raise RuntimeError""" @@ -138,33 +115,29 @@ def test_mixed_types_error(self) -> None: cast(PassResult, p(gm)).graph_module self.assertIn("Unsupported input types", str(context.exception)) - def test_int8_dispatch_quantized_relu(self) -> None: - """Test int8 input should dispatch to asym8s_asym8s variant for quantized_relu""" - x = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - gm = single_op_builder( - placeholders=(x,), - op=exir_ops.edge.cadence.quantized_relu.per_tensor, - args=(x, 0, 0, 1, 0), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_relu_asym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_relu(self) -> None: - """Test uint8 input should dispatch to asym8u_asym8u variant for quantized_relu""" - x = torch.randint(0, 255, (2, 3), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_relu_asym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_relu( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_relu dispatches to correct dtype-specific variant""" + min_val, max_val = 
torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (2, 3), dtype=dtype) gm = single_op_builder( placeholders=(x,), op=exir_ops.edge.cadence.quantized_relu.per_tensor, @@ -177,45 +150,33 @@ def test_uint8_dispatch_quantized_relu(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_relu_asym8u_asym8u.per_tensor, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_matmul(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_matmul""" - x = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - y = torch.randint(-128, 127, (3, 4), dtype=torch.int8) - bias = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, y, bias), - op=exir_ops.edge.cadence.quantized_matmul.default, - args=(x, 0, y, 0, bias, 1, 0, 0, False), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_matmul.default), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_matmul_asym8sxasym8s_asym8s.default, ), - 1, - ) - - def test_uint8_dispatch_quantized_matmul(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_matmul""" - x = torch.randint(0, 255, (2, 3), dtype=torch.uint8) - y = torch.randint(0, 255, (3, 4), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_matmul_asym8uxasym8u_asym8u.default, + ), + ] + ) + def test_dispatch_quantized_matmul( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_matmul dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (2, 3), dtype=dtype) + y = torch.randint(min_val, max_val, (3, 4), dtype=dtype) bias = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, y, bias), @@ -229,252 +190,204 @@ def test_uint8_dispatch_quantized_matmul(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_matmul.default), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_matmul_asym8uxasym8u_asym8u.default, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_conv_nchw(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_conv_nchw""" - x = torch.randint(-128, 127, (1, 3, 8, 8), dtype=torch.int8) - w = torch.randint(-128, 127, (16, 3, 3, 3), dtype=torch.int8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - 
count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8_nchw", + torch.int8, + (1, 3, 8, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nchw(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_conv_nchw""" - x = torch.randint(0, 255, (1, 3, 8, 8), dtype=torch.uint8) - w = torch.randint(0, 255, (16, 3, 3, 3), dtype=torch.uint8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, + ( + "uint8_nchw", + torch.uint8, + (1, 3, 8, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor, ), - 1, - ) - - def test_int8_dispatch_quantized_conv_nhwc(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_conv_nhwc""" - x = torch.randint(-128, 127, (1, 8, 8, 3), dtype=torch.int8) - w = torch.randint(-128, 127, (16, 3, 3, 3), dtype=torch.int8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + ( + "int8_nhwc", + torch.int8, + (1, 8, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, exir_ops.edge.cadence.quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nhwc(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_conv_nhwc""" - x = torch.randint(0, 255, (1, 8, 8, 3), dtype=torch.uint8) - w = torch.randint(0, 255, (16, 3, 3, 3), dtype=torch.uint8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, + ( + "uint8_nhwc", + torch.uint8, + (1, 8, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, exir_ops.edge.cadence.quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor, ), - 1, - 
) - - def test_int8_dispatch_quantized_conv_nchw_dilated(self) -> None: - """Test int8 x int8 inputs with dilation should dispatch to dilated_asym8sxasym8s_asym8s variant for quantized_conv_nchw_dilated""" - x = torch.randint(-128, 127, (1, 3, 8, 8), dtype=torch.int8) - w = torch.randint(-128, 127, (16, 3, 3, 3), dtype=torch.int8) + ] + ) + def test_dispatch_quantized_conv_2d( + self, + _: str, + dtype: torch.dtype, + x_shape: tuple[int, ...], + original_op: torch._ops.OpOverload, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_conv_2d (nchw/nhwc) dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, x_shape, dtype=dtype) + w = torch.randint(min_val, max_val, (16, 3, 3, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [2, 2], 1, 0, 0, 1.0, 1.0, 0, 1, 1), + op=original_op, + args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), ) p = CompileTimeTypeDispatchPass() gm = cast(PassResult, p(gm)).graph_module # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + self.assertEqual(count_node(gm, original_op), 0) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) + + @expand( + [ + ( + "int8_nchw_dilated", + torch.int8, + (1, 3, 8, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nchw_dilated(self) -> None: - """Test uint8 x uint8 inputs with dilation should dispatch to dilated_asym8uxasym8u_asym8u variant for quantized_conv_nchw""" - x = torch.randint(0, 255, (1, 3, 8, 8), dtype=torch.uint8) - w = torch.randint(0, 255, (16, 3, 3, 3), dtype=torch.uint8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [2, 2], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, + ( + "uint8_nchw_dilated", + torch.uint8, + (1, 3, 8, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, ), - 1, - ) - - def test_int8_dispatch_quantized_conv_nhwc_dilated(self) -> None: - """Test int8 x int8 inputs with dilation should dispatch to dilated_asym8sxasym8s_asym8s variant for quantized_conv_nhwc""" - x = torch.randint(-128, 127, (1, 8, 8, 3), dtype=torch.int8) - w = torch.randint(-128, 127, (16, 3, 3, 3), dtype=torch.int8) + ( + "int8_nhwc_dilated", + torch.int8, + (1, 8, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, + ), + ( + "uint8_nhwc_dilated", + torch.uint8, + (1, 8, 8, 3), # x_shape + 
exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_conv_2d_dilated( + self, + _: str, + dtype: torch.dtype, + x_shape: tuple[int, ...], + original_op: torch._ops.OpOverload, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_conv_2d with dilation dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, x_shape, dtype=dtype) + w = torch.randint(min_val, max_val, (16, 3, 3, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + op=original_op, args=(x, w, b, [1, 1], [0, 0], [2, 2], 1, 0, 0, 1.0, 1.0, 0, 1, 1), ) p = CompileTimeTypeDispatchPass() gm = cast(PassResult, p(gm)).graph_module # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, - ), - 1, - ) + self.assertEqual(count_node(gm, original_op), 0) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_uint8_dispatch_quantized_conv_nhwc_dilated(self) -> None: - """Test uint8 x uint8 inputs with dilation should dispatch to dilated_asym8uxasym8u_asym8u variant for quantized_conv_nhwc""" - x = torch.randint(0, 255, (1, 8, 8, 3), dtype=torch.uint8) - w = torch.randint(0, 255, (16, 3, 3, 3), dtype=torch.uint8) + @expand( + [ + ( + "int8_nchw_1d", + torch.int8, + (1, 3, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor, + ), + ( + "uint8_nchw_1d", + torch.uint8, + (1, 3, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor, + ), + ( + "int8_nhwc_1d", + torch.int8, + (1, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor, + ), + ( + "uint8_nhwc_1d", + torch.uint8, + (1, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_conv_1d( + self, + _: str, + dtype: torch.dtype, + x_shape: tuple[int, ...], + original_op: torch._ops.OpOverload, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_conv_1d (nchw/nhwc) dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, x_shape, dtype=dtype) + w = torch.randint(min_val, max_val, (16, 3, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [2, 2], 1, 0, 0, 1.0, 1.0, 0, 1, 1), + op=original_op, + args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), ) p = CompileTimeTypeDispatchPass() gm = cast(PassResult, p(gm)).graph_module # Original op should be replaced - self.assertEqual( - count_node(gm, 
exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, - ), - 1, - ) + self.assertEqual(count_node(gm, original_op), 0) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_add(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_add""" - x = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - y = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - gm = single_op_builder( - placeholders=(x, y), - op=exir_ops.edge.cadence.quantized_add.per_tensor, - args=(x, 1.0, 0, y, 1.0, 0, 1.0, 0), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_add.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_add_asym8sxasym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_add(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_add""" - x = torch.randint(0, 255, (2, 3), dtype=torch.uint8) - y = torch.randint(0, 255, (2, 3), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_add_asym8uxasym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_add( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_add dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (2, 3), dtype=dtype) + y = torch.randint(min_val, max_val, (2, 3), dtype=dtype) gm = single_op_builder( placeholders=(x, y), op=exir_ops.edge.cadence.quantized_add.per_tensor, @@ -487,158 +400,62 @@ def test_uint8_dispatch_quantized_add(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_add.per_tensor), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_add_asym8uxasym8u_asym8u.per_tensor, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_conv_nchw_depthwise(self) -> None: - """Test int8 x int8 inputs with depthwise should dispatch to depthwise_asym8sxsym8s_asym8s variant for quantized_conv_nchw""" - # Depthwise convolution: groups == input_channels - x = torch.randint(-128, 127, (1, 3, 8, 8), dtype=torch.int8) - w = torch.randint( - -128, 127, (3, 1, 3, 3), dtype=torch.int8 - ) # groups=3, input_channels=3 - b = torch.randint(-2147483648, 2147483647, (3,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=( - x, - w, - b, - [1, 1], - [0, 0], - [1, 1], - 3, - 0, - 0, - 1.0, - 1.0, - 0, - 1, - 1, - ), # groups=3 - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with int8 depthwise specific variant - self.assertEqual( - count_node( - gm, + 
@expand( + [ + ( + "int8_nchw_depthwise", + torch.int8, + (1, 3, 8, 8), # x_shape + (3, 1, 3, 3), # w_shape (groups=3, input_channels=3) + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nchw_depthwise(self) -> None: - """Test uint8 x uint8 inputs with depthwise should dispatch to depthwise_asym8uxasym8u_asym8u variant for quantized_conv_nchw""" - # Depthwise convolution: groups == input_channels - x = torch.randint(0, 255, (1, 3, 8, 8), dtype=torch.uint8) - w = torch.randint( - 0, 255, (3, 1, 3, 3), dtype=torch.uint8 - ) # groups=3, input_channels=3 - b = torch.randint(-2147483648, 2147483647, (3,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=( - x, - w, - b, - [1, 1], - [0, 0], - [1, 1], - 3, - 0, - 0, - 1.0, - 1.0, - 0, - 1, - 1, - ), # groups=3 - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with uint8 depthwise specific variant - self.assertEqual( - count_node( - gm, + ( + "uint8_nchw_depthwise", + torch.uint8, + (1, 3, 8, 8), # x_shape + (3, 1, 3, 3), # w_shape (groups=3, input_channels=3) + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, ), - 1, - ) - - def test_int8_dispatch_quantized_conv_nhwc_depthwise(self) -> None: - """Test int8 x int8 inputs with depthwise should dispatch to depthwise_asym8sxsym8s_asym8s variant for quantized_conv_nhwc""" - # Depthwise convolution: groups == input_channels - x = torch.randint(-128, 127, (1, 8, 8, 3), dtype=torch.int8) - w = torch.randint( - -128, 127, (3, 3, 3, 1), dtype=torch.int8 - ) # groups=3, input_channels=3 - b = torch.randint(-2147483648, 2147483647, (3,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - args=( - x, - w, - b, - [1, 1], - [0, 0], - [1, 1], - 3, - 0, - 0, - 1.0, - 1.0, - 0, - 1, - 1, - ), # groups=3 - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with int8 depthwise specific variant - self.assertEqual( - count_node( - gm, + ( + "int8_nhwc_depthwise", + torch.int8, + (1, 8, 8, 3), # x_shape + (3, 3, 3, 1), # w_shape (groups=3, input_channels=3) + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nhwc_depthwise(self) -> None: - """Test uint8 x uint8 inputs with depthwise should dispatch to depthwise_asym8uxasym8u_asym8u variant for quantized_conv_nhwc""" - # Depthwise convolution: groups == input_channels - x = torch.randint(0, 255, (1, 8, 8, 3), dtype=torch.uint8) - w = torch.randint( - 0, 255, (3, 3, 3, 1), dtype=torch.uint8 - ) # groups=3, input_channels=3 + ( + "uint8_nhwc_depthwise", + torch.uint8, + (1, 8, 8, 3), # x_shape + (3, 3, 3, 1), # w_shape (groups=3, input_channels=3) + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + 
exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_conv_depthwise( + self, + _: str, + dtype: torch.dtype, + x_shape: tuple[int, ...], + w_shape: tuple[int, ...], + original_op: torch._ops.OpOverload, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_conv depthwise (groups == input_channels) dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, x_shape, dtype=dtype) + w = torch.randint(min_val, max_val, w_shape, dtype=dtype) b = torch.randint(-2147483648, 2147483647, (3,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + op=original_op, args=( x, w, @@ -654,20 +471,11 @@ def test_uint8_dispatch_quantized_conv_nhwc_depthwise(self) -> None: 0, 1, 1, - ), # groups=3 + ), ) p = CompileTimeTypeDispatchPass() gm = cast(PassResult, p(gm)).graph_module # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with uint8 depthwise specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, - ), - 1, - ) + self.assertEqual(count_node(gm, original_op), 0) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) diff --git a/backends/cadence/aot/type_dispatch.py b/backends/cadence/aot/type_dispatch.py index 108c4fb1a92..958a78a4808 100644 --- a/backends/cadence/aot/type_dispatch.py +++ b/backends/cadence/aot/type_dispatch.py @@ -129,6 +129,8 @@ def call_operator( type_suffix = config.type_dispatch_suffixes[dtype_key] base_name = config.base_name + typed_op_name = f"{base_name}_{type_suffix}" + if op in [ exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, @@ -140,17 +142,18 @@ def call_operator( else args[0].to_tensor().shape[-1] ) is_depthwise = groups == input_channels - - dilation = args[5] # pyre-ignore[16]: None has no attribute '__iter__'. 
- is_dilated = any(d > 1 for d in dilation) - - if is_dilated: - type_suffix = f"dilated_{type_suffix}" - elif is_depthwise: - type_suffix = f"depthwise_{type_suffix}" - - typed_op_name = f"{base_name}_{type_suffix}" + is_dilated = any(d > 1 for d in args[5]) + is_1d = len(args[0].to_tensor().shape) == 3 + + if is_depthwise: + typed_op_name = f"{base_name}_depthwise_{type_suffix}" + elif is_dilated: + typed_op_name = f"{base_name}_dilated_{type_suffix}" + elif is_1d and groups == 1: + typed_op_name = ( + f"quantized_conv1d_{base_name.split('_')[-1]}_{type_suffix}" + ) typed_op = getattr( getattr(exir_ops.edge.cadence, typed_op_name), config.variant diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index feabe6e1828..d9b60ab29cf 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -23,17 +23,9 @@ memcpy(void* dst, const void* src, size_t num_bytes) { void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) { constexpr size_t kAlignment = 16; // 16-byte alignment for vectorized operations - ET_LOG( - Info, - "Attempting to allocate %zu bytes of temp memory (16-byte aligned)", - size); Result temp_mem_res = ctx.allocate_temp(size, kAlignment); if (temp_mem_res.ok()) { void* ptr = temp_mem_res.get(); - ET_LOG( - Info, - "Successfully allocated temp memory at %p (16-byte aligned)", - ptr); return ptr; } else { ET_LOG( diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp new file mode 100644 index 00000000000..c1b5a1836a3 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -0,0 +1,189 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NCHW 1D convolution for int8 x int8 -> int8 +void xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 3; + + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + WORD32 kernel_zero_bias = -weight_zero_point; + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); + WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(WORD8)); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = batches; + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = batches; + p_out_shape[1] = input_width; + p_out_shape[2] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_width; + p_out_shape1[2] = kernel_channels; + + xa_nn_transpose_8_8( + pkernel, + p_out_shape1, + p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = pin + _n * input_channels * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8xasym8( + out_batch, + in_batch, + pkernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp new file mode 100644 index 00000000000..fae49ec97c7 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -0,0 +1,189 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
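Each of the four new conv1d kernels folds the float requantization scale into the per-channel multiplier array as a Q31 value with a fixed shift of 0: out_multiplier32[i] = bias_scale * (1 / output_scale) * 2147483648, where 2147483648 is 2^31. A minimal sketch of that conversion, assuming (as the zero shift implies) that the effective scale bias_scale / output_scale stays below 1.0:

    def to_q31_multiplier(bias_scale: float, output_scale: float) -> int:
        # Mirrors out_multiplier32[i] = bias_scale * (1.0 / output_scale) * 2**31.
        effective_scale = bias_scale / output_scale
        assert 0.0 <= effective_scale < 1.0, "a shift of 0 assumes scale < 1.0"
        return int(effective_scale * (1 << 31))

    # Illustrative values only: bias_scale 0.0005 and output_scale 0.05
    # give an effective scale of 0.01 and a multiplier of about 21_474_836.
    print(to_q31_multiplier(0.0005, 0.05))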
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 3; + + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + WORD32 kernel_zero_bias = -weight_zero_point; + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + UWORD8* ptr1 = (UWORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(UWORD8)); + UWORD8* ptr2 = (UWORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(UWORD8)); + UWORD8* pin = (UWORD8*)ALIGN_PTR(ptr1, 8); + UWORD8* pkernel = (UWORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = batches; + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = batches; + p_out_shape[1] = input_width; + p_out_shape[2] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_width; + p_out_shape1[2] = kernel_channels; + + xa_nn_transpose_8_8( + pkernel, + p_out_shape1, + p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = pin + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + pkernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp new file mode 100644 index 00000000000..a2cb591b3a7 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NHWC 1D convolution for int8 x int8 -> int8 +void xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + WORD32 kernel_zero_bias = -weight_zero_point; + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = p_inp + _n * input_channels * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8xasym8( + out_batch, + in_batch, + p_kernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp new file mode 100644 index 00000000000..441952ca189 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NHWC 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + WORD32 kernel_zero_bias = -weight_zero_point; + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = p_inp + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + p_kernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index d310396c262..fa263d4017c 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -66,6 +66,8 @@ OPERATORS = [ "quantized_conv_nchw_out", "quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out", @@ -73,6 +75,8 @@ OPERATORS = [ "quantized_conv_nhwc_out", "quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out", diff --git a/backends/cadence/reference/operators/quantized_conv_nchw_out.cpp b/backends/cadence/reference/operators/quantized_conv_nchw_out.cpp index aefa75d7047..1a4faeed250 100644 --- a/backends/cadence/reference/operators/quantized_conv_nchw_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_nchw_out.cpp @@ -496,6 +496,72 @@ void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( out); } +void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t 
out_shift, + Tensor& out) { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + } // namespace native } // namespace reference } // namespace impl diff --git a/backends/cadence/reference/operators/quantized_conv_nhwc_out.cpp b/backends/cadence/reference/operators/quantized_conv_nhwc_out.cpp index 26fbc86d5b0..21b17fb0724 100644 --- a/backends/cadence/reference/operators/quantized_conv_nhwc_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_nhwc_out.cpp @@ -417,6 +417,72 @@ void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( out); } +void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, diff --git a/backends/mediatek/runtime/include/api/NeuronAdapter.h b/backends/mediatek/runtime/include/api/NeuronAdapter.h deleted file mode 100644 index 3a4af8299b0..00000000000 --- a/backends/mediatek/runtime/include/api/NeuronAdapter.h +++ /dev/null @@ -1,2385 +0,0 @@ -/* Copyright Statement: - * - * This software/firmware and related documentation ("MediaTek Software") are - * protected under relevant copyright laws. The information contained herein - * is confidential and proprietary to MediaTek Inc. and/or its licensors. - * Without the prior written permission of MediaTek inc. 
and/or its licensors, - * any reproduction, modification, use or disclosure of MediaTek Software, - * and information contained herein, in whole or in part, shall be strictly - * prohibited. - */ -/* MediaTek Inc. (C) 2020. All rights reserved. - * - * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES - * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") - * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON - * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. - * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE - * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR - * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH - * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY - * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY - * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK - * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO - * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN - * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND - * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER - * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT - * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER - * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. - * - * The following software/firmware and/or related documentation ("MediaTek - * Software") have been modified by MediaTek Inc. All revisions are subject to - * any receiver's applicable license agreements with MediaTek Inc. - */ - -/** - * @file NeuronAdapter.h - */ - -#pragma once - -#ifdef __ANDROID__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wnullability-extension" -#include -#pragma clang diagnostic pop -#endif - -#include -#include -#include - -__BEGIN_DECLS - -/** - * NeuronModel is an opaque type that contains a description of the mathematical - * operations that constitute the model. - */ -typedef struct NeuronModel NeuronModel; - -/** - * NeuronCompilation is an opaque type that can be used to compile a machine - * learning model. - */ -typedef struct NeuronCompilation NeuronCompilation; - -/** - * NeuronExecution is an opaque type that can be used to apply a machine - * learning model to a set of inputs. - */ -typedef struct NeuronExecution NeuronExecution; - -/** - * NeuronDevice is an opaque type that represents a device. - * - * This type is used to query basic properties and supported operations of the - * corresponding device, and control which device(s) a model is to be run on. - * - * Available since 4.1.0 - */ -typedef struct NeuronDevice NeuronDevice; - -/** - * This type is used to represent shared memory, memory mapped files, and - * similar memories. - * - * It is the application's responsibility to ensure that there are no uses of - * the memory after calling NeuronMemory_free. This includes the execution which - * references this memory because of a call to - * NeuronExecution_setInputFromMemory or NeuronExecution_setOutputFromMemory. 
- * - * Available since 4.1.0 - */ -typedef struct NeuronMemory NeuronMemory; - -/** - * NeuronEvent is an opaque type that represents an event - * that will be signaled once an execution completes. - * - * Available since 5.0.0 - */ -typedef struct NeuronEvent NeuronEvent; - -/** - * Result codes. - */ -typedef enum { - NEURON_NO_ERROR = 0, - NEURON_OUT_OF_MEMORY = 1, - NEURON_INCOMPLETE = 2, - NEURON_UNEXPECTED_NULL = 3, - NEURON_BAD_DATA = 4, - NEURON_OP_FAILED = 5, - NEURON_UNMAPPABLE = 6, - NEURON_BAD_STATE = 7, - NEURON_BAD_VERSION = 8, - - // Available since 5.0.0 - NEURON_OUTPUT_INSUFFICIENT_SIZE = 9, - NEURON_UNAVAILABLE_DEVICE = 10, - NEURON_MISSED_DEADLINE_TRANSIENT = 11, - NEURON_MISSED_DEADLINE_PERSISTENT = 12, - NEURON_RESOURCE_EXHAUSTED_TRANSIENT = 13, - NEURON_RESOURCE_EXHAUSTED_PERSISTENT = 14, - NEURON_DEAD_OBJECT = 15, -} NeuronAdapterResultCode; - -/** - * Operand values with size in bytes that are smaller or equal to this will be - * immediately copied into the model. - */ -enum { NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES = 128 }; - -/** - * Size of the cache token, in bytes, required from the application. - */ -enum { NEURON_BYTE_SIZE_OF_CACHE_TOKEN = 32 }; - -/** - * Operand types. - * The type of operands that can be added to a model. - * - * Some notes on quantized tensors - * - *

NEURON_TENSOR_QUANT8_ASYMM - *

Attached to this tensor are two numbers that can be used to convert the 8 - * bit integer to the real value and vice versa. These two numbers are: - * - scale: a 32 bit floating point value greater than zero. - * - zeroPoint: a 32 bit integer, in range [0, 255]. - *

The formula is: real_value = (integer_value - zeroPoint) * scale. - * - *

NEURON_TENSOR_QUANT16_SYMM - *

Attached to this tensor is a number representing real value scale that is - * used to convert the 16 bit number to a real value in the following way: - * realValue = integerValue * scale. scale is a 32 bit floating point with value - * greater than zero. - * - *

NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL - *

This tensor is associated with additional fields that can be used to - * convert the 8 bit signed integer to the real value and vice versa. These - * fields are: - * - channelDim: a 32 bit unsigned integer indicating channel dimension. - * - scales: an array of positive 32 bit floating point values. - *

The size of the scales array must be equal to dimensions[channelDim]. - * NeuronModel_setOperandSymmPerChannelQuantParams must be used to set the - * parameters for an Operand of this type. The channel dimension of this tensor - * must not be unknown (dimensions[channelDim] != 0). The formula is: - * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] where C is an - * index in the Channel dimension. - * - *

NEURON_TENSOR_QUANT16_ASYMM - *

Attached to this tensor are two numbers that can be used to convert the 16 - * bit integer to the real value and vice versa. These two numbers are: - * - scale: a 32 bit floating point value greater than zero. - * - zeroPoint: a 32 bit integer, in range [0, 65535]. - *

The formula is: real_value = (integer_value - zeroPoint) * scale. - * - *

NEURON_TENSOR_QUANT8_SYMM - *

Attached to this tensor is a number representing real value scale that is - * used to convert the 8 bit number to a real value in the following way: - * realValue = integerValue * scale. scale is a 32 bit floating point with value - * greater than zero. - * - *

NEURON_TENSOR_QUANT8_ASYMM_SIGNED - *

Attached to this tensor are two numbers that can be used to convert the 8 - * bit integer to the real value and vice versa. These two numbers are: - * - scale: a 32 bit floating point value greater than zero. - * - zeroPoint: a 32 bit integer, in range [-128, 127]. - *

The formula is: real_value = (integer_value - zeroPoint) * scale. - */ -enum { - /** A 32 bit floating point scalar value. */ - NEURON_FLOAT32 = 0, - /** A signed 32 bit integer scalar value. */ - NEURON_INT32 = 1, - /** An unsigned 32 bit integer scalar value. */ - NEURON_UINT32 = 2, - /** A tensor of 32 bit floating point values. */ - NEURON_TENSOR_FLOAT32 = 3, - /** A tensor of 32 bit integer values. */ - NEURON_TENSOR_INT32 = 4, - /** A tensor of 8 bit integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_ASYMM = 5, - /** An 8 bit boolean scalar value. */ - NEURON_BOOL = 6, - /** A tensor of 16 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT16_SYMM = 7, - /** A tensor of IEEE 754 16 bit floating point values. */ - NEURON_TENSOR_FLOAT16 = 8, - /** A tensor of 8 bit boolean values. */ - NEURON_TENSOR_BOOL8 = 9, - /** An IEEE 754 16 bit floating point scalar value. */ - NEURON_FLOAT16 = 10, - /** A tensor of 8 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11, - /** A tensor of 16 bit unsigned integers that represent real numbers. */ - NEURON_TENSOR_QUANT16_ASYMM = 12, - /** A tensor of 8 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_SYMM = 13, - /** A tensor of 8 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_ASYMM_SIGNED = 14, - /** A reference to a model. */ - NEURON_MODEL = 15, - /** Extended data type - tensor uint32 */ - NEURON_EXT_TENSOR_UINT32 = 9001, - /** Extended data type -A tensor of 8 bit unsigned integers that represent - real numbers. */ - NEURON_EXT_TENSOR_QUANT8_ASYMM_PER_CHANNEL = 9002, - /** Extended data type -A tensor of 4 bit unsigned integers that represent - real numbers. */ - NEURON_EXT_TENSOR_QUANT4_ASYMM = 9003, - /** Extended data type -A tensor of 4 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT4_ASYMM_SIGNED = 9004, - /** Extended data type -A tensor of 4 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT4_SYMM = 9005, - /** Extended data type -A tensor of 16 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT16_ASYMM_SIGNED = 9006, - /** Extended data type -A raw tensor. */ - NEURON_EXT_TENSOR_RAW = 9007, - /** Extended data type -A tensor of 8 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT8_ASYMM_SIGNED_PER_CHANNEL = 9008, -}; - -/** - * NeuronOperandType describes the type of an operand. - * This structure is used to describe both scalars and tensors. - */ -typedef struct NeuronOperandType { - /** The data type, e.g NEURON_INT8. */ - int32_t type; - /** The number of dimensions. It should be 0 for scalars. */ - uint32_t dimensionCount; - /** The dimensions of the tensor. It should be nullptr for scalars. */ - const uint32_t* dimensions; - /** - * These two fields are only used for quantized tensors. - * They should be zero for scalars and non-fixed point tensors. - * The dequantized value of each entry is (value - zeroPoint) * scale. - */ - float scale; - /** Only used with scale for quantized tensors */ - int32_t zeroPoint; -} NeuronOperandType; - -/** - * Parameters for NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL operand. - */ -typedef struct NeuronSymmPerChannelQuantParams { - /** The index of the channel dimension. */ - uint32_t channelDim; - /** The size of the scale array. Should be equal to dimension[channelDim] of - * the Operand. 
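The asymmetric schemes described above all share the mapping real_value = (integer_value - zeroPoint) * scale. A small worked example for the QUANT8_ASYMM case, with illustrative parameters that are not taken from the header:

    scale, zero_point = 0.5, 10  # illustrative values only

    def dequantize(q: int) -> float:
        return (q - zero_point) * scale  # real = (integer - zeroPoint) * scale

    def quantize(x: float) -> int:
        q = round(x / scale) + zero_point
        return max(0, min(255, q))  # clamp to the QUANT8_ASYMM range [0, 255]

    assert dequantize(quantize(2.0)) == 2.0            # 2.0 -> 14 -> (14 - 10) * 0.5
    print(quantize(-7.3), dequantize(quantize(-7.3)))  # clamps to 0, dequantizes to -5.0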
*/ - uint32_t scaleCount; - /** The array of scaling values for each channel. Each value must be greater - * than zero. */ - const float* scales; -} NeuronSymmPerChannelQuantParams; - -/** - * Parameters for NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL and - * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL operand. - */ -typedef struct NeuronPerChannelQuantParams { - /** The index of the channel dimension. */ - uint32_t channelDim; - /** The size of the scale array. Should be equal to dimension[channelDim] of - * the Operand. */ - uint32_t scaleCount; - /** The array of scaling values for each channel. Each value must be greater - * than zero. */ - const float* scales; - /** The size of the zeroPoints. Should be equal to dimension[channelDim] of - * the Operand. */ - uint32_t zeroPointCount; - /** The array of zero point values for each channel. */ - const int32_t* zeroPoints; -} NeuronPerChannelQuantParams; - -/** - * Operation Types - * - * Supported operations are listed with available versions. See - * Neuron_getVersion for querying version number. - * - * Attempting to compile models with operations marked as not available - * will get a compilation failure. - * - * Refer to the operation support status of each hardware platform. - * Attempting to compile models with operations supported by this library but - * not supported by the underlying hardware platform will get a compilation - * failure too. - * - * Compatible NNAPI levels are also listed. - */ -typedef enum { - NEURON_ADD = 0, ///< Available since 4.1.0. NNAPI level 30. - NEURON_AVERAGE_POOL_2D = 1, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CONCATENATION = 2, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CONV_2D = 3, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DEPTHWISE_CONV_2D = 4, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DEPTH_TO_SPACE = 5, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DEQUANTIZE = 6, ///< Available since 4.1.0. NNAPI level 30. - NEURON_EMBEDDING_LOOKUP = 7, ///< Not available. - NEURON_FLOOR = 8, ///< Available since 4.1.0. NNAPI level 30. - NEURON_FULLY_CONNECTED = 9, ///< Available since 4.1.0. NNAPI level 30. - NEURON_HASHTABLE_LOOKUP = 10, ///< Not available. - NEURON_L2_NORMALIZATION = 11, ///< Available since 4.1.0. NNAPI level 30. - NEURON_L2_POOL_2D = 12, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOCAL_RESPONSE_NORMALIZATION = 13, ///< Not available. - NEURON_LOGISTIC = 14, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LSH_PROJECTION = 15, ///< Not available. - NEURON_LSTM = 16, ///< Not available. - NEURON_MAX_POOL_2D = 17, ///< Available since 4.1.0. NNAPI level 30. - NEURON_MUL = 18, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RELU = 19, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RELU1 = 20, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RELU6 = 21, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RESHAPE = 22, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RESIZE_BILINEAR = 23, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RNN = 24, ///< Not available. - NEURON_SOFTMAX = 25, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SPACE_TO_DEPTH = 26, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SVDF = 27, ///< Not available. - NEURON_TANH = 28, ///< Available since 4.1.0. NNAPI level 30. - NEURON_BATCH_TO_SPACE_ND = 29, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DIV = 30, ///< Available since 4.1.0. NNAPI level 30. - NEURON_MEAN = 31, ///< Available since 4.1.0. NNAPI level 30. 
- NEURON_PAD = 32, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SPACE_TO_BATCH_ND = 33, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SQUEEZE = 34, ///< Available since 4.1.0. NNAPI level 30. - NEURON_STRIDED_SLICE = 35, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SUB = 36, ///< Available since 4.1.0. NNAPI level 30. - NEURON_TRANSPOSE = 37, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ABS = 38, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ARGMAX = 39, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ARGMIN = 40, ///< Available since 4.1.0. NNAPI level 30. - NEURON_AXIS_ALIGNED_BBOX_TRANSFORM = - 41, ///< Available since 4.1.0. NNAPI level 30. - NEURON_BIDIRECTIONAL_SEQUENCE_LSTM = 42, ///< Not available. - NEURON_BIDIRECTIONAL_SEQUENCE_RNN = 43, ///< Not available. - NEURON_BOX_WITH_NMS_LIMIT = 44, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CAST = 45, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CHANNEL_SHUFFLE = 46, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DETECTION_POSTPROCESSING = 47, ///< Not available. - NEURON_EQUAL = 48, ///< Available since 4.1.0. NNAPI level 30. - NEURON_EXP = 49, ///< Available since 4.1.0. NNAPI level 30. - NEURON_EXPAND_DIMS = 50, ///< Available since 4.1.0. NNAPI level 30. - NEURON_GATHER = 51, ///< Available since 4.1.0. NNAPI level 30. - NEURON_GENERATE_PROPOSALS = 52, ///< Not available. - NEURON_GREATER = 53, ///< Available since 4.1.0. NNAPI level 30. - NEURON_GREATER_EQUAL = 54, ///< Available since 4.1.0. NNAPI level 30. - NEURON_GROUPED_CONV_2D = 55, ///< Available since 4.1.0. NNAPI level 30. - NEURON_HEATMAP_MAX_KEYPOINT = 56, ///< Available since 4.1.0. NNAPI level 30. - NEURON_INSTANCE_NORMALIZATION = - 57, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LESS = 58, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LESS_EQUAL = 59, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOG = 60, ///< Not available. - NEURON_LOGICAL_AND = 61, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOGICAL_NOT = 62, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOGICAL_OR = 63, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOG_SOFTMAX = 64, ///< Not available. - NEURON_MAXIMUM = 65, ///< Available since 4.1.0. NNAPI level 30. - NEURON_MINIMUM = 66, ///< Available since 4.1.0. NNAPI level 30. - NEURON_NEG = 67, ///< Available since 4.1.0. NNAPI level 30. - NEURON_NOT_EQUAL = 68, ///< Available since 4.1.0. NNAPI level 30. - NEURON_PAD_V2 = 69, ///< Available since 4.1.0. NNAPI level 30. - NEURON_POW = 70, ///< Available since 4.1.0. NNAPI level 30. - NEURON_PRELU = 71, ///< Available since 4.1.0. NNAPI level 30. - NEURON_QUANTIZE = 72, ///< Available since 4.1.0. NNAPI level 30. - NEURON_QUANTIZED_16BIT_LSTM = 73, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RANDOM_MULTINOMIAL = 74, ///< Not available. - NEURON_REDUCE_ALL = 75, ///< Available since 4.1.0. NNAPI level 30. - NEURON_REDUCE_ANY = 76, ///< Available since 4.1.0. NNAPI level 30. - NEURON_REDUCE_MAX = 77, ///< Available since 4.1.0. NNAPI level 30. - NEURON_REDUCE_MIN = 78, ///< Available since 4.1.0. NNAPI level 30. - NEURON_REDUCE_PROD = 79, ///< Not available. - NEURON_REDUCE_SUM = 80, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ROI_ALIGN = 81, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ROI_POOLING = 82, ///< Not available. - NEURON_RSQRT = 83, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SELECT = 84, ///< Available since 4.1.0. NNAPI level 30. 
- NEURON_SIN = 85, ///< Not available. - NEURON_SLICE = 86, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SPLIT = 87, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SQRT = 88, ///< Available since 4.1.0. NNAPI level 30. - NEURON_TILE = 89, ///< Available since 4.1.0. NNAPI level 30. - NEURON_TOPK_V2 = 90, ///< Available since 4.1.0. NNAPI level 30. - NEURON_TRANSPOSE_CONV_2D = 91, ///< Available since 4.1.0. NNAPI level 30. - NEURON_UNIDIRECTIONAL_SEQUENCE_LSTM = 92, ///< Not available. - NEURON_UNIDIRECTIONAL_SEQUENCE_RNN = 93, ///< Not available. - NEURON_RESIZE_NEAREST_NEIGHBOR = - 94, ///< Available since 4.1.0. NNAPI level 30. - NEURON_QUANTIZED_LSTM = 95, ///< Not available. - NEURON_IF = 96, ///< Available since 4.1.0. NNAPI level 30. - NEURON_WHILE = 97, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ELU = 98, ///< Not available. - NEURON_HARD_SWISH = 99, ///< Available since 4.1.0. NNAPI level 30. - NEURON_FILL = 100, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RANK = 101, ///< Not available. - NEURON_BATCH_MATMUL = 102, ///< Available since 5.1.2. NNAPI FL6. - NEURON_PACK = 103, ///< Not available. - NEURON_MIRROR_PAD = 104, ///< Not available. - NEURON_MIRROR_REVERSE = 105, ///< Not available. - /** - * Decompress HyFBC to YUV420 frame, support both YUV420_8BITS and - * YUV420_10BITS formats. HyFBC (Hybrid Frame Buffer Compression) is a - * compressed format used by video decoder (VDEC). This format uses YUV420 to - * compress. - * - * For input part, need to set two inputs with different shape, representing Y - * and UV plane respectively. The same HyFBC data will be used for both - * inputs. Similarly, the output part also needs to be set to two, - * representing Y and UV plane respectively. - * - * The shape of the two inputs/ outputs (inputY, inputUV, outputY, outputUV) - * depends on the original images' shape ([batches, height, width, channels]). - * Both height and width shold follow 64 alignment rule. For example, if - * original height is 480, its 64 alignment should be 512. For Y plane, - * channel size should be 1; for UV plane, channel size should be 2. Besides, - * the height and width of UV plane should be half of Y's height and width. - * Example: - * - * original_img.shape = [1, 384, 640, 3] - * inputY.shape = [1, 384, 640, 1] - * inputUV.shape = [1, 192, 320, 2] - * outputY.shape = [1, 384, 640, 1] - * outputUV.shape = [1, 192, 320, 2] - * - * Supported tensor {@link OperandCode}: - * * {@link NEURON_EXT_TENSOR_RAW} (for inputY, inputUV) - * * {@link NEURON_TENSOR_QUANT8_ASYMM} (for outputY, outputUV) - * * {@link NEURON_TENSOR_QUANT16_ASYMM} (for outputY, outputUV) - * Note: - * If image mode is YUV420_8BITS, use NEURON_TENSOR_QUANT8_ASYMM; if mode is - * YUV420_10BITS, use NEURON_TENSOR_QUANT16_ASYMM. - * - * Tensor rank: both input and output require rank 4, with "NHWC" data layout. - * - * Inputs: - * * 0: inputY, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor. - * * 1: inputUV, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor. - * * 2: YHeaderAlignment, an {@link NEURON_INT32} scalar, specifying - * the header alignment in Hyfbc format. - * * 3: UVHeaderAlignment, an {@link NEURON_INT32} scalar, specifying - * the header alignment in Hyfbc format. - * * 4: xAlign, an {@link NEURON_INT32} scalar, specifying the frame - * width alignment of video decoder. - * * 5: yAlign, an {@link NEURON_INT32} scalar, specifying the frame - * height alignment of video decoder. 
- * * 6: xOffset, an {@link NEURON_INT32} scalar, specifying the frame - * width offset of video decoder. - * * 7: yOffset, an {@link NEURON_INT32} scalar, specifying the frame - * height offset of video decoder. - * * 8: mode, an {@link NEURON_INT32} scalar. Set to 0 for - * YUV420_8BITS. Set to 1 for YUV420_10BITS. Note that 8b, 10b here means the - * compressed bit width in Hyfbc frame, where the decompressed YUV420 is 8b - * for Hyfbc_8b, and YUV420 is 16b for Hyfbc_10b. - * * 9: outPitchN, an {@link NEURON_INT32} scalar, specifying the - * YUV420 N-axis pitch. Must be set to 1, because only a single batch is - * supported for HyfbcDecompress. - * * 10: outPitchH, an {@link NEURON_INT32} scalar, specifying the - * YUV420 H-axis pitch. Set to the original compressed image height with video - * codec alignment. - * * 11: outPitchW, an {@link NEURON_INT32} scalar, specifying the - * YUV420 W-axis pitch. Set to the original compressed image width with video - * codec alignment. - * * 12: outPitchC, an {@link NEURON_INT32} scalar, specifying the - * YUV420 C-axis pitch. Set to 1 for interleaved YUV420. - * - * Outputs: - * * 0: output Y, a 4-D tensor. Tensor type can be either {@link - * NEURON_TENSOR_QUANT8_ASYMM} or {@link - * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. - * * 1: output UV, a 4-D tensor. Tensor type can be either {@link - * NEURON_TENSOR_QUANT8_ASYMM} or {@link - * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. - * - * Available since NeuroPilot 7.0.0. - */ - NEURON_HYFBCTOYUV420 = 106, - /** - * Compress YUV420 to AFBC frame, support both YUV420_8BITS and - * YUV420_10BITS formats. AFBC (Arm Frame Buffer Compression) is a lossless - * compressed image format, created by ARM to reduce the size of images. - * - * For input part, need to set two inputs with different shape, representing Y - * and UV plane respectively. For output part, need to set one output for - * AFBC. - * - * The shape of the two inputs (inputY, inputUV) and output (AFBC) - * depends on the original images' shape ([batches, height, width, channels]). - * Both height and width shold follow 64 alignment rule. For example, if - * original height is 480, its 64 alignment should be 512. For Y plane, - * channel size should be 1; for UV plane, channel size should be 2. Besides, - * the height and width of UV plane should be half of Y's height and width. - * For AFBC output, its height shoud be 3/2 of Y's height, and its width - * equals to Y's width. Example: - * - * original_img.shape = [1, 384, 640, 3] - * inputY.shape = [1, 384, 640, 1] - * inputUV.shape = [1, 192, 320, 2] - * output.shape = [1, 576, 640, 1] - * - * Supported tensor {@link OperandCode}: - * * {@link NEURON_EXT_TENSOR_RAW} (for output) - * * {@link NEURON_TENSOR_QUANT8_ASYMM} (for inputY, inputUV) - * * {@link NEURON_TENSOR_QUANT16_ASYMM} (for inputY, inputUV) - * Note: - * If image mode is YUV420_8BITS, use NEURON_TENSOR_QUANT8_ASYMM; if mode is - * YUV420_10BITS, use NEURON_TENSOR_QUANT16_ASYMM. - * - * Tensor rank: both input and output require rank 4, with "NHWC" data layout. - * - * Inputs: - * * 0: inputY, a 4-D tensor. Tensor type can be either {@link - * NEURON_TENSOR_QUANT8_ASYMM} or {@link - * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. - * * 1: inputUV, a 4-D tensor. Tensor type can be either {@link - * NEURON_TENSOR_QUANT8_ASYMM} or {@link - * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. 
- * * 2: HeaderAlignment, an {@link NEURON_INT32} scalar, specifying - * the header alignment in AFBC format. - * * 3: xAlign, an {@link NEURON_INT32} scalar, specifying the frame - * width alignment of AFBC format. - * * 4: yAlign, an {@link NEURON_INT32} scalar, specifying the frame - * height alignment of AFBC format. - * * 5: xOffset, an {@link NEURON_INT32} scalar, specifying the frame - * width offset of AFBC format. - * * 6: yOffset, an {@link NEURON_INT32} scalar, specifying the frame - * height offset of AFBC format. - * * 7: mode, an {@link NEURON_INT32} scalar. Set to 0 for - * YUV420_8BITS. Set to 1 for YUV420_10BITS. Note that 8b, 10b here means the - * compressed bit width in AFBC frame, where the YUV420 must be 8b for - * AFBC_8b, and must be 16b for AFBC_10b. - * * 8: inPitchN, an {@link NEURON_INT32} scalar, specifying the - * YUV420 N-axis pitch. Must be set to 1, because only a single batch is - * supported for AfbcCompress. - * * 9: inPitchH, an {@link NEURON_INT32} scalar, specifying the - * YUV420 H-axis pitch. Set to the expected compressed image height. - * * 10: inPitchW, an {@link NEURON_INT32} scalar, specifying the - * YUV420 W-axis pitch. Set to the expected compressed image height. - * * 11: inPitchC, an {@link NEURON_INT32} scalar, specifying the - * YUV420 C-axis pitch. Set to 1 for interleaved YUV420. - * - * Outputs: - * * 0: output, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor. - * - * Available since NeuroPilot 7.0.0. - */ - NEURON_YUV420TOAFBC = 107, - NEURON_NUMBER_OF_OPERATIONS, -} NeuronOperationType; - -/** - * Fused activation function types. - */ -typedef enum { - // NO fused activation function. - NEURON_FUSED_NONE = 0, - // Fused ReLU activation function. - NEURON_FUSED_RELU = 1, - // Fused ReLU1 activation function. - NEURON_FUSED_RELU1 = 2, - // Fused ReLU6 activation function. - NEURON_FUSED_RELU6 = 3, -} NeuronAdapterFuseCode; - -/** - * Implicit padding algorithms. - */ -typedef enum { - /** - * SAME padding. - * Padding on both ends are the "same": - * padding_to_beginning = total_padding / 2 - * padding_to_end = (total_padding + 1)/2. - * i.e., for even number of padding, padding to both ends are exactly - * the same; for odd number of padding, padding to the ending is bigger - * than the padding to the beginning by 1. - * - * total_padding is a function of input, stride and filter size. - * It could be computed as follows: - * out_size = (input + stride - 1) / stride; - * needed_input = (out_size - 1) * stride + filter_size - * total_padding = max(0, needed_input - input_size) - * The computation is the same for the horizontal and vertical directions. - */ - NEURON_PADDING_SAME = 1, - - /** - * VALID padding. - * No padding. When the input size is not evenly divisible by - * the filter size, the input at the end that could not fill - * the whole filter tile will simply be ignored. - */ - NEURON_PADDING_VALID = 2, -} NeuronAdapterPaddingCode; - -/** - * Execution preferences. - */ -typedef enum { - /* Prefer executing in a way that minimizes battery drain. */ - NEURON_PREFER_LOW_POWER = 0, - /* Prefer executing as fast as possible. (more power consumption)*/ - NEURON_PREFER_FAST_SINGLE_ANSWER = 1, - /* Prefer maximizing the throughput of successive frames */ - NEURON_PREFER_SUSTAINED_SPEED = 2, - /* Prefer executing with turbo boost. (most power consumption) */ - NEURON_PREFER_TURBO_BOOST = 3, -} NeuronAdapterPreferenceCode; - -/** - * Relative execution priority. 
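The NEURON_PADDING_SAME description above gives total_padding as a function of the input size, stride, and filter size, split unevenly between the two ends when it is odd. A short sketch of that computation with one worked case:

    def same_padding(input_size: int, stride: int, filter_size: int):
        # Mirrors the SAME-padding formula quoted above, using integer division.
        out_size = (input_size + stride - 1) // stride
        needed_input = (out_size - 1) * stride + filter_size
        total_padding = max(0, needed_input - input_size)
        return total_padding // 2, (total_padding + 1) // 2  # (begin, end)

    # A width-224 input with stride 2 and a size-3 filter needs (0, 1) padding.
    print(same_padding(224, 2, 3))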
- */ -typedef enum { - NEURON_PRIORITY_LOW = 90, - NEURON_PRIORITY_MEDIUM = 100, - NEURON_PRIORITY_HIGH = 110, - NEURON_PRIORITY_DEFAULT = NEURON_PRIORITY_MEDIUM, -} NeuronAdapterPriorityCode; - -/** - * Compiler optimization hint. - */ -typedef enum { - /** - * Normal optimization. - * Available since 4.3.1 - */ - NEURON_OPTIMIZATION_NORMAL = 0, - /** - * Reduce latency by utilizing as many APU cores as possible. - * Available since 4.3.1 - */ - NEURON_OPTIMIZATION_LOW_LATENCY = 1 << 0, - /** - * Reducing DRAM access as more as possible. - * Available since 4.4.0 - */ - NEURON_OPTIMIZATION_DEEP_FUSION = 1 << 1, - /** - * Reduce latency by using as many APU cores as possible in batch-dimension. - * (For models with batch > 1) - * Available since 4.4.0 - */ - NEURON_OPTIMIZATION_BATCH_PROCESSING = 1 << 2, - /** - * Default optimization setting. - * Available since 4.3.1 - */ - NEURON_OPTIMIZATION_DEFAULT = NEURON_OPTIMIZATION_NORMAL, -} OptimizationCode; - -/** - * CPU cache flush hint. - */ -typedef enum { - /** - * Sync input buffer and invalidate output buffer. - * Available since 5.0.1 - */ - NEURON_CACHE_FLUSH_ENABLE_ALL = 0, - /** - * Disable sync input buffer. - * Available since 5.0.1 - */ - NEURON_CACHE_FLUSH_DISABLE_SYNC_INPUT = 1 << 0, - /** - * Disable invalidate output buffer. - * Available since 5.0.1 - */ - NEURON_CACHE_FLUSH_DISABLE_INVALIDATE_OUTPUT = 1 << 1, - /** - * Default cache flush setting. - * Available since 5.0.1 - */ - NEURON_CACHE_FLUSH_DEFAULT = NEURON_CACHE_FLUSH_ENABLE_ALL, -} CacheFlushCode; - -/** - * Compilation Type. - */ -typedef enum { - /* Normal Compilation Available since 7.0.0 */ - COMPILATION_TYPE_NORMAL = 0, - /* @deprecate */ - COMPILATION_TYPE_DEBUG_PLUS = 1, - /* Batched Execution: Set input/output from memory every time. - * Available since 7.0.0 - */ - COMPILATION_TYPE_BATCHED = 2, - /* One compilation with multi-executions could be created. - * Available since 7.0.0 - */ - COMPILATION_TYPE_MULTI_EXECUTIONS = 3, - /* Batched Execution: Set input/output from memory 1st time and memcpy next - * time. Available since 7.0.1 - */ - COMPILATION_TYPE_EXECUTION_CONTROLLER = 4, -} CompilationType; - -/** - * Supported Feature - */ -typedef enum { - NEURON_FEATURE_NONE = 0, - NEURON_THROUGHPUT_MODE = 1, -} NeuronFeatureType; - -/** - * The structure to represent the neuron version. - */ -typedef struct { - uint8_t major; ///< major version - uint8_t minor; ///< minor version - uint8_t patch; ///< patch version -} NeuronRuntimeVersion; - -/** - * Get the version of Neuron runtime library. - * - * @param version the version of Neuron runtime library. - * @return NEURON_NO_ERROR - */ -int Neuron_getVersion(NeuronRuntimeVersion* version); - -/** - * Get the supported status of feature. - * - * Available since 7.0.0 - * - * @param type input feature @NeuronFeatureType to check supported or not - * @param supported return the supported status - * @return NEURON_NO_ERROR if successful. - */ -int Neuron_getFeatureSupportedStatus(NeuronFeatureType type, bool* supported); - -/** - * Get the size of L1 memory in APU. - * - * Available since 4.3.0 - * - * @param sizeKb L1 memory size in KB - * @return NEURON_NO_ERROR if successful. - */ -int Neuron_getL1MemorySizeKb(uint32_t* sizeKb); - -/** - * Creates a shared memory object from a file descriptor. - * - * For ion descriptor, application should create the ion memory and descriptor - * first and then use it in this function. - * - * Available since 4.1.0 Only supports ion fd. 
- * - * @param size The requested size in bytes. Must not be larger than the file - * size. - * @protect The desired memory protection for the mapping. It is either - * PROT_NONE or the bitwise OR of one or more of the following flags: PROT_READ, - * PROT_WRITE. - * @fd The requested file descriptor. The file descriptor has to be mmap-able. - * @offset The offset to the beginning of the file of the area to map. - * @memory The memory object to be created. Set to NULL if unsuccessful. - */ -int NeuronMemory_createFromFd( - size_t size, - int protect, - int fd, - size_t offset, - NeuronMemory** memory); - -#ifdef __ANDROID__ -/** - * Creates a shared memory object from an AHardwareBuffer handle. - * - * We only support AHardwareBuffer with format AHARDWAREBUFFER_FORMAT_BLOB and - * it can only be used for Model inputs and outputs. - * - * The AHardwareBuffer with AHARDWAREBUFFER_FORMAT_BLOB format can be used the - * same way as shared memory created from a file handle. See NeuronMemory for - * description on how to use this shared memory. - * - * The provided AHardwareBuffer must outlive the NeuronMemory object. - * - * Available since 5.0.0 - * - * @param ahwb The AHardwareBuffer handle. - * @param memory The memory object to be created. - * Set to NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if the request completed normally. - * - */ -int NeuronMemory_createFromAHardwareBuffer( - const AHardwareBuffer* ahwb, - NeuronMemory** memory); - -#else // __ANDROID__ - -/** - * Not supported at non-android platform - * - * @return NEURON_BAD_STATE - */ -int NeuronMemory_createFromAHardwareBuffer(); - -#endif - -/** - * Delete a memory object. - * - * For ion memory, this function cleans up the internal resource associated with - * this memory. Applications should clean up the allocated ion memory after this - * function. - * - * Available since 4.1.0 - */ -void NeuronMemory_free(NeuronMemory* memory); - -/** - * Create an empty NeuronModel. The model should be constructed with calls to - * NeuronModel_addOperation and NeuronModel_addOperand. - * - * Available since 4.1.0 - * - * @param model The NeuronModel to be created. Set to NULL if unsuccessful. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_create(NeuronModel** model); - -/** - * Destroy a model. The model need not have been finished by a call to - * NeuronModel_finish. - * - * Available since 4.1.0 - * - * @param model The model to be destroyed. - */ -void NeuronModel_free(NeuronModel* model); - -/** - * Indicate that we have finished modifying a model. Required before calling - * NeuronCompilation_compile. - * - * Available since 4.1.0 - * - * @param model The model to be finished. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_finish(NeuronModel* model); - -/** - * Add an operand to a model. The order in which the operands are added is - * important. The first one added to a model will have the index value 0, the - * second 1, etc. These indexes are used as operand identifiers in - * NeuronModel_addOperation. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param type The NeuronOperandType that describes the shape of the operand. - * Neither the NeuronOperandType nor the dimensions it points to need to outlive - * the call to NeuronModel_addOperand. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_addOperand(NeuronModel* model, const NeuronOperandType* type); - -/** - * Sets an operand to a constant value. 
- * Values of length smaller or equal to - * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES are immediately copied into the - * model. For values of length greater than - * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES, a pointer to the buffer is - * stored within the model. The application must not change the content of this - * region until all executions using this model have completed. As the data may - * be copied during processing, modifying the data after this call yields - * undefined results. - * - * Attempting to modify a model once NeuronModel_finish has been called will - * return an error. - * - * A special notice on the buffer lifetime when the length is greater than - * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES. The provided buffer must - * outlive the compilation of this model. I.e. user must keep the buffer - * unchanged until NeuronCompilation_finish of this model. This is an internal - * optimization comparing to NNAPI. In NNAPI, NN runtime will copy the buffer to - * a shared memory between NN runtime and NNAPI HIDL service during - * ANNModel_finish, and it will be copied again to the compiled result during - * ANNCompilation_finish. In Neuron Adapter, there will be only one copying - * during NeuronCompilaiton_finish, so it is required to keep the buffer alive - * until NeuronCompilaiton_finish returned. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param buffer A pointer to the data to use. - * @param length The size in bytes of the data value. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_setOperandValue( - NeuronModel* model, - int32_t index, - const void* buffer, - size_t length); -/** - * Sets an operand to a value that is a reference to another NeuronModel. - * - * The referenced model must already have been finished by a call to - * NeuronModel_finish. - * - * The NeuronModel_relaxComputationFloat32toFloat16 setting of referenced models - * is overridden by that setting of the main model of a compilation. - * - * The referenced model must outlive the model referring to it. - * - * Attempting to modify a model once NeuronModel_finish has been called will - * return an error. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param value The model to be referenced. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_setOperandValueFromModel( - NeuronModel* model, - int32_t index, - const NeuronModel* value); - -/** - * Sets an operand's per channel quantization parameters - * Sets parameters required by a tensor of type - * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL This function must be called for every - * tensor of type NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL before calling - * NeuronModel_finish - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param channelQuant The per channel quantization parameters for the operand. - * No memory in this struct needs to outlive the call to this function. - * - * @return NEURON_NO_ERROR if successful. 
- */ -int NeuronModel_setOperandSymmPerChannelQuantParams( - NeuronModel* model, - int32_t index, - const NeuronSymmPerChannelQuantParams* channelQuant); - -/** - * Sets an operand's per channel quantization parameters - * Sets parameters required by a tensor of type - * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL or - * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL. - * This function must be called for every tensor of type - * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL or - * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL before calling NeuronModel_finish. - * - * Available since 6.0.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param channelQuant The per channel quantization parameters(include - * per-channel offset) for the operand. No memory in this struct needs to - * outlive the call to this function. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_setOperandPerChannelQuantParams( - NeuronModel* model, - int32_t index, - const NeuronPerChannelQuantParams* channelQuant); - -/** - * Add an operation to a model. - * The operands specified by inputs and outputs must have been previously added - * by calls to NeuronModel_addOperand. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param type The NeuronOperationType of the operation. - * @param inputCount The number of entries in the inputs array. - * @param inputs An array of indexes identifying each operand. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying each operand. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_addOperation( - NeuronModel* model, - NeuronOperationType type, - uint32_t inputCount, - const uint32_t* inputs, - uint32_t outputCount, - const uint32_t* outputs); - -/** - * Add an operation extension to a model. - * The operands specified by inputs and outputs must have been previously added - * by calls to NeuronModel_addOperand. User needs to specify the operation - * extension name and the desired device which will execute the operation - * extension. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param name The name of the operation extension. - * @param vendor The name of the vendor which will implement the operation - * extension. - * @param device The device which will execute the operation extension. - * @param inputCount The number of entries in the inputs array. - * @param inputs An array of indexes identifying each operand. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying each operand. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_addOperationExtension( - NeuronModel* model, - const char* name, - const char* vendor, - const NeuronDevice* device, - uint32_t inputCount, - const uint32_t* inputs, - uint32_t outputCount, - const uint32_t* outputs); - -/** - * Specfifies which operands will be the model's inputs and outputs. - * An operand cannot be used for both input and output. Doing so will return an - * error. - * - * The operands specified by inputs and outputs must have been - * previously added by calls to NeuronModel_addOperand. - * - * Attempting to modify a model once NeuronModel_finish has been - * called will return an error. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param inputCount The number of entries in the inputs array. 
- * @param inputs An array of indexes identifying the input operands. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying the output operands. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_identifyInputsAndOutputs( - NeuronModel* model, - uint32_t inputCount, - const uint32_t* inputs, - uint32_t outputCount, - const uint32_t* outputs); - -/** - * Gets the supported operations in a model. - * This function must be called after calling NeuronModel_finish - * - * Available since 4.1.0 - * - * @param model The model to be queried. - * @param supported The boolean array to be filled. True means supported. The - * size of the boolean array must be at least as large as the number of - * operations in the model. The order of elements in the supported array matches - * the order in which the corresponding operations were added to the model. - * @param operationCount number of operations in the model - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getSupportedOperations( - NeuronModel* model, - bool* supported, - uint32_t operationCount); - -/** - * Get the supported operations for a specified set of devices. - * If multiple devices are selected, the supported operation list is a union of - * supported operations of all selected devices. - * - * Available since 4.1.0 - * - * @param model The model to be queried. - * @param devices Selected devices - * @param numDevices Number of selected devices - * @param supportedOps The boolean array to be filled. True means supported. The - * size of the boolean array must be as least as large as the number of - * operations in the model. The order of elements in the supportedOps array - * matches the order in which the corresponding operations were added to the - * model. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getSupportedOperationsForDevices( - const NeuronModel* model, - const NeuronDevice* const* devices, - uint32_t numDevices, - bool* supportedOps); - -/** - * Specifies whether NEURON_TENSOR_FLOAT32 is allowed to be calculated with - * range and/or precision as low as that of the IEEE 754 16-bit floating-point - * format. By default, NEURON_TENSOR_FLOAT32 must be calculated using at least - * the range and precision of the IEEE 754 32-bit floating-point format. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param allow 'true' indicates NEURON_TENSOR_FLOAT32 may be calculated with - * range and/or precision as low as that of the IEEE 754 16-bit floating point - * format. 'false' indicates NEURON_TENSOR_FLOAT32 must be calculated using at - * least the range and precision of the IEEE 754 32-bit floating point format. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_relaxComputationFloat32toFloat16( - NeuronModel* model, - bool allow); - -/** - * Hint compiler to suppress the input data conversion, the users have to - * convert the input data into platform-expected format before inference. - * - * Available since 4.2.0 - * - * @param model The model to be modified. - * @param suppress True to suppress the input data conversion. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_suppressInputConversion(NeuronModel* model, bool suppress); - -/** - * Hint compiler to suppress the output data conversion, the users have to - * convert the output data from platform-generated format before inference. 
- * - * Available since 4.2.0 - * - * @param model The model to be modified. - * @param suppress True to suppress the output data conversion. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_suppressOutputConversion(NeuronModel* model, bool suppress); - -/** - * Restore the compiled network using user provided buffer. - * - * The restored NeuronCompilaton could be used in creating executing instance. - * The restored NeuronModel cannot be recompiled. - * - * Available since 4.3.0 - * - * @param model Restored model. - * @param compilation Restored compilation - * @param buffer User provided buffer to restore the compiled network. - * @param size Size of the user provided buffer in bytes. - * @return NEURON_NO_ERROR if compiled network is successfully copied to the - * user allocated buffer. NEURON_BAD_DATA if it fails to load the compiled - * network, this could either be the version is not matched or the data is - * corrupted. - */ -int NeuronModel_restoreFromCompiledNetwork( - NeuronModel** model, - NeuronCompilation** compilation, - const void* buffer, - const size_t size); - -/** - * Restore the compiled network using user provided buffer. - * Support multiple compilation type; choices are: COMPILATION_TYPE_BATCHED, - * COMPILATION_TYPE_EXECUTION_CONTROLLER, COMPILATION_TYPE_EXECUTION_CONTROLLER, - * and COMPILATION_TYPE_NORMAL. - * - * There are two ways to use Batched Compilation: - * 1) load from DLA. - * 2) create batched compilation directly. - * To load DLA, one should call NeuronCompilation_create and - * NeuronModel_restoreFromCompiledNetworkV2. To create directly, one should call - * NeuronCompilation_createForBatch. - * - * The restored NeuronCompilaton could be used in creating executing instance. - * The restored NeuronModel cannot be recompiled. - * - * Available since 7.0.0 - * - * @param model Restored model. - * @param compilation Restored compilation - * @param buffer User provided buffer to restore the compiled network. - * @param size Size of the user provided buffer in bytes. - * @param type Type of the compilation needed to be restored. - * @return NEURON_NO_ERROR if compiled network is successfully copied to the - * user allocated buffer. NEURON_BAD_DATA if it fails to load the compiled - * network, this could either be the version is not matched or the data is - * corrupted. - */ -int NeuronModel_restoreFromCompiledNetworkV2( - NeuronModel** model, - NeuronCompilation** compilation, - const void* buffer, - const size_t size, - const CompilationType& type); - -/** - * Set a string into model that can be used for recognition for user. - * It's only used for debug, the string can be dumped into log and make users - * check the model behavior easily. - * - * Available since 7.0.0 - * - * @param model The model to be modified. - * @param name The string, user can free buffer 'name' after calling this API. - * @return NEURON_NO_ERROR if the string is set success. NEURON_UNEXPECTED_NULL - * if the input param is nullptr. - */ -int NeuronModel_setName(NeuronModel* model, const char* name); - -/** - * Create a NeuronCompilation to compile the given model. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. 
The model must already have - * been finished by a call to NeuronModel_finish. - * - * Available since 4.1.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_create( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Create a NeuronCompilation with different purpose to compile the given model. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. The model must already have - * been finished by a call to NeuronModel_finish. - * - * Available since 7.0.1 - * - * @param model The NeuronModel to be compiled. - * @param type Type of the compilation needed to be created. - * @param options The options which used to create with compilation. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_createV2( - NeuronModel* model, - CompilationType type, - const char* options, - NeuronCompilation** compilation); - -/** - * Destroy a compilation. - * - * Available since 4.1.0 - * - * @param compilation The compilation to be destroyed. - */ -void NeuronCompilation_free(NeuronCompilation* compilation); - -/** - * Compilation is finished once NeuronCompilation_finish is invoked. Required - * before calling NeuronExecution_create. This function must only be called once - * for a given compilation. - * - * Available since 4.1.0 - * - * @param compilation The compilation to be finished. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_finish(NeuronCompilation* compilation); - -/** - * Gets the supported operations in a model with specific optimized configures. - * This function must be called before calling NeuronCompilation_finish. - * - * Available since 7.0.0 - * - * @param compilation The compilation to be queried. - * @param operationCount number of operations in the model - * @param supported The boolean array to be filled. True means supported. The - * size of the boolean array must be at least as large as the number of - * operations in the model. The order of elements in the supported array matches - * the order in which the corresponding operations were added to the model. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getSupportedOperations( - NeuronCompilation* compilation, - uint32_t operationCount, - bool* supported); - -/** - * Provides optional caching information for faster re-compilation. - * - * Available since 4.1.0 - * - * @param compilation The compilation to be cached. - * @param cacheDir The cache directory for storing and retrieving caching data. - * The user should choose a directory local to the application, and is - * responsible for managing the cache entries. - * @param token The token provided by the user to specify a model must be of - * length NEURON_BYTE_SIZE_OF_CACHE_TOKEN. The user should ensure that the token - * is unique to a model within the application. Neuron cannot detect token - * collisions; a collision will result in a failed execution or in a successful - * execution that produces incorrect output values. 
- * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setCaching( - NeuronCompilation* compilation, - const char* cacheDir, - const uint8_t* token); - -/** - * Hint compiler with the size of L1 memory, this value should not be larger - * than real platform's settings. The user can get the platform's L1 memory size - * in KB by calling Neuron_getL1MemorySizeKb. - * - * Available since 4.3.0 - * - * @param compilation The compilation to be modified. - * @param sizeKb L1 memory size in KB. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setL1MemorySizeKb( - NeuronCompilation* compilation, - uint32_t sizeKb); - -/** - * Create a NeuronCompilation to compile the given model for a specified set of - * devices. The user must handle all compilation and execution failures from the - * specified set of devices. This is in contrast to a use of - * NeuronCompilation_create, where neuron will attempt to recover from such - * failures. - * - * Available since 4.1.0 - * - * @param model The NeuronModel to be compiled. - * @param devices The set of devices. Must not contain duplicates. - * @param numDevices The number of devices in the set. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the model is - * invalid. - */ -int NeuronCompilation_createForDevices( - NeuronModel* model, - const NeuronDevice* const* devices, - uint32_t numDevices, - NeuronCompilation** compilation); - -/** - * Create a NeuronCompilation. Which can divide one graph into several subgraph - * and use the information to debug. - * - * Only be used in debug purpose, no guarantees performance and thread safe. - * - * Available since 5.0.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the model is - * invalid. - */ -int NeuronCompilation_createForDebug( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Sets the execution preference associated with this compilation. - * - * Default value of preference is PREFER_SINGLE_FAST_ANSWER - * - * Available since 4.1.0 - * - * @param compilation The compilation to be modified. - * @param preference Either NEURON_PREFER_LOW_POWER, - * NEURON_PREFER_SINGLE_FAST_ANSWER, or NEURON_PREFER_SUSTAINED_SPEED. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setPreference( - NeuronCompilation* compilation, - int32_t preference); - -/** - * Sets the execution priority associated with this compilation. - * - * Execution priorities are relative to other executions created by the same - * application (specifically same uid) for the same device. Specifically, - * priorities of executions from one application will not affect executions from - * another application. - * - * Higher priority executions may use more compute resources than lower priority - * executions, and may preempt or starve lower priority executions. - * - * Available since 4.1.0 - * - * @param compilation The compilation to be modified. - * @param priority The relative priority of the execution compared to other - * executions created by the application. Must be one of NEURON_PRIORITY_*. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setPriority(NeuronCompilation* compilation, int priority); - -/** - * Get the padded dimensional information of the specified input operand of the - * compilation. 
This function must be called after calling - * NeuronCompilation_finish. If NeuronModel_suppressInputConversion was not - * applied to the model to be compiled, the returned dimensions are the padded - * dimension after NeuronCompilation_finish to satisfy the optimization - * requirement from the underlying hardware accelerators. - * If NeuronModel_suppressInputConversion was applied to the model to be - * compiled, the returned dimensions are the same as the original dimensions - * given from user. - * - * Available since 4.2.0 - * - * @param compilation The compilation to be queried. - * @param index The index of the input operand we are querying. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param dimensions The dimension array to be filled. The size of the array - * must be exactly as large as the rank of the input operand to be queried in - * the model. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getInputPaddedDimensions( - NeuronCompilation* compilation, - int32_t index, - uint32_t* dimensions); - -/** - * Get the padded dimensional information of the specified output operand of the - * compilation. This function must be called after calling - * NeuronCompilation_finish. If NeuronModel_suppressOutputConversion was not - * applied to the model to be compiled, the returned dimensions are the padded - * dimension after NeuronCompilation_finish to satisfy the optimization - * requirement from the underlying hardware accelerators. - * If NeuronModel_suppressOutputConversion was applied to the model to be - * compiled, the returned dimensions are the same as the original dimensions - * given from user. - * - * Available since 4.2.0 - * - * @param compilation The compilation to be queried. - * @param index The index of the output operand we are querying. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param dimensions The dimension array to be filled. The size of the array - * must be exactly as large as the rank of the output operand to be queried in - * the model. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getOutputPaddedDimensions( - NeuronCompilation* compilation, - int32_t index, - uint32_t* dimensions); - -/** - * Get the expected buffer size (bytes) of the specified input operand of the - * compilation. If NeuronModel_suppressInputConversion was not applied to the - * model to be compiled, the returned size are the padded size after - * NeuronCompilation_finish to satisfy the optimization requirement from the - * underlying hardware accelerators. If NeuronModel_suppressInputConversion was - * applied to the model to be compiled, the returned size are the same as the - * original size given from user. - * - * Available since 4.2.0 - * - * @param compilation The compilation to be queried. - * @param index The index of the input operand we are querying. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param size the expected buffer size in bytes. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getInputPaddedSize( - NeuronCompilation* compilation, - int32_t index, - size_t* size); - -/** - * Get the expected buffer size (bytes) of the specified output operand of the - * compilation. 
If NeuronModel_suppressOutputConversion was not applied to the - * model to be compiled, the returned size are the padded size after - * NeuronCompilation_finish to satisfy the optimization requirement from the - * underlying hardware accelerators. If NeuronModel_suppressOutputConversion was - * applied to the model to be compiled, the returned size are the same as the - * original size given from user. - * - * Available since 4.2.0 - * - * @param compilation The compilation to be queried. - * @param index The index of the output operand we are querying. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param size the expected buffer size in bytes. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getOutputPaddedSize( - NeuronCompilation* compilation, - int32_t index, - size_t* size); - -/** - * Get the compiled network size of the compilation. - * - * This must be called after NeuronCompilation_finished and before - * NeuronExecution_create. It is not allowed to call this with a compilation - * restored from cache. - * - * Available since 4.3.0 - * - * @param compilation The compilation to be queried. - * @param size The compiled network size in bytes. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getCompiledNetworkSize( - NeuronCompilation* compilation, - size_t* size); - -/** - * Store the compiled network. - * - * Users have to allocate the buffer with the specified size before calling this - * function. - * - * This must be called after NeuronCompilation_finished and before - * NeuronExecution_create. It is not allowed to call this with a compilation - * restored from cache. - * - * Available since 4.3.0 - * - * @param compilation The compilation to be queried. - * @param buffer User allocated buffer to store the compiled network. - * @param size Size of the user allocated buffer in bytes. - * @return NEURON_NO_ERROR if compiled network is successfully copied to the - * user allocated buffer. - */ -int NeuronCompilation_storeCompiledNetwork( - NeuronCompilation* compilation, - void* buffer, - const size_t size); -/** - * Hint the compiler to apply the optimization strategy according to the user - * specified parameters. - * - * Available since 4.3.0 - * - * @param compilation The compilation to be modified. - * @param optimizationCode User specified optimization strategy. Must be one of - * NEURON_OPTIMIZATION_* or the inclusive OR value of multiple - * NEURON_OPTIMIZATION_*. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setOptimizationHint( - NeuronCompilation* compilation, - uint32_t optimizationCode); - -/** - * Hint the compiler to apply the optimization strategy according to the user - * specified arguments in a null-terminated string. - * - * Available since 4.6.0 - * - * @param compilation The compilation to be modified. - * @param optimizationString A null-terminated string to represent the user - * specified optimization strategy. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setOptimizationString( - NeuronCompilation* compilation, - const char* optimizationString); - -/** - * Only allow users' optimization string(from - * NeuronCompilation_setOptimizationString), the system won't set any compiler - * options for them. - * - * Available since 6.0.5 - * - * @param compilation The compilation to be modified. - * @param allow Allow only use user's setting or not. - * strategy. 
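/*
 * Editorial aside, not part of this diff: a minimal sketch of how the
 * compiled-network save/restore entry points removed in this hunk fit
 * together. It assumes the (removed) Neuron Adapter header and <stdlib.h>
 * are included, and that `model` was already built and finished with
 * NeuronModel_finish; only signatures documented in this header are used,
 * and error handling is reduced to early returns.
 */
static int compile_store_restore(NeuronModel* model) {
  NeuronCompilation* compilation = NULL;
  if (NeuronCompilation_create(model, &compilation) != NEURON_NO_ERROR ||
      NeuronCompilation_finish(compilation) != NEURON_NO_ERROR) {
    return -1;
  }

  /* Ask for the compiled network size, then serialize it into a user buffer. */
  size_t size = 0;
  if (NeuronCompilation_getCompiledNetworkSize(compilation, &size) != NEURON_NO_ERROR) {
    NeuronCompilation_free(compilation);
    return -1;
  }
  void* buffer = malloc(size);
  if (buffer == NULL ||
      NeuronCompilation_storeCompiledNetwork(compilation, buffer, size) != NEURON_NO_ERROR) {
    free(buffer);
    NeuronCompilation_free(compilation);
    return -1;
  }

  /* Later (possibly in another process) the stored bytes can be turned back
   * into a non-recompilable model/compilation pair. */
  NeuronModel* restored_model = NULL;
  NeuronCompilation* restored_compilation = NULL;
  int err = NeuronModel_restoreFromCompiledNetwork(
      &restored_model, &restored_compilation, buffer, size);
  /* An execution would normally be created from restored_compilation here. */

  free(buffer);
  NeuronCompilation_free(compilation);
  return err == NEURON_NO_ERROR ? 0 : -1;
}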
- * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setOnlyAllowOptimizationString( - NeuronCompilation* compilation, - bool allow); - -/** - * Get the compiler hints which are used to apply the optimization strategy - * according to the user specified arguments in a null-terminated string. - * - * Available since 6.0.5 - * - * @param compilation The compilation to be modified. - * @param optimizationString A null-terminated string to represent the user - * specified optimization strategy. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getOptimizationString( - NeuronCompilation* compilation, - const char** optimizationString); - -/** - * Hint compiler to trim the model IO alignment. - * - * Available since 4.4.8 - * - * @param compilation The compilation to be modified. - * @param enable 'true' for trimming model IO alignment. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setTrimIOAlignment( - NeuronCompilation* compilation, - bool enable); - -/** - * Hint compiler to use software dilated convolution - * - * Available since 4.4.8 - * - * @param compilation The compilation to be modified. - * @param enable 'true' indicates a hint to compiler to use software dilated - * convolution - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setSWDilatedConv( - NeuronCompilation* compilation, - bool enable); - -/** - * Create a new execution instance by calling the NeuronExecution_create - * function. The provided compilation must outlive the execution. - * - * Available since 4.1.0 - * - * @param compilation The NeuronCompilation to be evaluated. - * @param execution The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronExecution_create( - NeuronCompilation* compilation, - NeuronExecution** execution); - -/** - * Destroy an execution. - * - * Available since 4.1.0 - * - * @param execution The execution to be destroyed. - */ -void NeuronExecution_free(NeuronExecution* execution); - -/** - * Associate a user buffer with an input of the model of the NeuronExecution. - * The provided buffer must outlive the execution. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param index The index of the input argument we are setting. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param type The NeuronOperandType of the operand. Currently NeuronAdapter - * only takes NULL. - * @param buffer The buffer containing the data. - * @param length The length in bytes of the buffer. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not - * recognized or the buffer is too small for the input. - */ -int NeuronExecution_setInput( - NeuronExecution* execution, - int32_t index, - const NeuronOperandType* type, - const void* buffer, - size_t length); - -/** - * Associate a user buffer with an output of the model of the NeuronExecution. - * The provided buffer must outlive the execution. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param index The index of the output argument we are setting. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param type The NeuronOperandType of the operand. Currently NeuronAdapter - * only takes NULL. 
- * @param buffer The buffer where the data is to be written. - * @param length The length in bytes of the buffer. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not - * recognized or the buffer is too small for the output. - */ -int NeuronExecution_setOutput( - NeuronExecution* execution, - int32_t index, - const NeuronOperandType* type, - void* buffer, - size_t length); - -/** - * Associate part of a memory object with an input of the model of the - * NeuronExecution. - * - * The provided memory must outlive the execution and should not be changed - * during computation. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param index The index of the input argument we are setting. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with Neuronodel_addOperand. - * @param type The NeuronOperandType of the operand. Currently NueronAdapter - * only takes NULL. - * @param memory The memory containing the data. - * @param offset This specifies the location of the data within the memory. The - * offset is in bytes from the start of memory. - * @param length The size in bytes of the data value. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not - * recognized or the buffer is too small for the input. - */ -int NeuronExecution_setInputFromMemory( - NeuronExecution* execution, - uint32_t index, - const NeuronOperandType* type, - const NeuronMemory* memory, - size_t offset, - size_t length); - -/** - * Associate part of a memory object with an output of the model of the - * NeuronExecution. - * - * The provided memory must outlive the execution and should not be changed - * during computation. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param index The index of the output argument we are setting. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with Neuronodel_addOperand. - * @param type The NeuronOperandType of the operand. Currently NueronAdapter - * only takes NULL. - * @param memory The memory containing the data. - * @param offset This specifies the location of the data within the memory. The - * offset is in bytes from the start of memory. - * @param length The size in bytes of the data value. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not - * recognized or the buffer is too small for the input. - */ -int NeuronExecution_setOutputFromMemory( - NeuronExecution* execution, - uint32_t index, - const NeuronOperandType* type, - const NeuronMemory* memory, - size_t offset, - size_t length); - -/** - * Schedule synchronous evaluation of the execution. - * Returns once the execution has completed and the outputs are ready to be - * consumed. - * - * Available since 4.1.0 - * - * @param execution The execution to be scheduled and executed. - * - * @return NEURON_NO_ERROR if the execution completed normally. NEURON_BAD_STATE - * if the inference fails. Add two return code since 5.0.0 - * (NEURON_MISSED_DEADLINE_TRANSIENT if inference timeout, and - * NEURON_OUTPUT_INSUFFICIENT_SIZE if given outsize is not sufficient for real - * output) - * - */ -int NeuronExecution_compute(NeuronExecution* execution); - -/** - * Schedule asynchronous evaluation of the execution with dependencies. - * - * The execution will wait for all the depending events to be signaled before - * starting the evaluation. 
Once the execution has completed and the outputs - * are ready to be consumed, the returned event will be signaled. Depending on - * which devices are handling the execution, the event could be backed by a sync - * fence. Use NeuronEvent_wait to wait for that event. - * - * NeuronEvent_wait must be called to recurperate the resources used by the - * execution. - * - * If parts of the execution are scheduled on devices that do not support fenced - * execution, the function call may wait for such parts to finish before - * returning. - * - * The function will return an error if any of the events in dependencies is - * already in a bad state. After the execution is scheduled, if any of the - * events in dependencies does not complete normally, the execution will fail, - * and NeuronEvent_wait on the returned event will return an error. - * - * The function will return an error if any of the execution outputs has a - * tensor operand type that is not fully specified. - * - * @param execution The execution to be scheduled and executed. - * @param dependencies A set of depending events. The actual evaluation will not - * start until all the events are signaled. - * @param num_dependencies The number of events in the dependencies set. - * @param duration currently not used - * @param event The event that will be signaled on completion. event is set to - * NULL if there's an error. - * - * @return NEURON_NO_ERROR if the evaluation is successfully scheduled. - * - * Available since 5.0.0 - */ -int NeuronExecution_startComputeWithDependencies( - NeuronExecution* execution, - const NeuronEvent* const* dependencies, - uint32_t num_dependencies, - uint64_t duration, - NeuronEvent** event); - -/** - * Set the maximum duration of WHILE loops in the specified execution. - * - * @param execution The execution to be modified. - * @param duration The maximum amount of time in nanoseconds. - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -int NeuronExecution_setLoopTimeout( - NeuronExecution* execution, - uint64_t duration); - -/** - * Get the default timeout value for WHILE loops. - * - * @return The default timeout value in nanoseconds. - * - * Available since 5.0.0 - */ -uint64_t Neuron_getDefaultLoopTimeout(); - -/** - * Get the maximum timeout value for WHILE loops. - * - * @return The maximum timeout value in nanoseconds. - * - * Available since 5.0.0 - */ -uint64_t Neuron_getMaximumLoopTimeout(); - -/** - * Sets the execution boost hint associated with this execution. Required before - * calling NeuronExecution_compute. - * - * Execution boost is the hint for the device frequency, ranged between 0 - * (lowest) to 100 (highest). For the compilation with preference set as - * NEURON_PREFER_SUSTAINED_SPEED, scheduler guarantees that the executing boost - * value would equal to the boost value hint. - * - * On the other hand, for the compilation with preference set as - * NEURON_PREFER_LOW_POWER, scheduler would try to save power by configuring the - * executing boost value with some value that is not higher than the boost value - * hint. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param boostValue The hint for the device frequency, ranged between 0 - * (lowest) to 100 (highest). - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_setBoostHint( - NeuronExecution* execution, - uint8_t boostValue); - -/** - * Sets the execution CPU cache flush hint associated with this execution. 
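/*
 * Editorial aside, not part of this diff: a minimal synchronous-inference
 * sketch using only the execution APIs documented in this header. It assumes
 * `compilation` was produced by NeuronCompilation_finish, that the model has
 * a single input and a single output, and that `in`/`out` are caller-owned
 * buffers of the expected (padded) sizes; per the comments above, the
 * operand-type argument is passed as NULL.
 */
static int run_once(NeuronCompilation* compilation,
                    const void* in, size_t in_len,
                    void* out, size_t out_len) {
  NeuronExecution* execution = NULL;
  if (NeuronExecution_create(compilation, &execution) != NEURON_NO_ERROR) {
    return -1;
  }

  int err = NeuronExecution_setInput(execution, 0, NULL, in, in_len);
  if (err == NEURON_NO_ERROR) {
    err = NeuronExecution_setOutput(execution, 0, NULL, out, out_len);
  }
  if (err == NEURON_NO_ERROR) {
    /* Boost hint is a 0-100 device-frequency hint and must be set before compute. */
    err = NeuronExecution_setBoostHint(execution, 100);
  }
  if (err == NEURON_NO_ERROR) {
    /* Blocks until the outputs are ready to be consumed. */
    err = NeuronExecution_compute(execution);
  }

  NeuronExecution_free(execution);
  return err == NEURON_NO_ERROR ? 0 : -1;
}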
- * Required before calling NeuronExecution_setInputFromMemory and - * NeuronExecution_setOutputFromMemory. - * - * Default value of preference is NEURON_CACHE_FLUSH_ENABLE_ALL - * - * Available since 5.0.1 - * - * @param execution The execution to be modified. - * @param hint It is either NEURON_CACHE_FLUSH_ENABLE_ALL or the bitwise OR - * of one or more of the following flags: NEURON_CACHE_FLUSH_DISABLE_SYNC_INPUT, - * NEURON_CACHE_FLUSH_DISABLE_INVALIDATE_OUTPUT. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_setCacheFlushHint( - NeuronExecution* execution, - uint8_t flushHint); - -/** - * Get the dimensional information of the specified output operand of the model - * of the latest computation evaluated on {@link NeuronExecution}. - * - * This function may only be invoked when the execution is in the completed - * state. - * - * Available since 5.0.0 - * - * @param execution The execution to be queried. - * @param index The index of the output argument we are querying. It is - * an index into the lists passed to {@link - * NeuronModel_identifyInputsAndOutputs}. - * @param rank The rank of the output operand. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_getOutputOperandRank( - NeuronExecution* execution, - int32_t index, - uint32_t* rank); - -/** - * Get the dimensional information of the specified output operand of the model - * of the latest computation evaluated on {@link NeuronExecution}. The target - * output operand cannot be a scalar. - * - * This function may only be invoked when the execution is in the completed - * state. - * - * Available since 5.0.0 - * - * @param execution The execution to be queried. - * @param index The index of the output argument we are querying. It is - * an index into the lists passed to {@link - * NeuronModel_identifyInputsAndOutputs}. - * @param dimensions The dimension array to be filled. The size of the array - * must be exactly as large as the rank of the output operand to be queried in - * the model. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_getOutputOperandDimensions( - NeuronExecution* execution, - int32_t index, - uint32_t* dimensions); - -/** - * Create a NeuronCompilation which can create executions with shared static - * memory. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. The model must already have - * been finished by a call to NeuronModel_finish. - * - * Available since 7.0.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_createForBatch( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Set the size of runner pool, and create same number of runners. - * - * The execution must created by the following steps: - * NeuronCompilation_createForBatch, NeuronCompilation_finish, - * NeuronExecution_create. - * - * The execution created from this compilation has to use - * NeuronExecution_setRunnerPoolSize to create thread pool and then set a series - * of inputs & outputs into the execution. The execution will inference with the - * series of inputs. 
- * - * Available since 7.0.0 - * - * @param execution The NeuronExecution to be utilized. - * @param numRunners The number of runner need to be created. - * - * @return NEURON_NO_ERROR if successful - * @return NEURON_BAD_STATE if the compilation is not created via - * NeuronCompilation_createForBatch. - */ -int NeuronExecution_setRunnerPoolSize( - NeuronExecution* execution, - uint8_t numRunners); - -/** - * Notify the execution that all inputs / outputs have been set. - * Should be called after NeuronExecution_setInputFromMemory and - * NeuronExecution_setOutputFromMemory. - * - * The execution must created by the following steps: - * NeuronCompilation_createForBatch, NeuronCompilation_finish, - * NeuronExecution_create. - * - * Available since 7.0.0 - * - * @param execution The NeuronExecution to be utilized. - * - * @return NEURON_NO_ERROR if successful - * @return NEURON_BAD_STATE if the compilation is not created via - * NeuronCompilation_createForBatch. - */ -int NeuronExecution_setBatchDone(NeuronExecution* execution); - -/** - * Notify the execution that all inputs / outputs have been set. - * Should be called after NeuronExecution_setInputFromMemory and - * NeuronExecution_setOutputFromMemory. - * - * The execution must created by the following steps: - * 1. NeuronCompilation_createV2 with COMPILATION_TYPE_EXECUTION_CONTROLLER - * 2. NeuronCompilation_finish - * 3. NeuronExecution_create. - * or - * 1. NeuronModel_restoreFromCompiledNetworkV2 with - * COMPILATION_TYPE_EXECUTION_CONTROLLER - * 2. NeuronExecution_create. - * - * Available since 7.0.1 - * - * @param execution The NeuronExecution to be utilized. - * @param idx The index of runner to set the previous inputs and outputs. - * - * @return NEURON_NO_ERROR if successful - * @return NEURON_BAD_STATE if the compilation is not created via - * COMPILATION_TYPE_EXECUTION_CONTROLLER. - */ -int NeuronExecution_setIODone(NeuronExecution* execution, int idx); - -/** - * Create a NeuronCompilation which can create executions with shared static - * memory. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. The model must already have - * been finished by a call to NeuronModel_finish. - * - * The executions created from this compilation can be executed at the same - * time. - * - * Available since 7.0.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_createForMultiExecutions( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Set report path for debug plus. - * - * Only be used in debug purpose, the execution should be created by - * NeuronCompilation_createForDebug compilation. - * - * Available since 5.0.0 - * - * @param model The model need to be debug. - * @param path The path of execution report. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the path is empty. - */ -int NeuronDebug_setReportPath(NeuronModel* model, const char* path); - -/** - * Get the number of available devices. - * - * Available since 4.1.0 - * @param numDevices The number of devices returned. - * - * @return NEURON_NO_ERROR if successful. 
- */ -int Neuron_getDeviceCount(uint32_t* numDevices); - -/** - * Get the representation of the specified device. - * - * Available since 4.1.0 - * - * @param devIndex The index of the specified device. Must be less than the - * number of available devices. - * @param device The representation of the specified device. The same - * representation will always be returned for the specified device. - * - * @return NEURONNO_ERROR if successful. - */ -int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device); - -/** - * Get the name of the specified device. - * - * Available since 4.1.0 - * - * @param device The representation of the specified device. - * @param name The returned name of the specified device. The name will remain - * valid for the duration of the application. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronDevice_getName(const NeuronDevice* device, const char** name); - -/** - * Get the description of the specified device. - * - * Available since 5.0.0 - * - * @param device The representation of the specified device. - * @param description The returned description of the specified device. The - * description will remain valid for the duration of the application. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronDevice_getDescription( - const NeuronDevice* device, - const char** description); - -/* - * Destroys the event. - * - * See NeuronExecution for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param event The event object to be destroyed. Passing NULL is acceptable and - * results in no operation. - */ -void NeuronEvent_free(NeuronEvent* event); - -/* - * Force destroys the event without calling NeuronEvent_wait(). - * If user wants do wait before destroying the event, they should use - * NeuronEvent_free. - * - * See NeuronExecution for information on multithreaded usage. - * - * Available since 6.0.0 - * - * @param event The event object to be destroyed. Passing NULL is acceptable and - * results in no operation. - */ -void NeuronEvent_freeForce(NeuronEvent* event); - -/** - * Waits until the execution completes. - * - * More than one thread can wait on an event. When the execution completes, - * all threads will be released. - * - * SeeNeuronExecution for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param event The event that will be signaled on completion. - * @return NEURON_NO_ERROR if the execution completed normally. - * NEURON_UNMAPPABLE if the execution input or output memory cannot - * be properly mapped. - */ -int NeuronEvent_wait(NeuronEvent* event); - -/** - * Create a NeuronEventfrom a sync_fence file descriptor. - * - * The newly created NeuronEvent does not take ownership of the provided - * sync_fence_fd, it will instead dup the provided sync_fence_fd and own the - * duplicate. - * - * @param sync_fence_fd The sync_fence file descriptor. - * @param event The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -int NeuronEvent_createFromSyncFenceFd(int sync_fence_fd, NeuronEvent** event); - -/** - * Get sync_fence file descriptor from the event. - * - * If the NeuronEvent is not backed by a sync fence, the sync_fence_fd - * will be set to -1, and NEURON_BAD_DATA will be returned. - * - * See NeuronEvent_createFromSyncFenceFd and - * NeuronExecution_startComputeWithDependencies to see how to create an event - * backed by a sync fence. 
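/*
 * Editorial aside, not part of this diff: a sketch of the device-discovery
 * flow documented above, combined with NeuronCompilation_createForDevices.
 * It assumes the (removed) Neuron Adapter header plus <stdint.h>/<stdio.h>
 * are included, that `model` was already finished via NeuronModel_finish,
 * and it simply targets the first reported device; only signatures shown in
 * this header are used.
 */
static int compile_for_first_device(NeuronModel* model,
                                    NeuronCompilation** compilation) {
  uint32_t num_devices = 0;
  if (Neuron_getDeviceCount(&num_devices) != NEURON_NO_ERROR || num_devices == 0) {
    return -1;
  }

  /* List the available devices by name, then pick the first one. */
  NeuronDevice* chosen = NULL;
  for (uint32_t i = 0; i < num_devices; ++i) {
    NeuronDevice* device = NULL;
    const char* name = NULL;
    if (Neuron_getDevice(i, &device) == NEURON_NO_ERROR &&
        NeuronDevice_getName(device, &name) == NEURON_NO_ERROR) {
      printf("device %u: %s\n", i, name);
      if (chosen == NULL) {
        chosen = device;
      }
    }
  }
  if (chosen == NULL) {
    return -1;
  }

  /* Compile for an explicit device set; per the comment above, failures on
   * these devices are then the caller's responsibility. */
  const NeuronDevice* const devices[] = {chosen};
  return NeuronCompilation_createForDevices(model, devices, 1, compilation) ==
                 NEURON_NO_ERROR
             ? 0
             : -1;
}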
- * - * The user takes ownership of the returned fd, and must close the returned file - * descriptor when it is no longer needed. - * - * @param event An event that is backed by a sync fence. - * @param sync_fence_fd The sync_fence file descriptor. The file descriptor will - * be set to -1 if there is an error. - * - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -int NeuronEvent_getSyncFenceFd(const NeuronEvent* event, int* sync_fence_fd); - -/** - * Queries whether an extension is supported by the driver implementation of the - * specified device. - * - * @param extension The extension name. - * @param isExtensionSupported The boolean value indicating whether the - * extension is supported. - * - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -// Note: Remove "device" -int NeuronDevice_getExtensionSupport( - const char* extensionName, - bool* isExtensionSupported); - -/** - * Creates an operand type from an extension name and an extension operand code. - * - * See {@link NeuronModel} for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param model The model to contain the operand. - * @param extensionName The extension name. - * @param operandCodeWithinExtension The extension operand code. - * @param type The operand type. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getExtensionOperandType( - NeuronModel* model, - const char* extensionName, - uint16_t operandCodeWithinExtension, - int32_t* type); - -/** - * Creates an operation type from an extension name and an extension operation - * code. - * - * See {@link NeuronModel} for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param model The model to contain the operation. - * @param extensionName The extension name. - * @param operationCodeWithinExtension The extension operation code. - * @param type The operation type. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getExtensionOperationType( - NeuronModel* model, - const char* extensionName, - uint16_t operationCodeWithinExtension, - int32_t* type); - -/** - * Sets extension operand parameters. - * - * Available since 5.0.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param data A pointer to the extension operand data. - * The data does not have to outlive the call to this function. - * @param length The size in bytes of the data value. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_setOperandExtensionData( - NeuronModel* model, - int32_t index, - const void* data, - size_t length); - -/** - * Gets the execution preference associated with this compilation. - * This function must be called after calling NeuronCompilation_finish. - * - * Available since 6.0.0 - * - * @param compilation The compilation to be queried. - * @param preference The execution preference will be one of NEURON_PREFER_*. - * Ignore preference value if this function doesn't return NEURON_NO_ERROR. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getPreference( - NeuronCompilation* compilation, - int* preference); - -/** - * Gets the execution priority associated with this compilation. - * This function must be called after calling NeuronCompilation_finish. - * - * Available since 6.0.0 - * - * @param compilation The compilation to be queried. - * @param priority The priority will be one of NEURON_PRIORITY_*. 
Ignore - * priority value if this function doesn't return NEURON_NO_ERROR. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getPriority( - NeuronCompilation* compilation, - int* priority); - -int NeuronCompilation_createWithOptions( - NeuronModel* model, - NeuronCompilation** compilation, - const char* options); -__END_DECLS diff --git a/backends/nxp/TARGETS b/backends/nxp/TARGETS index d56ac60242c..875f9813f43 100644 --- a/backends/nxp/TARGETS +++ b/backends/nxp/TARGETS @@ -50,7 +50,7 @@ runtime.python_library( name = "neutron_sdk", srcs = glob(["backend/**/*.py"]), deps = [ - "fbsource//third-party/pypi/neutron_converter:neutron_converter", + "fbsource//third-party/pypi/neutron_converter:neutron_converter", ], ) @@ -68,7 +68,6 @@ runtime.python_library( ":quantizer", "fbsource//third-party/pypi/flatbuffers:flatbuffers", "fbsource//third-party/pypi/ml-dtypes:ml-dtypes", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", "//executorch/exir:lib", "//executorch/backends/transforms:remove_getitem_op", "//caffe2:torch", diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py index 1ca46237814..4f036854138 100755 --- a/backends/nxp/backend/ir/converter/builder/model_builder.py +++ b/backends/nxp/backend/ir/converter/builder/model_builder.py @@ -412,6 +412,26 @@ def _make_outputs_channels_first(self): self.get_sub_graph().outputs.tmp_outputs = new_outputs + def _keep_one_empty_buffer(self): + """Create a single empty `Buffer` object and assign it to all tensors in the model that don't have static data.""" + empty_buffer = self.get_first_empty_buffer() + + for t in self.get_tensors().vector: + if tensor_has_data(t): + # The buffer of `t` is not empty. + continue + + if t.tmp_buffer == empty_buffer: + # Already optimized. + continue + + if t.is_variable: + # The data of the tensor will change at runtime, so it shouldn't share the buffer with other tensors. + continue + + # It's safe to replace the buffer. + t.tmp_buffer = empty_buffer + def finish(self) -> tflite_model.Model: """Finalize and optimize the converted TFLite model. Then return it. @@ -430,6 +450,8 @@ def finish(self) -> tflite_model.Model: self.conversion_config.optimization_blacklist, ) + self._keep_one_empty_buffer() + # Remove outputs, which are not produced by any node. Otherwise, there would be errors after inference. operator_outputs = [] for op in self.get_operators().vector: diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/keep_one_empty_buffer.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/keep_one_empty_buffer.py deleted file mode 100755 index 9809719fad4..00000000000 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/keep_one_empty_buffer.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from executorch.backends.nxp.backend.ir.converter.tensor_utils import tensor_has_data -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.base_optimization import ( - BaseOptimization, -) - - -class KeepOneEmptyBuffer(BaseOptimization): - - def __call__(self) -> bool: - """Create a single empty `Buffer` object and assign it to all tensors in the model that don't have static data. - :return: True, if any tensors had their buffer changed. Otherwise, False. 
- """ - - made_changes = False - empty_buffer = self._builder.get_first_empty_buffer() - - for t in self._builder.get_tensors().vector: - if tensor_has_data(t): - # The buffer of `t` is not empty. - continue - - if t.tmp_buffer == empty_buffer: - # Already optimized. - continue - - if t.is_variable: - # The data of the tensor will change at runtime, so it shouldn't share the buffer with other tensors. - continue - - # It's safe to replace the buffer. - t.tmp_buffer = empty_buffer - made_changes = True - - return made_changes diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py index eb4ce6a5992..d4a097ca76d 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py @@ -20,9 +20,6 @@ from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.fuse_fully_connected_and_add_operators import ( FuseFullyConnectedAndAddOperators, ) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.keep_one_empty_buffer import ( - KeepOneEmptyBuffer, -) from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.move_relu_before_concat import ( MoveActivationBeforeConcatenation, ) @@ -36,7 +33,6 @@ class Optimization(Enum): - KEEP_ONE_EMPTY_BUFFER = 0 FUSE_ACTIVATION_FUNCTIONS = 1 FUSE_FULLY_CONNECTED_AND_ADD = 2 @@ -76,9 +72,6 @@ def __init__( self._builder = builder self.optimization_map = { - Optimization.KEEP_ONE_EMPTY_BUFFER: KeepOneEmptyBuffer( - builder, conversion_config - ), Optimization.FUSE_ACTIVATION_FUNCTIONS: FuseActivationFunctions( builder, conversion_config ), diff --git a/backends/test/suite/operators/test_elu.py b/backends/test/suite/operators/test_elu.py index f768a426954..361e1382c37 100644 --- a/backends/test/suite/operators/test_elu.py +++ b/backends/test/suite/operators/test_elu.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -42,5 +44,6 @@ def test_elu_f32_multi_dim(self, flow: TestFlow) -> None: def test_elu_f32_alpha(self, flow: TestFlow) -> None: self._test_op(Model(alpha=0.5), (torch.randn(3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_elu_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_hardsigmoid.py b/backends/test/suite/operators/test_hardsigmoid.py index 238b18b1e0d..8ca254d4f61 100644 --- a/backends/test/suite/operators/test_hardsigmoid.py +++ b/backends/test/suite/operators/test_hardsigmoid.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -38,6 +40,7 @@ def test_hardsigmoid_f32_single_dim(self, flow: TestFlow) -> None: def test_hardsigmoid_f32_multi_dim(self, flow: TestFlow) -> None: self._test_op(Model(), (torch.randn(2, 3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_hardsigmoid_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_hardswish.py b/backends/test/suite/operators/test_hardswish.py index 66902791c33..a93516542c8 100644 --- a/backends/test/suite/operators/test_hardswish.py +++ b/backends/test/suite/operators/test_hardswish.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from 
executorch.backends.test.suite.flow import TestFlow @@ -38,6 +40,7 @@ def test_hardswish_f32_single_dim(self, flow: TestFlow) -> None: def test_hardswish_f32_multi_dim(self, flow: TestFlow) -> None: self._test_op(Model(), (torch.randn(2, 3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_hardswish_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_hardtanh.py b/backends/test/suite/operators/test_hardtanh.py index 2fcd1dbf563..7520c3faeae 100644 --- a/backends/test/suite/operators/test_hardtanh.py +++ b/backends/test/suite/operators/test_hardtanh.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -45,6 +47,7 @@ def test_hardtanh_f32_multi_dim(self, flow: TestFlow) -> None: def test_hardtanh_f32_custom_range(self, flow: TestFlow) -> None: self._test_op(Model(min_val=-2.0, max_val=2.0), (torch.randn(3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_hardtanh_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_leaky_relu.py b/backends/test/suite/operators/test_leaky_relu.py index 983da47bba3..79ed5425623 100644 --- a/backends/test/suite/operators/test_leaky_relu.py +++ b/backends/test/suite/operators/test_leaky_relu.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -44,6 +46,7 @@ def test_leaky_relu_f32_multi_dim(self, flow: TestFlow) -> None: def test_leaky_relu_f32_custom_slope(self, flow: TestFlow) -> None: self._test_op(Model(negative_slope=0.1), (torch.randn(3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_leaky_relu_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_relu.py b/backends/test/suite/operators/test_relu.py index c9f416f090f..3c4ef2a98d0 100644 --- a/backends/test/suite/operators/test_relu.py +++ b/backends/test/suite/operators/test_relu.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -38,5 +40,6 @@ def test_relu_f32_single_dim(self, flow: TestFlow) -> None: def test_relu_f32_multi_dim(self, flow: TestFlow) -> None: self._test_op(Model(), (torch.randn(2, 3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_relu_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_silu.py b/backends/test/suite/operators/test_silu.py index 69b6576734f..cf6d343f271 100644 --- a/backends/test/suite/operators/test_silu.py +++ b/backends/test/suite/operators/test_silu.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -38,6 +40,7 @@ def test_silu_f32_single_dim(self, flow: TestFlow) -> None: def test_silu_f32_multi_dim(self, flow: TestFlow) -> None: self._test_op(Model(), (torch.randn(2, 3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_silu_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), 
(torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_threshold.py b/backends/test/suite/operators/test_threshold.py index 42b6fb801e5..3f69a9f41fe 100644 --- a/backends/test/suite/operators/test_threshold.py +++ b/backends/test/suite/operators/test_threshold.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -51,6 +53,7 @@ def test_threshold_f32_custom_value(self, flow: TestFlow) -> None: def test_threshold_f32_custom_threshold_value(self, flow: TestFlow) -> None: self._test_op(Model(threshold=0.5, value=1.0), (torch.randn(3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_threshold_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 200d8987b19..33bf84b9066 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -59,7 +59,7 @@ foreach(fbs_file ${_xnnpack_schema__srcs}) ) endforeach() -if(WIN32 AND NOT CMAKE_CROSSCOMPILING) +if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") set(MV_COMMAND powershell -Command "Move-Item -Path ${_xnnpack_flatbuffer__outputs} -Destination ${_xnnpack_schema__outputs} -Force" diff --git a/backends/xnnpack/test/ops/test_linear.py b/backends/xnnpack/test/ops/test_linear.py index ac6fec25732..dc92a9542a9 100644 --- a/backends/xnnpack/test/ops/test_linear.py +++ b/backends/xnnpack/test/ops/test_linear.py @@ -395,7 +395,9 @@ def _test_groupwise_dq_linear( quantize_( mod, Int8DynamicActivationIntxWeightConfig( - weight_dtype=torch.int4, weight_granularity=PerGroup(group_size) + # pyre-ignore[16] + weight_dtype=torch.int4, + weight_granularity=PerGroup(group_size), ), ) unwrap_tensor_subclass(mod) diff --git a/docs/source/backends-qualcomm.md b/docs/source/backends-qualcomm.md index f427c7c7cea..45f932da491 100644 --- a/docs/source/backends-qualcomm.md +++ b/docs/source/backends-qualcomm.md @@ -385,7 +385,7 @@ example_inputs = (torch.randn(1, 3, 224, 224),) # Example input tensor Choose between quantization approaches, post training quantization (PTQ) or quantization aware training (QAT): ```python from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer -from torch.ao.quantization.quantize_pt2e import prepare_pt2e, prepare_qat_pt2e, convert_pt2e +from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e, prepare_qat_pt2e, convert_pt2e quantizer = QnnQuantizer() m = torch.export.export(model, example_inputs, strict=True).module() diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 8132751f6f0..106ab35363c 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -18,6 +18,7 @@ import torch from examples.devtools.scripts.export_bundled_program import save_bundled_program +from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner from executorch.backends.arm.quantizer import ( EthosUQuantizer, @@ -386,6 +387,7 @@ def get_compile_spec( memory_mode: Optional[str] = None, quantize: bool = False, config: Optional[str] = None, + debug_mode: Optional[str] = None, ) -> TosaCompileSpec | EthosUCompileSpec | VgfCompileSpec: compile_spec = None if target.startswith("TOSA"): @@ -414,6 +416,10 @@ def get_compile_spec( if intermediates is not None: 
compile_spec.dump_intermediate_artifacts_to(intermediates) + if debug_mode is not None: + mode = ArmCompileSpec.DebugMode[debug_mode.upper()] + compile_spec.dump_debug_info(mode) + return compile_spec @@ -601,6 +607,12 @@ def get_args(): action="store_true", help="Enable the QuantizedOpFusionPass fusion step", ) + parser.add_argument( + "--enable_debug_mode", + required=False, + choices=["json", "tosa"], + help="Flag to enable ATen-to-TOSA debug mode.", + ) args = parser.parse_args() if args.evaluate and ( @@ -735,6 +747,7 @@ def to_edge_TOSA_delegate( args.memory_mode, args.quantize, args.config, + args.enable_debug_mode, ) model_int8 = None @@ -776,6 +789,7 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_ args.memory_mode, args.quantize, args.config, + args.enable_debug_mode, ) model, exported_program = quantize_model( args, model, example_inputs, compile_spec @@ -824,12 +838,21 @@ def transform_for_cortex_m_backend(edge, args): exported_program = torch.export.export( model, example_inputs, strict=args.strict_export ) + model = exported_program.module() model_fp32 = model + model_name = os.path.basename(os.path.splitext(args.model_name)[0]) if args.intermediates: os.makedirs(args.intermediates, exist_ok=True) + # We only support Python3.10 and above, so use a later pickle protocol + torch.export.save( + exported_program, + f"{args.intermediates}/{model_name}_exported_program.pt2", + pickle_protocol=5, + ) + # Quantize if required model_int8 = None if args.delegate: @@ -862,7 +885,6 @@ def transform_for_cortex_m_backend(edge, args): else: raise e - model_name = os.path.basename(os.path.splitext(args.model_name)[0]) output_name = f"{model_name}" + ( f"_arm_delegate_{args.target}" if args.delegate is True diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index ff6f73398c3..4e4a8eeb409 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -6,30 +6,59 @@ cmake_minimum_required(VERSION 3.20) project(arm_executor_runner) -option(SEMIHOSTING "Enable semihosting" OFF) -option( - ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" - OFF -) option( ET_MODEL_PTE_ADDR "Place in memory that the PTE file is located/flashed, if set to OFF the PTE is built into the code as a big data area." 
OFF ) -option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) -option(ET_ATOL "Set atol to use for BundleIO testing" OFF) -option(ET_RTOL "Set rtol to use for BundleIO testing" OFF) -option(ET_DUMP_INPUT "Dump input in log" OFF) -option(ET_DUMP_OUTPUT "Dump output in log" ON) -option(FETCH_ETHOS_U_CONTENT - "Fetch ethos_u dependencies instead of relying on pre-downloads" ON -) + set(ET_NUM_INFERENCES "1" CACHE STRING "Number of inferences to run" ) +option(ET_LOG_DUMP_INPUT "Dump input in log" OFF) +option(ET_LOG_DUMP_OUTPUT "Dump output in log" ON) + +option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) +set(ET_ATOL + "0.01" + CACHE STRING "Set atol to use for BundleIO testing (Requires ET_BUNDLE_IO)" +) +set(ET_RTOL + "0.01" + CACHE STRING "Set rtol to use for BundleIO testing (Requires ET_BUNDLE_IO)" +) + +option( + ET_DUMP_OUTPUTS + "Collect and print outputs as a base64 buffer in the log (Requires EXECUTORCH_ENABLE_EVENT_TRACER)" + OFF +) +option( + ET_DUMP_INTERMEDIATE_OUTPUTS + "Collect and print intermediate outputs as a base64 buffer in the log (Requires EXECUTORCH_ENABLE_EVENT_TRACER)" + OFF +) +set(ET_DEBUG_BUFFER_SIZE + "2097152" + CACHE + STRING + "Size of buffer to collect intermediate outputs/outputs buffers (Requires EXECUTORCH_ENABLE_EVENT_TRACER and ET_DUMP_OUTPUTS or ET_DUMP_INTERMEDIATE_OUTPUTS)" +) + +option(SEMIHOSTING "Enable semihosting" OFF) + +option( + ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE + "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory allocation pool size" + OFF +) + +option(FETCH_ETHOS_U_CONTENT + "Fetch ethos_u dependencies instead of relying on pre-downloads" ON +) + if(NOT DEFINED ET_MODEL_PTE_ADDR AND NOT DEFINED ET_PTE_FILE_PATH AND NOT DEFINED SEMIHOSTING @@ -322,37 +351,29 @@ if(NOT ${ET_MODEL_PTE_ADDR} AND NOT SEMIHOSTING) add_dependencies(arm_executor_runner gen_model_header) endif() -if(SEMIHOSTING) - target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) -endif() - -if(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE) +if(ET_MODEL_PTE_ADDR) target_compile_definitions( - arm_executor_runner - PUBLIC - ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE} + arm_executor_runner PUBLIC -DET_MODEL_PTE_ADDR=${ET_MODEL_PTE_ADDR} ) endif() -target_compile_definitions( - arm_executor_runner - PUBLIC - ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} -) -if(DEFINED ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) +if(ET_NUM_INFERENCES) target_compile_definitions( - arm_executor_runner - PUBLIC - ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} + arm_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} ) endif() -if(ET_MODEL_PTE_ADDR) - target_compile_definitions( - arm_executor_runner PUBLIC -DET_MODEL_PTE_ADDR=${ET_MODEL_PTE_ADDR} - ) +if(ET_LOG_DUMP_INPUT) + target_compile_definitions(arm_executor_runner PUBLIC -DET_LOG_DUMP_INPUT) +endif() + +if(ET_LOG_DUMP_OUTPUT) + target_compile_definitions(arm_executor_runner PUBLIC -DET_LOG_DUMP_OUTPUT) endif() +# Devtool BundleIO: Use Bundle PTE with input and reference output included to +# check if it matches.
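For context on how such a bundled PTE is produced on the host side, the sketch below uses the ExecuTorch devtools BundledProgram flow. It is a minimal illustration only: `eager_model` and `executorch_program` are placeholders for an already-exported model, and the exact module paths and argument names may differ between ExecuTorch versions.

```python
# Minimal host-side sketch (assumed API, see the ExecuTorch devtools docs):
# bundle a test case with expected outputs so a runner built with
# ET_BUNDLE_IO can compare results against ET_ATOL / ET_RTOL.
import torch
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.serialize import (
    serialize_from_bundled_program_to_flatbuffer,
)

sample_input = (torch.randn(1, 3, 224, 224),)  # placeholder input shape
suite = MethodTestSuite(
    method_name="forward",
    test_cases=[
        MethodTestCase(
            inputs=sample_input,
            expected_outputs=eager_model(*sample_input),  # reference output
        )
    ],
)
bundled = BundledProgram(executorch_program, [suite])
with open("model.bpte", "wb") as f:
    f.write(serialize_from_bundled_program_to_flatbuffer(bundled))
```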
+ if(ET_BUNDLE_IO) target_compile_definitions(arm_executor_runner PUBLIC -DET_BUNDLE_IO) endif() @@ -365,17 +386,50 @@ if(ET_RTOL) target_compile_definitions(arm_executor_runner PUBLIC ET_RTOL=${ET_RTOL}) endif() -if(ET_DUMP_INPUT) - target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_INPUT) +# Devtools ETDump: Speed and dumping output + +if(ET_DUMP_OUTPUTS) + target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUTS) endif() -if(ET_DUMP_OUTPUT) - target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUT) +if(ET_DUMP_INTERMEDIATE_OUTPUTS) + target_compile_definitions( + arm_executor_runner PUBLIC -DET_DUMP_INTERMEDIATE_OUTPUTS + ) endif() -if(ET_NUM_INFERENCES) +if(ET_DEBUG_BUFFER_SIZE) target_compile_definitions( - arm_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} + arm_executor_runner PUBLIC ET_DEBUG_BUFFER_SIZE=${ET_DEBUG_BUFFER_SIZE} + ) +endif() + +# Semihosting FVP (FVP Simulator can access host filesystem) + +if(SEMIHOSTING) + target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) +endif() + +# Memory buffer sizes for Executorch flow + +if(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + arm_executor_runner + PUBLIC + ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE} + ) +endif() + +target_compile_definitions( + arm_executor_runner + PUBLIC + ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} +) +if(DEFINED ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + arm_executor_runner + PUBLIC + ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} ) endif() diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index d56710e27ad..696817450b5 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -6,10 +6,10 @@ * LICENSE file in the root directory of this source tree. */ -/* This is an example executorch runner running on Arm Cortex-m and Ethos-U +/* This is an example ExecuTorch runner running on Arm Cortex-M and Ethos-U * based hardware. This example tries to illustrate a few ways to use ExecuTorch * and you can use it as is or remove the unneeded parts. Please use this code - * as inpiration. + * as inspiration. * * Some defines used to configure the code: * @@ -20,24 +20,43 @@ * that is controlled by your memory mode via the * ETHOSU_MODEL cmake parameter. * If SEMIHOSTING is define this is not used - * ET_DUMP_INPUT - Control if you want input to be dumped to the log. - * ET_DUMP_OUTPUT - Control if you want output to be dumped to the log. - * ET_BUNDLE_IO - Build in devtools BundelIO, this makes it possible to + * ET_NUM_INFERENCES - Numbers of times to run the inference + * ET_LOG_DUMP_INPUT - Control if you want input to be dumped to the log. + * ET_LOG_DUMP_OUTPUT - Control if you want output to be dumped to the log. + * + * Devtool BundleIO: Use Bundle PTE with input and reference output included to + * check if it matches. + * + * ET_BUNDLE_IO - Build in Devtools BundleIO, this makes it possible to * use bpte with bundled input and output refdata to * compare output. 
* See also ET_ATOL and ET_RTOL - * ET_ATOL - The atol used to compare the output and ref data when - * using ET_BUNDLE_IO - * ET_RTOL - The rtol used to compare the output and ref data when - * using ET_BUNDLE_IO - * ET_EVENT_TRACER_ENABLED - Build in devtools event trace code to generate - * ETDump and print it base64 coded of it in the logs - * so you can get it out of your embedded target. - * This can be used to benchmark where time is spent. - * If you run on Ethos-U the delegate/commandstream - * is run in one go, this means that per op - * measurements is not possible. - * Warning: CPU time meassurements is NOT possible in the FVP simulator and a + * ET_ATOL - The atol used to compare the output and ref data + * when using ET_BUNDLE_IO + * ET_RTOL - The rtol used to compare the output and ref data + * when using ET_BUNDLE_IO + * + * Devtools ETDump: Speed and dumping output + * + * ET_EVENT_TRACER_ENABLED - Build in Devtools ETDump event trace code + * to generate cycle data and print it base64 + * coded in the log so you can get it out of + * your embedded target. This can be used to + * benchmark where time is spent. If you run + * on Ethos-U the delegate/commandstream is + * run in one go, this means that per op + * measurements are not possible. + * ET_DUMP_OUTPUTS - Collect and print outputs as a base64 buffer + * in the log, see ExecuTorch Devtools for more + * info. (Requires ET_EVENT_TRACER_ENABLED) + * ET_DUMP_INTERMEDIATE_OUTPUTS - Collect and print intermediate outputs as a + * base64 buffer in the log, see ExecuTorch + * Devtools for more info. + * (Requires ET_EVENT_TRACER_ENABLED) + * ET_DEBUG_BUFFER_SIZE - Override the size of memory area used by + * ET_DUMP_OUTPUTS or + * ET_DUMP_INTERMEDIATE_OUTPUTS + * + * Warning: CPU time measurements are NOT possible in the FVP simulator and a * real target or FPGA must be used. NPU number are roughly OK, and can be used * as guidance if timeing adaptor values are set correctly. * @@ -54,11 +73,12 @@ * left over memory after code is linked. This needs to be big enough to fit * and run your model. In our example using the FVP simulator we have much * memory and set this quite high to be able to test larger models. - * Regarding heap/mallocs type of allocation from executorch, + * Regarding heap/mallocs type of allocation from ExecuTorch, * et_pal_allocate() is not implemented or needed.
* - * ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - Size of memory area - * used when setting up the model + * ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - Size of memory area + * used when setting up + * the model * ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE - Size of memory area * used when running * inferences @@ -86,10 +106,21 @@ #if defined(ET_EVENT_TRACER_ENABLED) #include + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) +#include + +#if !defined(ET_DEBUG_BUFFER_SIZE) +#define ET_DEBUG_BUFFER_SIZE (2 * 1024 * 1024) +#endif + +#endif + #if !defined(SEMIHOSTING) #include #endif -#endif + +#endif // defined(ET_EVENT_TRACER_ENABLED) #if defined(SEMIHOSTING) @@ -158,8 +189,10 @@ using executorch::bundled_program::ErrorStats; using executorch::bundled_program::verify_method_outputs; #endif #if defined(ET_EVENT_TRACER_ENABLED) +using executorch::etdump::BufferDataSink; using executorch::etdump::ETDumpGen; using executorch::etdump::ETDumpResult; +using executorch::runtime::EventTracerDebugLogLevel; using torch::executor::etdump_result; #endif /** @@ -505,6 +538,9 @@ struct RunnerContext { Box> method; #if defined(ET_EVENT_TRACER_ENABLED) Box etdump_gen; +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + void* debug_buffer; +#endif #endif #if defined(SEMIHOSTING) Box input_file_allocator; @@ -622,7 +658,60 @@ void runner_init( ET_LOG(Info, "Setting up ETDump"); ctx.etdump_gen.reset(); event_tracer_ptr = &ctx.etdump_gen.value(); -#endif + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + // Alloc debug buffer and create if and only if we need to log intermediate + // tensor outputs + ctx.debug_buffer = ctx.method_allocator->allocate(ET_DEBUG_BUFFER_SIZE, 16); + if (ctx.debug_buffer != nullptr) { + Span debug_buffer_span( + (uint8_t*)ctx.debug_buffer, ET_DEBUG_BUFFER_SIZE); + + Result result = + ctx.etdump_gen.value().set_debug_buffer(debug_buffer_span); + + if (result.ok()) { + // Everything worked, we got the buffer setup, lets enable output logging + // depending on the compile flag ET_DUMP_INTERMEDIATE_OUTPUTS e.g. + // kIntermediateOutputs or kProgramOutputs +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + ET_LOG( + Info, + "ETDump: Allocated intermediate output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); +#else // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + ET_LOG( + Info, + "ETDump: Allocated output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kProgramOutputs); +#endif // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + + } else { + // set_debug_buffer() failed + // Here we would free ctx.debug_buffer if it was possible, but we can't as + // the allocator don't support it. 
+ ctx.debug_buffer = nullptr; + ET_LOG( + Error, + "ETDump: Could not set_debug_buffer() for output buffer size %zu error:0x%" PRIx32, + ET_DEBUG_BUFFER_SIZE, + result.error()); + } + } else { + // debug buffer allocation failed + ET_LOG( + Error, + "ETDump: Could not allocate memory for output buffer size %zu", + ET_DEBUG_BUFFER_SIZE); + } +#endif // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) +#endif // defined(ET_EVENT_TRACER_ENABLED) ctx.method.reset( program->load_method(ctx.method_name, &memory_manager, event_tracer_ptr)); @@ -660,7 +749,7 @@ void runner_init( ET_CHECK_MSG( status == Error::Ok, "Failed to prepare inputs 0x%" PRIx32, status); } -#if defined(ET_DUMP_INPUT) +#if defined(ET_LOG_DUMP_INPUT) { std::vector inputs((*ctx.method.value())->inputs_size()); ET_LOG(Info, "%zu inputs: ", inputs.size()); @@ -712,7 +801,7 @@ void runner_init( ET_LOG(Info, "Input prepared."); } -void log_mem_status(const RunnerContext& ctx) { +void log_mem_status(RunnerContext& ctx) { size_t executor_memsize = ctx.method_allocator->used_size() - ctx.executor_membase; @@ -765,6 +854,20 @@ void log_mem_status(const RunnerContext& ctx) { if (ctx.temp_allocator->size() > 0) { ET_LOG(Info, "temp_allocator: %zu", ctx.temp_allocator->size()); } +#if defined(ET_EVENT_TRACER_ENABLED) +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (ctx.debug_buffer != nullptr) { + size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + ET_LOG( + Info, + "ETDump_outputs_buffer: %zu / %zu free: %zu ( used: %zu %% ) ", + outputdump_len, + ET_DEBUG_BUFFER_SIZE, + ET_DEBUG_BUFFER_SIZE - outputdump_len, + 100 * outputdump_len / ET_DEBUG_BUFFER_SIZE); + } +#endif +#endif } void print_outputs(RunnerContext& ctx) { @@ -779,7 +882,7 @@ void print_outputs(RunnerContext& ctx) { if (outputs[i].isTensor()) { Tensor tensor = outputs[i].toTensor(); #if !defined(SEMIHOSTING) -#if defined(ET_DUMP_OUTPUT) +#if defined(ET_LOG_DUMP_OUTPUT) // The output might be collected and parsed so printf() is used instead // of ET_LOG() here for (int j = 0; j < tensor.numel(); ++j) { @@ -811,7 +914,7 @@ void print_outputs(RunnerContext& ctx) { } } #endif -#else +#else //! defined(SEMIHOSTING) char out_filename[255]; snprintf(out_filename, 255, "%s-%d.bin", ctx.output_basename, i); ET_LOG(Info, "Writing output to file: %s", out_filename); @@ -819,7 +922,7 @@ void print_outputs(RunnerContext& ctx) { auto written_size = fwrite(tensor.const_data_ptr(), 1, tensor.nbytes(), out_file); fclose(out_file); -#endif +#endif //! defined(SEMIHOSTING) } else { printf("Output[%d]: Not Tensor\n", i); } @@ -835,29 +938,96 @@ void write_etdump(RunnerContext& ctx) { if (result.buf != nullptr && result.size > 0) { // On a device with no file system we can't just write it out // to the file-system so we base64 encode it and dump it on the log. + bool dump_outputs = false; int mode = base64_enc_modifier_padding | base64_dec_modifier_skipspace; - size_t len = result.size; - size_t encoded_len = base64_encoded_size(result.size, mode); + size_t etdump_len = result.size; + size_t encoded_etdump_len = base64_encoded_size(etdump_len, mode); + size_t base64buffer_len = encoded_etdump_len; +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + // Make base64 buffer fit both so it can be reused istead of allocating two + // buffers. 
+ size_t outputdump_len = 0; + size_t encoded_outputdump_len = 0; + if (ctx.debug_buffer != nullptr) { + outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + if (outputdump_len > 0) { + encoded_outputdump_len = base64_encoded_size(outputdump_len, mode); + if (encoded_outputdump_len > 0) { + base64buffer_len = + std::max(encoded_etdump_len, encoded_outputdump_len); + dump_outputs = true; + } else { + ET_LOG( + Error, + "Problem getting the size of the base64 ETDump output buffers"); + } + } else { + ET_LOG(Error, "No ETDump output buffers saved in the data area"); + } + } +#endif + ET_LOG(Info, "[base64] buffer size: %d", base64buffer_len); + uint8_t* encoded_buf = reinterpret_cast( - ctx.method_allocator->allocate(encoded_len + 1)); + ctx.method_allocator->allocate(base64buffer_len + 1)); if (encoded_buf != nullptr) { - int ret = base64_encode( - encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode); - encoded_buf[encoded_len] = 0x00; // Ensure null termination - ET_LOG(Info, "Writing etdump.bin [base64]"); + int ret; + const char* debug_buffer_flag = ""; + printf("#[RUN THIS]\n"); +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (dump_outputs) { + ret = base64_encode( + encoded_buf, + (uint8_t*)ctx.debug_buffer, + &encoded_outputdump_len, + &outputdump_len, + mode); + encoded_buf[encoded_outputdump_len] = 0x00; // Ensure null termination + printf("# Writing debug_buffer.bin [base64]\n"); + printf("echo \"%s\" | base64 -d >debug_buffer.bin\n", encoded_buf); + debug_buffer_flag = "--debug_buffer_path debug_buffer.bin"; + } +#endif + ret = base64_encode( + encoded_buf, + (uint8_t*)result.buf, + &encoded_etdump_len, + &etdump_len, + mode); + encoded_buf[encoded_etdump_len] = 0x00; // Ensure null termination + printf("# Writing etdump.bin [base64]\n"); + printf("echo \"%s\" | base64 -d >etdump.bin\n", encoded_buf); + + printf("# Generate cpu cycle table with:\n"); printf( - "#[RUN THIS]\necho \"%s\" | base64 -d >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles\n#[END]\n", - encoded_buf); + "python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin %s --source_time_scale cycles --target_time_scale cycles\n", + debug_buffer_flag); + printf("#[END]\n"); + } else { ET_LOG( Error, "Could not allocate memory etdump base64 encoding size %zu", - encoded_len + 1); + encoded_etdump_len + 1); } } -#else - // Dump the etdump data containing profiling/debugging data to the specified - // file. +#else // !defined(SEMIHOSTING) +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (ctx.debug_buffer != nullptr) { + // Dump the etdump outputs data to a file. + size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + const char* etdump_output_filename = "debug_buffer.bin"; + ET_LOG( + Info, + "Writing etdump debug_buffer to file: %s", + etdump_output_filename); + FILE* f = fopen(etdump_output_filename, "w+"); + fwrite((uint8_t*)ctx.debug_buffer, 1, outputdump_len, f); + fclose(f); + } +#endif + + // Dump the etdump data containing profiling/debugging data to a file. 
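As a companion to the commands printed above, the host-side analysis can also be driven from Python. This is a sketch under the assumption that the devtools `Inspector` accepts the same inputs as `inspector_cli` (an `etdump_path` plus, when output dumping was enabled, a `debug_buffer_path`); parameter names may vary between releases.

```python
# Host-side sketch: after piping the printed base64 blobs through
# `base64 -d` to produce etdump.bin (and optionally debug_buffer.bin),
# load them with the devtools Inspector. Assumed to mirror inspector_cli.
from executorch.devtools import Inspector

inspector = Inspector(
    etdump_path="etdump.bin",
    debug_buffer_path="debug_buffer.bin",  # only if ET_DUMP_OUTPUTS was set
)
inspector.print_data_tabular()  # per-event timing (cycle counts on Ethos-U runs)
```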
etdump_result result = ctx.etdump_gen->get_etdump_data(); if (result.buf != nullptr && result.size > 0) { // On a device with a file system we can just write it out @@ -869,11 +1039,12 @@ void write_etdump(RunnerContext& ctx) { fclose(f); free(result.buf); } -#endif -#endif +#endif // !defined(SEMIHOSTING) +#endif // defined(ET_EVENT_TRACER_ENABLED) } -void verify_result(RunnerContext& ctx, const void* model_pte) { +bool verify_result(RunnerContext& ctx, const void* model_pte) { + bool model_ok = false; #if defined(ET_BUNDLE_IO) if (ctx.bundle_io) { // Check result @@ -899,6 +1070,7 @@ void verify_result(RunnerContext& ctx, const void* model_pte) { if (status == Error::Ok) { ET_LOG(Info, "Model output match expected BundleIO bpte ref data."); ET_LOG(Info, "TEST: BundleIO index[%d] Test_result: PASS", testset_idx); + model_ok = true; } else { ET_LOG( Error, @@ -906,19 +1078,24 @@ void verify_result(RunnerContext& ctx, const void* model_pte) { et_rtol, et_atol); ET_LOG(Error, "TEST: BundleIO index[%d] Test_result: FAIL", testset_idx); + ET_LOG( + Error, "Bundle verification failed with status 0x%" PRIx32, status); + model_ok = false; } - ET_CHECK_MSG( - status == Error::Ok, - "Bundle verification failed with status 0x%" PRIx32, - status); + } else { + // No checking done, assume true + model_ok = true; } -#else +#else // defined(ET_BUNDLE_IO) (void)ctx; (void)model_pte; -#endif + // No checking done, assume true + model_ok = true; +#endif // defined(ET_BUNDLE_IO) + return model_ok; } -void run_model(RunnerContext& ctx, const void* model_pte) { +bool run_model(RunnerContext& ctx, const void* model_pte) { Error status; ET_LOG(Info, "Starting running %d inferences...", num_inferences); int n = 0; @@ -946,7 +1123,10 @@ void run_model(RunnerContext& ctx, const void* model_pte) { ET_LOG(Info, "%d inferences finished", num_inferences); print_outputs(ctx); - verify_result(ctx, model_pte); + bool model_ok = verify_result(ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + + return model_ok; } } // namespace @@ -1047,10 +1227,14 @@ int main(int argc, const char* argv[]) { model_pte[7]); runner_init(ctx, input_buffers, pte_size); - run_model(ctx, model_pte); + bool model_ok = run_model(ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + log_mem_status(ctx); write_etdump(ctx); + ET_CHECK_MSG(model_ok == true, "Problem running model"); + ET_LOG(Info, "Program complete, exiting."); #if defined(SEMIHOSTING) _exit(0); diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 77dddfe6451..8f5dec85ad4 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -41,6 +41,7 @@ scratch_dir_set=false toolchain=arm-none-eabi-gcc select_ops_list="aten::_softmax.out" qdq_fusion_op=false +model_explorer=false function help() { echo "Usage: $(basename $0) [options]" @@ -52,7 +53,7 @@ function help() { echo " --no_delegate Do not delegate the model (can't override builtin models)" echo " --no_quantize Do not quantize the model (can't override builtin models)" echo " --portable_kernels= TO BE DEPRECATED: Alias to select_ops_list." - echo " --select_ops_list= Comma separated list of portable (non delagated) kernels to include Default: ${select_ops_list}" + echo " --select_ops_list= Comma separated list of portable (non delegated) kernels to include Default: ${select_ops_list}" echo " NOTE: This is used when select_ops_model is not possible to use, e.g. for semihosting or bundleio." echo " See https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html for more information." 
echo " --target= Target to build and run for Default: ${target}" @@ -71,6 +72,7 @@ function help() { echo " --et_build_root= Executorch build output root folder to use, defaults to ${et_build_root}" echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default ${ethos_u_scratch_dir}" echo " --qdq_fusion_op Enable QDQ fusion op" + echo " --model_explorer Generate and open a visual graph of the compiled model." exit 0 } @@ -99,6 +101,7 @@ for arg in "$@"; do --et_build_root=*) et_build_root="${arg#*=}";; --scratch-dir=*) ethos_u_scratch_dir="${arg#*=}" ; scratch_dir_set=true ;; --qdq_fusion_op) qdq_fusion_op=true;; + --model_explorer) model_explorer=true ;; *) ;; esac @@ -289,6 +292,12 @@ for i in "${!test_model[@]}"; do pte_file=$(realpath ${pte_file}) + if [ "${etrecord_flag}" != "" ] ; then + etrecord_filename="${output_folder}/${model_filename}_etrecord.bin" + etrecord_filename=$(realpath ${etrecord_filename}) + etrecord_flag="--etrecord=${etrecord_filename}" + fi + [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } echo "pte_data_size: $(wc -c ${pte_file})" echo "pte_file: ${pte_file}" @@ -322,10 +331,16 @@ for i in "${!test_model[@]}"; do backends/arm/scripts/build_executor_runner.sh --et_build_root="${et_build_root}" --pte="${pte_file_or_mem}" --build_type=${build_type} --target=${target} --system_config=${system_config} --memory_mode=${memory_mode} ${bundleio_flag} ${et_dump_flag} --extra_build_flags="${extra_build_flags}" --ethosu_tools_dir="${ethos_u_scratch_dir}" --toolchain="${toolchain}" --select_ops_list="${select_ops_list}" if [ "$build_only" = false ] ; then # Execute the executor_runner on FVP Simulator - backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target + + backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target ${etrecord_flag} fi set +x fi + + if [ "$model_explorer" = true ]; then + tosa_flatbuffer_path=$(find ${output_folder} -name "*TOSA*.tosa" | head -n 1) + python3 ${script_dir}/visualize.py ${tosa_flatbuffer_path} + fi done exit 0 diff --git a/examples/arm/visualize.py b/examples/arm/visualize.py new file mode 100644 index 00000000000..51fca5b3895 --- /dev/null +++ b/examples/arm/visualize.py @@ -0,0 +1,32 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse + +import model_explorer + +from executorch.devtools.visualization.visualization_utils import ( + visualize_model_explorer, +) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Visualize a model using model explorer." + ) + parser.add_argument("model_path", type=str, help="Path to the model file.") + args = parser.parse_args() + + config = model_explorer.config() + (config.add_model_from_path(args.model_path)) + + visualize_model_explorer( + config=config, + extensions=["tosa_adapter_model_explorer"], + ) + + +if __name__ == "__main__": + main() diff --git a/examples/mediatek/README.md b/examples/mediatek/README.md index c03ab0ec48c..ffc91e10e60 100755 --- a/examples/mediatek/README.md +++ b/examples/mediatek/README.md @@ -26,44 +26,61 @@ examples/mediatek # Examples Build Instructions ## Environment Setup -- Follow the instructions of **Prerequisites** and **Setup** in `backends/mediatek/scripts/README.md`. +- Follow the instructions in `backends/mediatek/README.md` to build the backend library `libneuron_backend.so`. 
-- Build required libraries by `backends/mediatek/scripts/mtk_build.sh` before building examples. - -## Build MediaTek Examples -1. Build the backend and the examples by exedcuting the script: +## Build MediaTek Runners +1. Build the mediatek model runner by executing the script: ```bash ./mtk_build_examples.sh ``` +This will generate the required runners in `executorch/cmake-android-out/examples/mediatek/` -## LLaMa Example Instructions +## Model Export Instructions ##### Note: Verify that localhost connection is available before running AoT Flow -1. Exporting Models to `.pte` -- In the `examples/mediatek directory`, run: +1. Download Required Files +- Download the model files from the official Hugging Face website, and move the files to the respective folder in `examples/mediatek/models/llm_models/weights/` **EXCEPT** the `config.json` file. + - The `config.json` file is already included in the model folders, which may include some modifications required for the model exportation. +- Include the calibration data (if any) under `aot_utils/llm_utils/prompts/` + +2. Exporting Models to `.pte` +- In the `examples/mediatek/ directory`, run: ```bash -source shell_scripts/export_llama.sh +source shell_scripts/export_.sh ``` - Defaults: - - `model_name` = llama3 + - `model_name` = Depends on model family. Check respective `shell_scripts/export_.sh` for info. - `num_chunks` = 4 - `prompt_num_tokens` = 128 - - `cache_size` = 1024 - - `calibration_set_name` = None + - `cache_size` = 512 + - `calibration_data_file` = None + - `precision` = A16W4 + - `platform` = DX4 + - Argument Explanations/Options: - - `model_name`: llama2/llama3 - **Note: Currently Only Tested on Llama2 7B Chat and Llama3 8B Instruct.** - - `num_chunks`: Number of chunks to split the model into. Each chunk contains the same number of decoder layers. Will result in `num_chunks` number of `.pte` files being generated. Typical values are 1, 2 and 4. + - `model_name`: View list 'Available model names' below. + - `num_chunks`: Number of chunks to split the model into. Each chunk contains the same number of decoder layers. Typical values are 1, 2 and 4. - `prompt_num_tokens`: Number of tokens (> 1) consumed each forward pass for the prompt processing stage. - `cache_size`: Cache Size. - - `calibration_set_name`: Name of calibration dataset with extension that is found inside the `aot_utils/llm_utils/prompts` directory. Example: `alpaca.txt`. If `"None"`, will use dummy data to calibrate. + - `calibration_data_file`: Name of calibration dataset with extension that is found inside the `aot_utils/llm_utils/prompts/` directory. Example: `alpaca.txt`. If `"None"`, will use dummy data to calibrate. + - `precision`: Quantization precision for the model. Available options are `["A16W4", "A16W8", "A16W16", "A8W4", "A8W8"]` + - `platform`: The platform of the device. `DX4` for Mediatek Dimensity 9400 and `DX3` for Mediatek Dimensity 9300. **Note: Export script example only tested on `.txt` file.** -2. `.pte` files will be generated in `examples/mediatek/pte` - - Users should expect `num_chunks*2` number of pte files (half of them for prompt and half of them for generation). - - Generation `.pte` files have "`1t`" in their names. - - Additionally, an embedding bin file will be generated in the weights folder where the `config.json` can be found in. 
[`examples/mediatek/models/llm_models/weights//embedding__fp32.bin`] +- Available model names: + - Llama: + - llama3.2-3b, llama3.2-1b, llama3, llama2 + - Qwen: + - Qwen3-4B, Qwen3-1.7B, Qwen2-7B-Instruct, Qwen2.5-3B, Qwen2.5-0.5B-Instruct, Qwen2-1.5B-Instruct + - Gemma: + - gemma2, gemma3 + - Phi: + - phi3.5, phi4 + +3. `.pte` files will be generated in `examples/mediatek/pte/` + - Users should expect `num_chunks` number of pte files. + - An embedding bin file will be generated in the weights folder where the `config.json` can be found in. [`examples/mediatek/models/llm_models/weights//embedding__fp32.bin`] - eg. For `llama3-8B-instruct`, embedding bin generated in `examples/mediatek/models/llm_models/weights/llama3-8B-instruct/` - - AoT flow will take roughly 2.5 hours (114GB RAM for `num_chunks=4`) to complete (Results will vary by device/hardware configurations) + - AoT flow will take around 30 minutes to 2.5 hours to complete (Results will vary depending on device/hardware configurations and model sizes) ### oss 1. Exporting Model to `.pte` @@ -74,26 +91,31 @@ bash shell_scripts/export_oss.sh - `model_name`: deeplabv3/edsr/inceptionv3/inceptionv4/mobilenetv2/mobilenetv3/resnet18/resnet50/dcgan/wav2letter/vit_b_16/mobilebert/emformer_rnnt/bert/distilbert # Runtime -## Environment Setup - -To set up the build environment for the `mtk_executor_runner`: - -1. Navigate to the `backends/mediatek/scripts` directory within the repository. -2. Follow the detailed build steps provided in that location. -3. Upon successful completion of the build steps, the `mtk_executor_runner` binary will be generated. - ## Deploying and Running on the Device ### Pushing Files to the Device -Transfer the `.pte` model files and the `mtk_executor_runner` binary to your Android device using the following commands: +Transfer the directory containing the `.pte` model files, the `run__sample.sh` script, the `embedding__fp32.bin`, the tokenizer file, the `mtk_llama_executor_runner` binary and the 3 `.so` files to your Android device using the following commands: ```bash -adb push mtk_executor_runner -adb push .pte +adb push mtk_llama_executor_runner +adb push examples/mediatek/executor_runner/run__sample.sh +adb push embedding__fp32.bin +adb push tokenizer.model +adb push ``` -Make sure to replace `` with the actual name of your model file. And, replace the `` with the desired detination on the device. +Make sure to replace `` with the actual name of your directory containing pte files. And, replace the `` with the desired detination on the device. + +At this point your phone directory should have the following files: +- libneuron_backend.so +- libneuronusdk_adapter.mtk.so +- libneuron_buffer_allocator.so +- mtk_llama_executor_runner +- +- tokenizer.json / tokenizer.model(for llama3) / tokenizer.bin(for phi3 and gemma2) +- embedding__fp32.bin +- run__sample.sh ##### Note: For oss models, please push additional files to your Android device ```bash @@ -107,12 +129,13 @@ for i in input*bin; do adb push "$i" ; done; Execute the model on your Android device by running: ```bash -adb shell "/data/local/tmp/mtk_executor_runner --model_path /data/local/tmp/.pte --iteration " +adb shell +cd +sh run__sample.sh ``` +#### Note: The `mtk_llama_executor_runner` is applicable to the models listed in `examples/mediatek/models/llm_models/weights/`. -In the command above, replace `` with the name of your model file and `` with the desired number of iterations to run the model. 
- -##### Note: For llama models, please use `mtk_llama_executor_runner`. Refer to `examples/mediatek/executor_runner/run_llama3_sample.sh` for reference. +##### Note: For non-LLM models, please run `adb shell "/data/local/tmp/mtk_executor_runner --model_path /data/local/tmp/.pte --iteration "`. ##### Note: For oss models, please use `mtk_oss_executor_runner`. ```bash adb shell "/data/local/tmp/mtk_oss_executor_runner --model_path /data/local/tmp/.pte --input_list /data/local/tmp/input_list.txt --output_folder /data/local/tmp/output_" diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h index 5dd8a85005e..e82b36d9373 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.h +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -66,6 +66,8 @@ class MTKLlamaRunner : public executorch::extension::llm::IRunner { std::function token_callback); std::unique_ptr load_tokenizer(); + void reset() {} + private: // model const LlamaModelOptions modeloptions_; diff --git a/examples/mediatek/executor_runner/run_phi4_sample.sh b/examples/mediatek/executor_runner/run_phi4_sample.sh index a6d9824e178..16c4f70009c 100644 --- a/examples/mediatek/executor_runner/run_phi4_sample.sh +++ b/examples/mediatek/executor_runner/run_phi4_sample.sh @@ -49,7 +49,7 @@ chmod +x mtk_llama_executor_runner export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD -./mtk_llama_executor_runner_longrope \ +./mtk_llama_executor_runner \ --max_response=$MAX_RESPONSE \ --prompt_token_batch_size=$PROMPT_TOKEN_BATCH_SIZE \ --cache_size=$CACHE_SIZE \ diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp index 25b840f260b..078d938ffde 100644 --- a/examples/models/llama/main.cpp +++ b/examples/models/llama/main.cpp @@ -105,6 +105,8 @@ int32_t main(int32_t argc, char** argv) { ET_LOG(Error, "Failed to warmup llama runner"); return 1; } + // reset kv cache pos to 0 + runner->reset(); } // generate executorch::extension::llm::GenerationConfig config{ diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py index 835972b7f3e..8b76b7650fe 100644 --- a/examples/models/llama/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -135,6 +135,7 @@ def quantize( # noqa C901 PerAxis(0) if group_size == 0 else PerGroup(group_size) ), weight_mapping_type=MappingType.SYMMETRIC, + # pyre-ignore[6] intx_packing_format="opaque_torchao_auto", ), ) @@ -154,12 +155,23 @@ def quantize( # noqa C901 from torchao.quantization.granularity import PerGroup from torchao.utils import unwrap_tensor_subclass + def filter_fn(m, fqn): + is_linear = isinstance(m, nn.Linear) + has_shape_compatible_with_group_size = False + if is_linear: + has_shape_compatible_with_group_size = ( + m.weight.shape[1] % group_size == 0 + ) + return is_linear and has_shape_compatible_with_group_size + quantize_( model, Int8DynamicActivationIntxWeightConfig( + # pyre-ignore[16] weight_dtype=torch.int4, weight_granularity=PerGroup(group_size), ), + filter_fn=filter_fn, ) model = unwrap_tensor_subclass(model) diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index f903e0f2ecf..7e571087c1d 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -77,7 +77,7 @@ def __init__(self, llava): super().__init__() self.text_model = llava.text_model - def forward(self, input_pos, embeddings): + def forward(self, 
embeddings, input_pos): return self.text_model(None, {"input_pos": input_pos}, embeddings) llava_text_model = LlavaTextModel(llava) @@ -88,7 +88,7 @@ def forward(self, input_pos, embeddings): max_seq_len=llava.text_model_args.max_seq_len, dtype=DType.fp32, use_kv_cache=True, - example_inputs=(torch.tensor([0], dtype=torch.int64), embeddings), + example_inputs=(embeddings, torch.tensor([0], dtype=torch.int64)), dynamic_shapes=dynamic_shapes, ) diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 3973d756e9c..9ff56124174 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -405,5 +405,5 @@ def _get_image_dynamic_shapes(self): def _get_prompt_dynamic_shapes(self): dim = torch.export.Dim("token_dim", min=2, max=self.max_seq_len) - text_model_dynamic_shapes = ({0: 1}, {1: dim}) + text_model_dynamic_shapes = ({1: dim}, {0: 1}) return text_model_dynamic_shapes diff --git a/examples/models/llava/runner/llava_image_prefiller.h b/examples/models/llava/runner/llava_image_prefiller.h index 9edfab85904..f5f316d0cac 100644 --- a/examples/models/llava/runner/llava_image_prefiller.h +++ b/examples/models/llava/runner/llava_image_prefiller.h @@ -47,7 +47,7 @@ class ET_EXPERIMENTAL LlavaImagePrefiller { // Run text model auto outputs_res = ET_UNWRAP(module_->execute( - kTextModelMethod, {start_pos_tensor, image_encoder_outputs[0]})); + kTextModelMethod, {image_encoder_outputs[0], start_pos_tensor})); ET_CHECK_MSG( outputs_res[0].isTensor(), "Non Tensor Output returned from executing image prefill"); diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h index cfa92e0c253..691e2f4aa1e 100644 --- a/examples/models/llava/runner/llava_text_decoder_runner.h +++ b/examples/models/llava/runner/llava_text_decoder_runner.h @@ -34,7 +34,7 @@ class ET_EXPERIMENTAL LlavaTextDecoderRunner &start_pos, {1}, executorch::aten::ScalarType::Long); // run text model auto outputs_res = ET_UNWRAP(module_->execute( - kTextModelMethod, {start_pos_tensor, token_embedding_outputs[0]})); + kTextModelMethod, {token_embedding_outputs[0], start_pos_tensor})); ET_CHECK_MSG( outputs_res.size() == 1, diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py index def9eaa02bd..7f2b59e0116 100644 --- a/examples/models/llava/test/test_llava.py +++ b/examples/models/llava/test/test_llava.py @@ -97,7 +97,7 @@ def test_llava_export(self): )[0] llava_module.run_method( "text_decoder", - (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img), + (pte_embeds_before_img, torch.tensor([start_pos], dtype=torch.int64)), ) # Update the start_pos. start_pos is used in kv cache. The source of truth @@ -109,8 +109,8 @@ def test_llava_export(self): llava_module.run_method( "text_decoder", ( - torch.tensor([start_pos], dtype=torch.int64), pte_embeds_img, + torch.tensor([start_pos], dtype=torch.int64), ), ) @@ -123,7 +123,7 @@ def test_llava_export(self): )[0] pte_prefill_after_img = llava_module.run_method( "text_decoder", - (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img), + (pte_embeds_after_img, torch.tensor([start_pos], dtype=torch.int64)), )[0] # Update the logits for each prefill (kv cache) step. 
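The hunks above and below all make the same change: the exported `text_decoder` method now takes the token embeddings first and the KV-cache position second. A condensed view of the new calling convention, using the objects from this test as placeholders:

```python
# New argument order for the "text_decoder" method: (embeddings, input_pos).
# `llava_module`, `embeds` and `start_pos` stand in for the objects used in
# this test; previously the position tensor was passed first.
import torch

logits = llava_module.run_method(
    "text_decoder",
    (embeds, torch.tensor([start_pos], dtype=torch.int64)),
)[0]
```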
@@ -140,7 +140,7 @@ def test_llava_export(self): )[0] logits = llava_module.run_method( "text_decoder", - (torch.tensor([start_pos + i], dtype=torch.int64), token_embeds), + (token_embeds, torch.tensor([start_pos + i], dtype=torch.int64)), )[0] new_tokens.append(torch.argmax(logits).item()) diff --git a/examples/models/voxtral/README.md b/examples/models/voxtral/README.md index 5bc675e0615..5f4eeb2ff95 100644 --- a/examples/models/voxtral/README.md +++ b/examples/models/voxtral/README.md @@ -54,7 +54,8 @@ The exported model takes in a mel spectrogram input tensor as its audio inputs. We provide a simple way to transform raw audio data into a mel spectrogram by exporting a version of Voxtral's audio preprocessor used directly by Transformers. ``` -python -m executorch.extension.audio.mel_spectrogram --feature_size 128 --output_file voxtral_preprocessor.pte +# Export a preprocessor that can handle audio up to 5 mins (300s). +python -m executorch.extension.audio.mel_spectrogram --feature_size 128 --stack_output --max_audio_len 300 --output_file voxtral_preprocessor.pte ``` ## Building the multimodal runner diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 26e70c90f38..47f9f0cfb38 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -568,6 +568,8 @@ int main(int argc, char** argv) { ET_LOG( Info, "Input list not provided. Inputs prepared with default values set."); + + // Run the method Error status = method->execute(); ET_CHECK_MSG( status == Error::Ok, @@ -575,6 +577,31 @@ int main(int argc, char** argv) { method_name, (int)status); ET_LOG(Info, "Model executed successfully."); + + // Warm up + ET_LOG(Info, "Perform %d inferences for warming up", FLAGS_warm_up); + for (int i = 0; i < FLAGS_warm_up; ++i) { + status = method->execute(); + } + + // Inference with designated iterations + auto before_exec = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < FLAGS_iteration; ++i) { + status = method->execute(); + } + auto after_exec = std::chrono::high_resolution_clock::now(); + double interval_infs = + std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0; + + ET_LOG( + Info, + "%d inferences took %f ms, avg %f ms", + FLAGS_iteration, + interval_infs, + interval_infs / (float)FLAGS_iteration); } // Dump the etdump data containing profiling/debugging data to the specified diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 0c9be4d441d..253e083a80e 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -354,17 +354,6 @@ Error Runner::generate( const llm::GenerationConfig& config, std::function token_callback, std::function stats_callback) { - return generate_from_pos(prompt, 0, config, token_callback, stats_callback); -} - -template -Error Runner::generate_from_pos( - const std::string& prompt, - int64_t start_pos, - const llm::GenerationConfig& config, - std::function token_callback, - std::function stats_callback) { - // TODO: currently only support start_pos == 0 return generate_from_prompt_or_file( prompt, false, config, token_callback, stats_callback); } @@ -435,7 +424,8 @@ Error Runner::generate_from_prompt_or_file( stats_.first_token_ms = time_in_ms(); stats_.prompt_eval_end_ms = time_in_ms(); - // print the first token from prefill. 
No prev_token so use cur_token for it. + // print the first token from prefill. No prev_token so use cur_token for + // it. if (token_callback) { token_callback( ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h index 30fba71ecef..9f290d79c75 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h @@ -72,13 +72,7 @@ class Runner : public executorch::extension::llm::IRunner { std::function token_callback = {}, std::function stats_callback = {}) override; - executorch::runtime::Error generate_from_pos( - const std::string& prompt, - int64_t start_pos, - const executorch::extension::llm::GenerationConfig& config, - std::function token_callback = {}, - std::function stats_callback = {}) - override; + executorch::runtime::Error generate_from_prompt_or_file( const std::string& prompt, bool tokenized_prompt, @@ -86,6 +80,7 @@ class Runner : public executorch::extension::llm::IRunner { std::function token_callback = {}, std::function stats_callback = {}); void stop() override {}; + void reset() override {}; executorch::runtime::Result get_decoder_model_version(); private: diff --git a/export/target_recipes.py b/export/target_recipes.py index 76e0cacc7b4..0a5ae9ce754 100644 --- a/export/target_recipes.py +++ b/export/target_recipes.py @@ -11,26 +11,32 @@ selection and combine multiple backends optimally for target hardware. """ +import sys from typing import Dict, List -import coremltools as ct +if sys.platform != "win32": + import coremltools as ct + from executorch.backends.apple.coreml.recipes import CoreMLRecipeType # pyre-ignore -from executorch.backends.apple.coreml.recipes import CoreMLRecipeType from executorch.backends.xnnpack.recipes import XNNPackRecipeType from executorch.export.recipe import ExportRecipe, RecipeType ## IOS Target configs # The following list of recipes are not exhaustive for CoreML; refer to CoreMLRecipeType for more detailed recipes. 
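With the import now gated on `sys.platform`, Core ML recipes are simply unavailable on Windows while the XNNPACK path keeps working. A hedged usage sketch follows; the exact signature of `get_ios_recipe` is assumed from its use in the tests below, and the key string comes from `IOS_CONFIGS`.

```python
# Sketch: resolve an iOS target recipe by key, guarding for platforms where
# coremltools cannot be imported. get_ios_recipe's signature is assumed here.
import sys

from executorch.export.target_recipes import get_ios_recipe

if sys.platform != "win32":
    recipe = get_ios_recipe("ios-arm64-coreml-fp16")
else:
    raise RuntimeError("Core ML recipes are not available on Windows")
```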
-IOS_CONFIGS: Dict[str, List[RecipeType]] = { - # pyre-ignore - "ios-arm64-coreml-fp32": [CoreMLRecipeType.FP32, XNNPackRecipeType.FP32], - # pyre-ignore - "ios-arm64-coreml-fp16": [CoreMLRecipeType.FP16], - # pyre-ignore - "ios-arm64-coreml-int8": [CoreMLRecipeType.PT2E_INT8_STATIC], -} +IOS_CONFIGS: Dict[str, List[RecipeType]] = ( + { + # pyre-ignore + "ios-arm64-coreml-fp32": [CoreMLRecipeType.FP32, XNNPackRecipeType.FP32], + # pyre-ignore + "ios-arm64-coreml-fp16": [CoreMLRecipeType.FP16], + # pyre-ignore + "ios-arm64-coreml-int8": [CoreMLRecipeType.PT2E_INT8_STATIC], + } + if sys.platform != "win32" + else {} +) def _create_target_recipe( diff --git a/export/tests/test_target_recipes.py b/export/tests/test_target_recipes.py index d781ffea945..7a2a7c87342 100644 --- a/export/tests/test_target_recipes.py +++ b/export/tests/test_target_recipes.py @@ -7,10 +7,10 @@ # pyre-strict import logging +import sys import unittest import torch -from executorch.backends.apple.coreml.recipes import CoreMLRecipeProvider # pyre-ignore from executorch.backends.xnnpack.recipes.xnnpack_recipe_provider import ( XNNPACKRecipeProvider, ) @@ -18,6 +18,11 @@ from executorch.export.target_recipes import get_ios_recipe from executorch.runtime import Runtime +if sys.platform != "win32": + from executorch.backends.apple.coreml.recipes import ( # pyre-ignore + CoreMLRecipeProvider, + ) + class TestTargetRecipes(unittest.TestCase): """Test target recipes.""" @@ -26,12 +31,14 @@ def setUp(self) -> None: torch._dynamo.reset() super().setUp() recipe_registry.register_backend_recipe_provider(XNNPACKRecipeProvider()) - # pyre-ignore - recipe_registry.register_backend_recipe_provider(CoreMLRecipeProvider()) + if sys.platform != "win32": + # pyre-ignore + recipe_registry.register_backend_recipe_provider(CoreMLRecipeProvider()) def tearDown(self) -> None: super().tearDown() + @unittest.skipIf(sys.platform == "win32", "Core ML is not available on Windows.") def test_ios_fp32_recipe_with_xnnpack_fallback(self) -> None: # Linear ops skipped by coreml but handled by xnnpack class Model(torch.nn.Module): @@ -107,6 +114,7 @@ def forward(self, x, y): et_output = session.run_method("forward", example_inputs[0]) logging.info(f"et output {et_output}") + @unittest.skipIf(sys.platform == "win32", "Core ML is not available on Windows.") def test_ios_quant_recipes(self) -> None: class Model(torch.nn.Module): def __init__(self): diff --git a/extension/evalue_util/test/print_evalue_test.cpp b/extension/evalue_util/test/print_evalue_test.cpp index b881e55d8a8..242cb0af224 100644 --- a/extension/evalue_util/test/print_evalue_test.cpp +++ b/extension/evalue_util/test/print_evalue_test.cpp @@ -267,7 +267,7 @@ TEST(PrintEvalueTest, UnelidedBoolLists) { // case; the other scalar types use the same underlying code, so they don't // need to test this again. 
{ - EValue value(ArrayRef(list.data(), 0ul)); + EValue value(ArrayRef(list.data(), static_cast(0ul))); expect_output(value, "(len=0)[]"); } { @@ -419,7 +419,7 @@ TEST(PrintEvalueTest, UnelidedDoubleLists) { std::array list = {-2.2, -1, 0, INFINITY, NAN, 3.3}; { - EValue value(ArrayRef(list.data(), 0ul)); + EValue value(ArrayRef(list.data(), static_cast(0ul))); expect_output(value, "(len=0)[]"); } { diff --git a/extension/flat_tensor/test/CMakeLists.txt b/extension/flat_tensor/test/CMakeLists.txt index c3296dc61f3..fd3d6792f90 100644 --- a/extension/flat_tensor/test/CMakeLists.txt +++ b/extension/flat_tensor/test/CMakeLists.txt @@ -23,7 +23,7 @@ add_custom_command( "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul" - --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null + --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" WORKING_DIRECTORY ${EXECUTORCH_ROOT} ) diff --git a/extension/llm/runner/irunner.h b/extension/llm/runner/irunner.h index 5bd5ef9d04e..ef93f32319c 100644 --- a/extension/llm/runner/irunner.h +++ b/extension/llm/runner/irunner.h @@ -125,39 +125,18 @@ class ET_EXPERIMENTAL IRunner { std::function token_callback, std::function stats_callback) = 0; - /** - * Generate text based on the provided prompt and generation config, from a - * given position in KV cache. - * - * @param prompt The input prompt to generate from - * @param start_pos The starting position in KV cache of the input. Note: - * Depending on the actual implementation, a runner may manage the position - * internally, and this may not be respected. - * @param config Generation configuration parameters - * @param token_callback Callback function called for each generated token - * @param stats_callback Callback function for generation statistics - * @return Error::Ok if successful, an error otherwise - */ - virtual runtime::Error generate_from_pos( - const std::string& prompt, - int64_t start_pos, - const GenerationConfig& config, - std::function token_callback, - std::function stats_callback) = 0; /** * Stop the generation process. */ virtual void stop() = 0; + /** * Force remove prefilled tokens and reset KV cache start position * - * For some existing runners, overriding this method is not needed because - * start_pos is passed as an argument to generate_from_pos. - * * This method removes the prefilled tokens from the KV cache and resets the * start position to 0. */ - virtual void reset() {}; + virtual void reset() = 0; }; } // namespace llm diff --git a/extension/llm/runner/multimodal_decoder_runner.h b/extension/llm/runner/multimodal_decoder_runner.h index f76b8c64028..c8db3e57000 100644 --- a/extension/llm/runner/multimodal_decoder_runner.h +++ b/extension/llm/runner/multimodal_decoder_runner.h @@ -48,7 +48,7 @@ class ET_EXPERIMENTAL MultimodalDecoderRunner &start_pos, {1}, executorch::aten::ScalarType::Long); // run text model auto outputs_res = ET_UNWRAP( - module_->execute(kTextModelMethod, {start_pos_tensor, embeddings})); + module_->execute(kTextModelMethod, {embeddings, start_pos_tensor})); ET_CHECK_MSG( outputs_res.size() == 1, diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp index 1d9a0c8fdfc..2705a9eadff 100644 --- a/extension/llm/runner/multimodal_prefiller.cpp +++ b/extension/llm/runner/multimodal_prefiller.cpp @@ -91,24 +91,22 @@ Result MultimodalPrefiller::prefill( } // 2. Run decoder model for prefill. 
- // `cache_position` goes from start_pos to start_pos + encoder_output.size(1). - // e.g. if start_pos = 2 and encoder_output.size(1) = 5, - // cache_position_tensor should be [2, 3, 4, 5, 6]. + + // Get expected shape of cache position tensor, which should be the second + // argument + int64_t seq_len = encoder_output.toTensor().size(1); if (seq_len == 0) { ET_LOG(Error, "The encoder returned an empty output."); return ::executorch::runtime::Error::InvalidState; } - std::vector cache_positions(seq_len); - for (int64_t i = 0; i < seq_len; ++i) { - cache_positions[i] = start_pos + i; - } - auto cache_position_tensor = ::executorch::extension::from_blob( - cache_positions.data(), - {static_cast(seq_len)}, - executorch::aten::ScalarType::Long); + std::vector cache_positions; + + auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position( + module_, start_pos, cache_positions, seq_len, kTextModelMethod)); + auto prefill_result = module_->execute( - kTextModelMethod, {cache_position_tensor, encoder_output}); + kTextModelMethod, {encoder_output, cache_position_tensor}); if (prefill_result.error() != ::executorch::runtime::Error::Ok) { return prefill_result.error(); } diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 27c00c19089..7cd7623f58f 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -36,37 +36,11 @@ ::executorch::runtime::Result TextDecoderRunner::step( // If only 1 input, we are not using kv cache bool use_kv_cache = method_meta.num_inputs() > 1; + std::vector cache_positions; + if (use_kv_cache) { - // Size of the second argument. This could be either input_pos or - // cache_positions - - // Check if we are using cache positions instead of input pos. - auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1)); - // For input_pos, numel is 1, for cache_positions, numel is max_seq_len - auto sizes = second_input_info.sizes(); - // Assuming 1D tensor - ET_CHECK_OR_RETURN_ERROR( - sizes.size() == 1, - InvalidProgram, - "The second input tensor is not 1D tensor. Got dimension (%zu)", - sizes.size()); - auto numel = sizes[0]; - std::vector<::executorch::aten::SizesType> sizes_vec = {numel}; - - TensorPtr start_pos_tensor; - if (numel > 1) { - // If we are here, model is exported with cache_positions, create a tensor - // with the same length as input_ids. 
Assuming the last dimension is the - // one with the variable token length, for example [1, S] or [1, 1, S] - sizes_vec[sizes_vec.size() - 1] = tokens->numel(); - start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long); - torch::executor::native::arange_out_impl( - start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor); - } else { - // Assuming model is exported with input_pos, create a tensor with size 1 - start_pos_tensor = from_blob( - &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long); - } + auto start_pos_tensor = ET_UNWRAP(populate_start_pos_or_cache_position( + module_, start_pos, cache_positions, tokens->numel(), "forward")); std::vector inputs; auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor); diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index b6f41fd7af6..333716ac831 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -43,7 +43,8 @@ TextLLMRunner::TextLLMRunner( io_manager_(std::move(io_manager)), text_token_generator_(std::move(text_token_generator)), stats_(std::move(stats)), - temperature_(temperature) { + temperature_(temperature), + pos_(0) { // Note: This constructor assumes that text_prefiller and text_token_generator // already have references to the Module and TextDecoderRunner they need } @@ -70,9 +71,8 @@ Error TextLLMRunner::load() { ET_LOG(Info, format, __VA_ARGS__); \ } -Error TextLLMRunner::generate_from_pos( +Error TextLLMRunner::generate( const std::string& prompt, - ET_UNUSED int64_t start_pos, const GenerationConfig& config, std::function token_callback, std::function stats_callback) { @@ -217,15 +217,6 @@ Error TextLLMRunner::generate_from_pos( return Error::Ok; } -Error TextLLMRunner::generate( - const std::string& prompt, - const GenerationConfig& config, - std::function token_callback, - std::function stats_callback) { - pos_ = 0; - return generate_from_pos(prompt, 0, config, token_callback, stats_callback); -} - Error TextLLMRunner::warmup(const std::string& prompt, int32_t max_new_tokens) { // Create a GenerationConfig for warmup GenerationConfig config{ diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h index 21b77fe1dfa..9dd99d82d59 100644 --- a/extension/llm/runner/text_llm_runner.h +++ b/extension/llm/runner/text_llm_runner.h @@ -101,25 +101,6 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { std::function token_callback = {}, std::function stats_callback = {}) override; - /** - * Generate text based on the provided prompt and generation config, from a - * given position in KV cache. - * - * @param prompt The input prompt to generate from - * @param start_pos [Unused] The starting position in KV cache of the input, - * ignored because the runner manages the position internally. 
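With generate_from_pos() removed from IRunner and the KV-cache position now tracked inside the runner, a caller-side sketch of the new flow might look like the following. This is illustrative only: it assumes an already-loaded IRunner instance and the include path used elsewhere in the tree; run_two_turns and the prompt strings are hypothetical, not part of this patch.

#include <executorch/extension/llm/runner/irunner.h>  // assumed include path

#include <iostream>
#include <string>

using executorch::extension::llm::GenerationConfig;
using executorch::extension::llm::IRunner;
using executorch::runtime::Error;

// Hypothetical helper: drives two turns against any IRunner implementation.
Error run_two_turns(IRunner& runner) {
  GenerationConfig config;  // defaults; tune fields for real use
  // First turn: the runner advances its internal position as it prefills
  // and decodes, so no start_pos argument is passed.
  Error err = runner.generate(
      "Hello",
      config,
      [](const std::string& token) { std::cout << token; },
      /*stats_callback=*/{});
  if (err != Error::Ok) {
    return err;
  }
  // Second turn continues from the internally cached position.
  err = runner.generate(" Tell me more.", config, {}, {});
  // Starting a fresh conversation now goes through reset(), which every
  // IRunner must implement since the method became pure virtual.
  runner.reset();
  return err;
}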
- * @param config Generation configuration parameters - * @param token_callback Callback function called for each generated token - * @param stats_callback Callback function for generation statistics - * @return Error::Ok if successful, an error otherwise - */ - ET_DEPRECATED runtime::Error generate_from_pos( - const std::string& prompt, - ET_UNUSED int64_t start_pos, - const GenerationConfig& config, - std::function token_callback = {}, - std::function stats_callback = {}) override; - /** * @brief Warms up the model with a sample prompt * @@ -133,6 +114,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { ::executorch::runtime::Error warmup( const std::string& prompt, int32_t max_new_tokens); + /** * @brief Remove prefilled tokens and reset start position, and stats. * @@ -140,6 +122,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { * start position to 0. It also clears the stats for previous runs. */ void reset() override; + /** * @brief Stops the ongoing text generation process * diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index 0cb2463d163..5aff2c8a3b5 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -7,6 +7,9 @@ */ #pragma once +#include +#include +#include #include #include #include @@ -99,6 +102,48 @@ ET_EXPERIMENTAL size_t inline get_rss_bytes() { // when this changed. return 0; } + +// Returns the cache position tensor, which can be either a single start_pos +// (when the method_name [`text_decoder` or `forward`] expects a tensor with +// size 1 because model will populate the cache position tensor underneath), or +// a populated tensor for cache position, for the given start_pos and seq_len. +inline runtime::Result populate_start_pos_or_cache_position( + Module* module, + int64_t& start_pos, + std::vector& cache_positions_vec, + int seq_len, + const char* method_name = "forward") { + // Get expected shape of cache position tensor, which should be the second + // argument + auto method_meta = ET_UNWRAP(module->method_meta(method_name)); + auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1)); + auto second_input_sizes = second_input_info.sizes(); + auto numel = second_input_sizes[0]; + + for (int i = 0; i < second_input_sizes.size(); ++i) { + ET_LOG(Error, "second_input_sizes[%d] = %d", i, second_input_sizes[i]); + } + + TensorPtr start_pos_tensor; + if (numel > 1) { + // `cache_position` goes from start_pos to start_pos + + // encoder_output.size(1). e.g. if start_pos = 2 and encoder_output.size(1) + // = 5, cache_position_tensor should be [2, 3, 4, 5, 6]. + cache_positions_vec.resize(seq_len); + for (int64_t i = 0; i < seq_len; ++i) { + cache_positions_vec[i] = start_pos + i; + } + return ::executorch::extension::from_blob( + cache_positions_vec.data(), + {static_cast(seq_len)}, + executorch::aten::ScalarType::Long); + } else { + // Cache position is size 1. 
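As a plain illustration of the cache-position arithmetic the helper above implements (the comment's example: start_pos = 2 and seq_len = 5 give [2, 3, 4, 5, 6]), the standalone snippet below mirrors the loop without any ExecuTorch types; it is a sketch for exposition only.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  int64_t start_pos = 2;
  int64_t seq_len = 5;  // e.g. encoder_output.size(1)
  std::vector<int64_t> cache_positions(seq_len);
  for (int64_t i = 0; i < seq_len; ++i) {
    cache_positions[i] = start_pos + i;
  }
  // Prints: 2 3 4 5 6 -- the values the helper wraps in a Long tensor when
  // the method's second input expects a full cache_position (numel > 1).
  for (int64_t p : cache_positions) {
    std::cout << p << ' ';
  }
  std::cout << '\n';
}

The helper only takes this path when the second input has numel > 1; otherwise it wraps the scalar start_pos in a length-1 tensor, as the else branch just below shows.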
+ return ::executorch::extension::from_blob( + &start_pos, {1}, executorch::aten::ScalarType::Long); + } +} + } // namespace llm } // namespace extension } // namespace executorch diff --git a/extension/module/test/CMakeLists.txt b/extension/module/test/CMakeLists.txt index 964b810eed5..1c4358dd73e 100644 --- a/extension/module/test/CMakeLists.txt +++ b/extension/module/test/CMakeLists.txt @@ -24,10 +24,10 @@ add_custom_command( "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules - "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null + "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul" - --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null + --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" WORKING_DIRECTORY ${EXECUTORCH_ROOT} ) diff --git a/extension/runner_util/test/CMakeLists.txt b/extension/runner_util/test/CMakeLists.txt index 0cca06178cd..44b85a7fced 100644 --- a/extension/runner_util/test/CMakeLists.txt +++ b/extension/runner_util/test/CMakeLists.txt @@ -20,7 +20,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules - "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null + "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" WORKING_DIRECTORY ${EXECUTORCH_ROOT} ) diff --git a/extension/testing_util/temp_file.h b/extension/testing_util/temp_file.h index aa8f5bcc82e..4edaf2135d8 100644 --- a/extension/testing_util/temp_file.h +++ b/extension/testing_util/temp_file.h @@ -9,13 +9,11 @@ #pragma once #include +#include #include #include #include // open() -#include // tmpnam(), remove() -#include // write(), close() - #include namespace executorch { @@ -72,19 +70,13 @@ class TempFile { } // Write the contents to the file. - int fd = open( - path.c_str(), - // O_EXCL ensures that we are the ones creating this file, to help - // protect against race conditions. - O_CREAT | O_EXCL | O_RDWR, - // User can read and write, group can read. - S_IRUSR | S_IWUSR | S_IRGRP); - ASSERT_GE(fd, 0) << "open(" << path << ") failed: " << strerror(errno); - - ssize_t nwrite = write(fd, data, size); - ASSERT_EQ(nwrite, size) << "Failed to write " << size << " bytes (wrote " - << nwrite << "): " << strerror(errno); - close(fd); + std::ofstream file(path, std::ios::out | std::ios::binary); + ASSERT_TRUE(file.is_open()) + << "open(" << path << ") failed: " << strerror(errno); + + file.write((const char*)data, size); + ASSERT_TRUE(file.good()) + << "Failed to write " << size << " bytes: " << strerror(errno); *out_path = path; } diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index 5bb647d3a09..a6c06e84293 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -20,16 +20,6 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -# Threadpool size specifiers. Mutual exclusion is checking in default.cmake. -# Default to using performance cores if -# EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES isn't set. 
-set(_threadpool_size_flag) -if(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) - set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES") -else() - set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES") -endif() - add_library( extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp cpuinfo_utils.cpp @@ -46,9 +36,7 @@ target_include_directories( $ $ ) -target_compile_definitions( - extension_threadpool PUBLIC ET_USE_THREADPOOL ${_threadpool_size_flag} -) +target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL) target_compile_options(extension_threadpool PUBLIC ${_common_compile_options}) # Install libraries diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 1889cb650ad..6ef55c42434 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -22,7 +22,6 @@ def define_common_targets(): name = "threadpool_lib", srcs = _THREADPOOL_SRCS, deps = [ - ":cpuinfo_utils", "//executorch/runtime/core:core", "//executorch/runtime/core/portable_type/c10/c10:c10", ], diff --git a/extension/threadpool/test/threadpool_test.cpp b/extension/threadpool/test/threadpool_test.cpp index 052e6c22f5e..e7784d3cc11 100644 --- a/extension/threadpool/test/threadpool_test.cpp +++ b/extension/threadpool/test/threadpool_test.cpp @@ -7,7 +7,6 @@ */ #include -#include #include #include @@ -72,8 +71,6 @@ void run_lambda_with_size( } // namespace TEST(ThreadPoolTest, ParallelAdd) { - executorch::runtime::runtime_init(); - std::vector a, b, c, c_ref; size_t vector_size = 100; size_t grain_size = 10; @@ -114,8 +111,6 @@ TEST(ThreadPoolTest, ParallelAdd) { // Test parallel reduction where we acquire lock within lambda TEST(ThreadPoolTest, ParallelReduce) { - executorch::runtime::runtime_init(); - std::vector a; int32_t c = 0, c_ref = 0; size_t vector_size = 100; @@ -149,8 +144,6 @@ TEST(ThreadPoolTest, ParallelReduce) { // Copied from // caffe2/aten/src/ATen/test/test_thread_pool_guard.cp TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) { - executorch::runtime::runtime_init(); - auto threadpool_ptr = ::executorch::extension::threadpool::get_pthreadpool(); ASSERT_NE(threadpool_ptr, nullptr); @@ -180,8 +173,6 @@ TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) { } TEST(TestNoThreadPoolGuard, TestRunWithGuard) { - executorch::runtime::runtime_init(); - const std::vector array = {1, 2, 3}; auto pool = ::executorch::extension::threadpool::get_threadpool(); diff --git a/extension/threadpool/threadpool.cpp b/extension/threadpool/threadpool.cpp index 72265e4cf07..5fee732b053 100644 --- a/extension/threadpool/threadpool.cpp +++ b/extension/threadpool/threadpool.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include @@ -15,26 +14,9 @@ #include #include -#include #include -// At most one mode should be set. -#if ( \ - defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \ - defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES)) -#error Multiple \ - threadpool size specifiers are set.At most one of \ - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES, \ - and EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES may be defined. -#endif - -// Default to EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES if no mode is set. 
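With the size-specifier macros gone, the replacement code later in this hunk sizes the pool directly from cpuinfo's logical-processor count. A minimal standalone sketch of that query, assuming the cpuinfo library is linked; this is not ExecuTorch code:

#include <cpuinfo.h>
#include <cstdio>

int main() {
  // cpuinfo must be initialized before any query; this mirrors the check
  // kept in get_threadpool().
  if (!cpuinfo_initialize()) {
    std::fprintf(stderr, "cpuinfo initialization failed\n");
    return 1;
  }
  // Number of logical processors; the threadpool now uses this directly
  // instead of the performance-core count from cpuinfo_utils.
  std::printf("logical processors: %u\n", cpuinfo_get_processors_count());
  return 0;
}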
-#if !defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \ - !defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES) -#define EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES 1 -#endif - namespace executorch::extension::threadpool { #if !(defined(WIN32)) @@ -114,25 +96,12 @@ void ThreadPool::run( // get_threadpool is not thread safe due to leak_corrupted_threadpool // Make this part threadsafe: TODO(kimishpatel) ThreadPool* get_threadpool() { - executorch::runtime::runtime_init(); - if (!cpuinfo_initialize()) { ET_LOG(Error, "cpuinfo initialization failed"); return nullptr; // NOLINT(facebook-hte-NullableReturn) } - // Choose the number of threads according to the EXECUTORCH_THREADPOOL_ - // options. See the description in threadpool.h. - -#if defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) - // Use threads=cores. - static int num_threads = cpuinfo_get_processors_count(); -#else - // Set threads equal to the number of performance cores. - static int num_threads = - ::executorch::extension::cpuinfo::get_num_performant_cores(); -#endif - + int num_threads = cpuinfo_get_processors_count(); /* * For llvm-tsan, holding limit for the number of locks for a single thread * is 63 (because of comparison < 64 instead of <=). pthreadpool's worst diff --git a/extension/threadpool/threadpool.h b/extension/threadpool/threadpool.h index 16acad6e5fa..3ad2d1d48d4 100644 --- a/extension/threadpool/threadpool.h +++ b/extension/threadpool/threadpool.h @@ -14,22 +14,6 @@ #include -/* - * Threadpool Options: - * - * Threadpool size has a sizble affect on performance. By default, the - * threadpool will be sized according to the number of performance cores. This - * behavior can be overriden with the following build-time options. Note that - * these options are mutually exclusive. - * - * - EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES (flag) - Sizes the threadpool - * equal to the number of performance cores on the system. This is the default - * behavior. - * - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES (flag) - Sizes the threadpool - * equal to the number of logical cores on system. This is the historical - * behavior. - */ - namespace executorch::extension::threadpool { class ThreadPool final { diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp index e9a561366f7..a48c152133b 100644 --- a/kernels/portable/cpu/op_argmax.cpp +++ b/kernels/portable/cpu/op_argmax.cpp @@ -49,13 +49,13 @@ Tensor& argmax_out( static constexpr const char op_name[] = "argmax.out"; ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { - long* out_data = out.mutable_data_ptr(); + int64_t* out_data = out.mutable_data_ptr(); const bool success = parallel_for_each_reduce_over_dim_output_index( in, dim, out, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) { // the below condition as written is equivalent to // !isnan(accval) && (isnan(v) || v > acc_val). See // argument in op_argmin.cpp. 
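Context for the long to int64_t substitutions in this and the following portable kernels: index tensors use ScalarType::Long, which is 64-bit, while long itself is only 32 bits on LLP64 targets such as 64-bit Windows with MSVC. A small standalone check (not part of the patch) makes the width difference visible:

#include <cstdint>
#include <iostream>

int main() {
  // On LP64 platforms (Linux, macOS) both are typically 8 bytes, so using
  // long for index data happened to work; on LLP64 (Windows/MSVC)
  // sizeof(long) is 4 and no longer matches the 64-bit index elements.
  std::cout << "sizeof(long)    = " << sizeof(long) << '\n';
  std::cout << "sizeof(int64_t) = " << sizeof(int64_t) << '\n';
  static_assert(sizeof(int64_t) == 8, "int64_t is always 64-bit");
}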
@@ -63,7 +63,7 @@ Tensor& argmax_out( acc_val = v; acc_ix = ix; } - return std::tuple{acc_val, acc_ix}; + return std::tuple{acc_val, acc_ix}; }, in, dim, diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index fda9463c5ee..55f2f82b04b 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -49,13 +49,13 @@ Tensor& argmin_out( static constexpr const char op_name[] = "argmin.out"; ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { - long* out_data = out.mutable_data_ptr(); + int64_t* out_data = out.mutable_data_ptr(); const bool success = parallel_for_each_reduce_over_dim_output_index( in, dim, out, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) { // the below condition as written is equivalent to // !isnan(accval) && (isnan(v) || v < acc_val). cases: // - if neither acc_val nor v is NaN, !(v >= acc_val) is @@ -70,7 +70,7 @@ Tensor& argmin_out( acc_val = v; acc_ix = ix; } - return std::tuple{acc_val, acc_ix}; + return std::tuple{acc_val, acc_ix}; }, in, dim, diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index b3aa41cda85..8ac78fd5477 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -45,9 +45,9 @@ ET_NODISCARD bool check_bounds( static constexpr const char op_name[] = "clamp.out"; if (isIntegralType(out_type, /*includeBool=*/false)) { - const long val_long = utils::scalar_to(val_scalar); + const int64_t val_long = utils::scalar_to(val_scalar); ET_SWITCH_INT_TYPES(out_type, ctx, op_name, CTYPE_OUT, [&]() { - if (is_out_of_bounds(val_long)) { + if (is_out_of_bounds(val_long)) { ET_LOG(Error, "%s value out of bounds", val_name); is_valid = false; } diff --git a/kernels/portable/cpu/op_gather.cpp b/kernels/portable/cpu/op_gather.cpp index 9899c21a94e..02ea502ca63 100644 --- a/kernels/portable/cpu/op_gather.cpp +++ b/kernels/portable/cpu/op_gather.cpp @@ -30,7 +30,7 @@ void gather_helper( Tensor& out, int64_t dim) { const CTYPE* in_data = in.const_data_ptr(); - const long* index_data = index.const_data_ptr(); + const int64_t* index_data = index.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); if (index.dim() == 0) { diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp index 7df93470d39..467c8ccffd5 100644 --- a/kernels/portable/cpu/op_max.cpp +++ b/kernels/portable/cpu/op_max.cpp @@ -82,19 +82,19 @@ std::tuple max_out( ET_SWITCH_REALHBBF16_TYPES( in.scalar_type(), ctx, "max.dim_max", CTYPE, [&]() { CTYPE* max_data = max.mutable_data_ptr(); - long* max_indices_data = max_indices.mutable_data_ptr(); + int64_t* max_indices_data = max_indices.mutable_data_ptr(); const bool success = parallel_for_each_reduce_over_dim_output_index( in, dim, max, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) { if (!utils::isnan_override(acc_val) && (utils::isnan_override(v) || v > acc_val)) { acc_val = v; acc_ix = ix; } - return std::tuple{acc_val, acc_ix}; + return std::tuple{acc_val, acc_ix}; }, in, dim, diff --git a/kernels/portable/cpu/op_min.cpp 
b/kernels/portable/cpu/op_min.cpp index a4cd1be2067..304321bb9f8 100644 --- a/kernels/portable/cpu/op_min.cpp +++ b/kernels/portable/cpu/op_min.cpp @@ -82,19 +82,19 @@ std::tuple min_out( ET_SWITCH_REALHBBF16_TYPES( in.scalar_type(), ctx, "min.dim_min", CTYPE, [&]() { CTYPE* min_data = min.mutable_data_ptr(); - long* min_indices_data = min_indices.mutable_data_ptr(); + int64_t* min_indices_data = min_indices.mutable_data_ptr(); const bool success = parallel_for_each_reduce_over_dim_output_index( in, dim, min, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) { if (!utils::isnan_override(acc_val) && (utils::isnan_override(v) || v < acc_val)) { acc_val = v; acc_ix = ix; } - return std::tuple{acc_val, acc_ix}; + return std::tuple{acc_val, acc_ix}; }, in, dim, diff --git a/kernels/portable/cpu/op_scatter.cpp b/kernels/portable/cpu/op_scatter.cpp index 58341cefb1e..42d40c8284d 100644 --- a/kernels/portable/cpu/op_scatter.cpp +++ b/kernels/portable/cpu/op_scatter.cpp @@ -32,7 +32,7 @@ void scatter_src_helper( const Tensor& src, Tensor& out) { const CTYPE* in_data = in.const_data_ptr(); - const long* index_data = index.const_data_ptr(); + const int64_t* index_data = index.const_data_ptr(); const CTYPE* src_data = src.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); @@ -72,7 +72,7 @@ void scatter_value_helper( CTYPE_VAL val, Tensor& out) { const CTYPE* in_data = in.const_data_ptr(); - const long* index_data = index.const_data_ptr(); + const int64_t* index_data = index.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); memcpy(out_data, in_data, in.nbytes()); diff --git a/kernels/portable/cpu/op_scatter_add.cpp b/kernels/portable/cpu/op_scatter_add.cpp index f9c1f7677b6..690c31342a9 100644 --- a/kernels/portable/cpu/op_scatter_add.cpp +++ b/kernels/portable/cpu/op_scatter_add.cpp @@ -23,7 +23,7 @@ namespace { template void scatter_add_helper( const CTYPE* src_data, - const long* index_data, + const int64_t* index_data, CTYPE* out_data, const Tensor& src, const Tensor& index, @@ -81,7 +81,7 @@ Tensor& scatter_add_out( ET_SWITCH_REALHBBF16_TYPES(self_type, ctx, "scatter_add.out", CTYPE, [&]() { const CTYPE* self_data = self.const_data_ptr(); - const long* index_data = index.const_data_ptr(); + const int64_t* index_data = index.const_data_ptr(); const CTYPE* src_data = src.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index e2143ce78d5..bdea02f83bc 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -79,7 +79,7 @@ void perform_topk( elem_t* queue) { const CTYPE* const in_data = in.const_data_ptr(); CTYPE* values_data = values.mutable_data_ptr(); - long* indices_data = indices.mutable_data_ptr(); + int64_t* indices_data = indices.mutable_data_ptr(); if (in.dim() == 0) { values_data[0] = in_data[0]; diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 0304d751455..2e488b109c1 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -26,8 +26,8 @@ foreach(kernel ${_kernels}) set(_functions_include "#include ") add_custom_command( OUTPUT "${_wrapper_path}" - COMMAND mkdir -p ${_wrapper_dir} - COMMAND echo ${_functions_include} > "${_wrapper_path}" + COMMAND ${CMAKE_COMMAND} -E make_directory 
${_wrapper_dir} + COMMAND ${CMAKE_COMMAND} -E echo ${_functions_include} > "${_wrapper_path}" DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h" @@ -44,7 +44,7 @@ foreach(kernel ${_kernels}) add_custom_command( OUTPUT "${_wrapper_dir}/supported_features.cpp" "${_wrapper_dir}/supported_features.h" - COMMAND mkdir -p ${_wrapper_dir} + COMMAND ${CMAKE_COMMAND} -E make_directory ${_wrapper_dir} COMMAND ${PYTHON_EXECUTABLE} kernels/test/gen_supported_features.py kernels/${_supported_features_kernel}/test/supported_features_def.yaml > @@ -73,17 +73,35 @@ foreach(kernel ${_kernels}) "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib" ) endif() + + # Copy with glob needs to be handle in a platform-specific manner. + if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + # The quoting here is complicated, because there are three levels of + # interpretation: CMake -> Batch -> Powershell. The invoked (batch) command + # should look like `powershell -Command "Copy-Item ... -Path \"...\" ...". + # Powershell sees `Copy-Item -Path "..." ...`. + set(_copy_headers_cmd + powershell + -Command + "Copy-Item -Path \\\"${_kernel_ops_lib_path}/*.h\\\" -Destination \\\"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/\\\"" + ) + else() + set(_copy_headers_cmd + cp + "${_kernel_ops_lib_path}/*.h" + "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" + ) + endif() + add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/RegisterKernels.h" COMMAND - mkdir -p - "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" - COMMAND - cp "${_kernel_ops_lib_path}/*.h" + ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" + COMMAND ${_copy_headers_cmd} DEPENDS ${_kernel_ops_lib} ) endforeach() diff --git a/kernels/test/export_test_model.ps1 b/kernels/test/export_test_model.ps1 new file mode 100644 index 00000000000..d19e2a713d9 --- /dev/null +++ b/kernels/test/export_test_model.ps1 @@ -0,0 +1,24 @@ +param ( + [string]$Modules, + [string]$OutDir, + [string]$CondaEnv +) + +Set-PSDebug -Trace 1 + +# Activate the VS dev environment - needed for dynamo. Try to use vswhere to locate the install. If not, +# fall back to a reasonable guess for the build tools, which also happens to match the CLI setup. 
+$vswherePath = "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" +if (Test-Path $vswherePath) { + $vsInstallPath = & $vswherePath -latest -property installationPath +} else { + $vsInstallPath = "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\" +} + +& "$vsInstallPath\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64 -SkipAutomaticLocation + +conda activate $CondaEnv + +$Modules = $Modules.Replace(" ", ",") +echo "Modules: $Modules" +python -m test.models.export_program --modules "$Modules" --outdir "$OutDir" diff --git a/pytest-windows.ini b/pytest-windows.ini index 0eb30e3583d..0959318afdd 100644 --- a/pytest-windows.ini +++ b/pytest-windows.ini @@ -100,6 +100,7 @@ addopts = #extension/llm/export --deselect=extension/pybindings/test/test_pybindings.py::PybindingsTest::test_method_quantized_ops --deselect=extension/pybindings/test/test_pybindings.py::PybindingsTest::test_quantized_ops + --deselect=extension/pybindings/test/test_pybindings.py::PybindingsTest::test_program_data_separation --deselect=runtime/test/test_runtime.py::RuntimeTest::test_load_program_with_path --deselect=exir/backend/test/test_compatibility.py::TestCompatibility::test_compatibility_in_runtime --deselect=exir/backend/test/test_compatibility.py::TestCompatibility::test_compatibility_in_runtime_edge_program_manager @@ -108,6 +109,7 @@ addopts = --deselect=extension/llm/custom_ops/test_sdpa_with_kv_cache.py::SDPATestForSpeculativeDecode::test_sdpa_with_cache_seq_len_130 --deselect=devtools/inspector/tests/inspector_test.py::TestInspector::test_etrecord_populates_correct_edge_dialect_aot_intermediate_outputs --deselect=devtools/inspector/tests/inspector_test.py::TestInspector::test_etrecord_populates_correct_export_program_aot_intermediate_outputs + --deselect=runtime/test/test_runtime_etdump_gen.py::RuntimeETDumpGenTest::test_etdump_generation # run the same tests multiple times to determine their # flakiness status. Default to 50 re-runs diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index 35ddbe8ac15..218a64cf9dd 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -28,6 +28,43 @@ namespace testing { namespace { +/** + * Returns true if the two elements are close according to the description on + * `tensors_are_close()`. + * + * T must be a floating point type. Non-floating point data should be compared + * directly. + */ +template +bool element_is_close(const T a, const T b, double rtol, double atol) { + if constexpr (c10::is_reduced_floating_point_v) { + // MSVC complains about ambiguous overloads, so explicitly cast to float to + // compare. + return element_is_close( + static_cast(a), static_cast(b), rtol, atol); + } else { + if (std::isnan(a) && std::isnan(b)) { + // NaN == NaN + } else if (!std::isfinite(a) && !std::isfinite(b) && ((a > 0) == (b > 0))) { + // -Inf == -Inf + // +Inf == +Inf + } else if (rtol == 0 && atol == 0) { + // Exact comparison; avoid unnecessary math. + if (a != b) { + return false; + } + } else { + auto allowed_error = atol + std::abs(rtol * b); + auto actual_error = std::abs(a - b); + if (!std::isfinite(actual_error) || actual_error > allowed_error) { + return false; + } + } + + return true; + } +} + /** * Returns true if the two arrays are close according to the description on * `tensors_are_close()`. 
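For exposition, the closeness rule the new element_is_close helper applies (NaN equals NaN, matching infinities are equal, otherwise |a - b| <= atol + |rtol * b|) can be sketched in isolation. This is a simplified double-only version, not the library code:

#include <cmath>
#include <cstdio>

// Same shape as the helper above, but limited to double for brevity.
bool is_close(double a, double b, double rtol, double atol) {
  if (std::isnan(a) && std::isnan(b)) {
    return true;  // NaN compares equal to NaN here, unlike operator==.
  }
  if (!std::isfinite(a) && !std::isfinite(b) && ((a > 0) == (b > 0))) {
    return true;  // +Inf == +Inf, -Inf == -Inf.
  }
  if (rtol == 0 && atol == 0) {
    return a == b;  // Exact comparison requested; skip the math.
  }
  const double allowed_error = atol + std::abs(rtol * b);
  const double actual_error = std::abs(a - b);
  return std::isfinite(actual_error) && actual_error <= allowed_error;
}

int main() {
  std::printf("%d\n", is_close(1.0001, 1.0, 1e-3, 0.0));  // 1: within rtol
  std::printf("%d\n", is_close(NAN, NAN, 0.0, 0.0));      // 1: NaN == NaN
  std::printf("%d\n", is_close(1.0, 2.0, 1e-3, 0.0));     // 0: too far apart
}

In the real helper, reduced-precision element types (Half/BFloat16) are first cast to float, sidestepping the MSVC ambiguous-overload issue noted in its comment.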
@@ -55,23 +92,8 @@ bool data_is_close( const auto ai = a[i]; const auto bi = b[i]; - if (std::isnan(ai) && std::isnan(bi)) { - // NaN == NaN - } else if ( - !std::isfinite(ai) && !std::isfinite(bi) && ((ai > 0) == (bi > 0))) { - // -Inf == -Inf - // +Inf == +Inf - } else if (rtol == 0 && atol == 0) { - // Exact comparison; avoid unnecessary math. - if (ai != bi) { - return false; - } - } else { - auto allowed_error = atol + std::abs(rtol * bi); - auto actual_error = std::abs(ai - bi); - if (!std::isfinite(actual_error) || actual_error > allowed_error) { - return false; - } + if (!element_is_close(ai, bi, rtol, atol)) { + return false; } } return true; diff --git a/runtime/executor/test/CMakeLists.txt b/runtime/executor/test/CMakeLists.txt index d8df1f9ea56..05d149ab1b4 100644 --- a/runtime/executor/test/CMakeLists.txt +++ b/runtime/executor/test/CMakeLists.txt @@ -17,6 +17,31 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) +if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + # Use a wrapper script to set up the environment for MSVC to make Dynamo + # export work. + set(_export_program_cmd + powershell + ${EXECUTORCH_ROOT}/kernels/test/export_test_model.ps1 + -Modules + "\"ModuleAdd,ModuleAddHalf,ModuleAddMul,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful\"" + -outDir + "${CMAKE_CURRENT_BINARY_DIR}" + -CondaEnv + $ENV{CONDA_DEFAULT_ENV} + ) +else() + set(_export_program_cmd + ${PYTHON_EXECUTABLE} + -m + test.models.export_program + --modules + "ModuleAdd,ModuleAddHalf,ModuleAddMul,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful" + --outdir + "${CMAKE_CURRENT_BINARY_DIR}" + ) +endif() + add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddHalf.pte" @@ -29,17 +54,14 @@ add_custom_command( "${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleStateful.pte" "${CMAKE_CURRENT_BINARY_DIR}/delegated/ModuleAddMul.pte" - COMMAND - ${PYTHON_EXECUTABLE} -m test.models.export_program --modules - "ModuleAdd,ModuleAddHalf,ModuleAddMul,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful" - --outdir "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${_export_program_cmd} COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul" --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_delegated_program --modules "ModuleAddMul" --backend_id "StubBackend" --outdir - "${CMAKE_CURRENT_BINARY_DIR}/delegated/" || true + "${CMAKE_CURRENT_BINARY_DIR}/delegated/" WORKING_DIRECTORY ${EXECUTORCH_ROOT} ) diff --git a/runtime/kernel/test/CMakeLists.txt b/runtime/kernel/test/CMakeLists.txt index c70ec5d135b..a8166017e53 100644 --- a/runtime/kernel/test/CMakeLists.txt +++ b/runtime/kernel/test/CMakeLists.txt @@ -39,12 +39,6 @@ add_test(kernel_runtime_context_test kernel_runtime_context_test) add_executable( operator_registry_max_kernel_num_test operator_registry_max_kernel_num_test.cpp - ../operator_registry.cpp - ../../core/evalue.cpp - ../../platform/abort.cpp - ../../platform/log.cpp - ../../platform/runtime.cpp - ../../platform/default/posix.cpp ) target_link_libraries( operator_registry_max_kernel_num_test GTest::gtest GTest::gtest_main diff --git a/test/end2end/exported_module.py b/test/end2end/exported_module.py index e5630b8e89f..750b9097335 100644 --- 
a/test/end2end/exported_module.py +++ b/test/end2end/exported_module.py @@ -187,7 +187,6 @@ def __init__(self, method): if method_name_to_dynamic_shapes else None ), - strict=True, ) exec_prog = to_edge( diff --git a/test/models/export_delegated_program.py b/test/models/export_delegated_program.py index 8f7c388d7ad..98f4b0b9b36 100644 --- a/test/models/export_delegated_program.py +++ b/test/models/export_delegated_program.py @@ -155,9 +155,9 @@ def forward(self, *args, **kwargs): if method_name != "forward": # Only require wrapper module if we're exporting a specific method other than forward. - exported_program = export(WrapperModule(eager_module), args=inputs, strict=True) + exported_program = export(WrapperModule(eager_module), args=inputs) else: - exported_program = export(eager_module, args=inputs, strict=True) + exported_program = export(eager_module, args=inputs) edge_config = EdgeCompileConfig(_check_ir_validity=False) et_config = exir.ExecutorchBackendConfig( @@ -178,7 +178,7 @@ def forward(self, *args, **kwargs): module=tagged_module, gen_tag_fn=lambda x: module_class.__name__, ) - exported_program = export(tagged_module, args=inputs, strict=True) + exported_program = export(tagged_module, args=inputs) executorch_program = to_edge_transform_and_lower( exported_program, compile_config=edge_config, @@ -205,7 +205,7 @@ def forward(self, *args, **kwargs): composite_module(*inputs) executorch_program = to_edge( - export(composite_module, args=inputs, strict=True) + export(composite_module, args=inputs) ).to_executorch(config=et_config) return executorch_program diff --git a/tools/cmake/Utils.cmake b/tools/cmake/Utils.cmake index 1e0671eb920..77918ebbf2e 100644 --- a/tools/cmake/Utils.cmake +++ b/tools/cmake/Utils.cmake @@ -62,6 +62,8 @@ endfunction() function(target_link_options_gc_sections target_name) if(APPLE) target_link_options(${target_name} PRIVATE "LINKER:-dead_strip") + elseif(WIN32) + target_link_options(${target_name} PRIVATE "LINKER:/OPT:REF") else() target_link_options(${target_name} PRIVATE "LINKER:--gc-sections") endif() diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index 16f4245f6bc..fb0dc0a4ade 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -176,36 +176,6 @@ define_overridable_option( ${_default_executorch_build_cpuinfo} ) -# Threadpool size options. At most one can be specified. Note that the default -# is managed in threadpool.cpp to allow the user to specify an alternate mode -# without needing to explicitly set the default to off. -define_overridable_option( - EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES - "Set the number of threads used for CPU parallel computation equal to the number of performant CPU cores." - BOOL - OFF -) -define_overridable_option( - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES - "Set the number of threads used for CPU parallel computation equal to the number of logical CPU cores." 
- BOOL - OFF -) - -check_required_options_on( - IF_ON EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES REQUIRES - EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO -) -check_required_options_on( - IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES REQUIRES - EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO -) - -check_conflicting_options_on( - IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES CONFLICTS_WITH - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES -) - # TODO(jathu): move this to platform specific presets when created set(_default_executorch_build_executor_runner ON) if(APPLE AND "${SDK_NAME}" STREQUAL "iphoneos")