Merged
Changes from all commits
33 commits
ed0ab94
remove ET_LOG from kernel temp allocator
zonglinpeng Sep 12, 2025
1454743
Measure model latency when input list is not provided
limintang Sep 13, 2025
a77c8df
Remove generate_from_pos since there's no user of it (#14277)
larryliu0820 Sep 13, 2025
23acfea
Swap Llava export arg order (#14238)
jackzhxng Sep 13, 2025
2faf5ce
Update Voxtral README.md (#14283)
jackzhxng Sep 13, 2025
888f84d
update pt2e example code (#14284)
cccclai Sep 13, 2025
3ce51ab
[tosa-tools] Remove v0.80 usage from ET (#14291)
pytorchbot Sep 13, 2025
6e0d2c9
[tosa-tools] Add top level targets for tosa, serializer (#14292)
pytorchbot Sep 13, 2025
6789f75
[ethos-u-vela==4.4.0._git_8278f09] Update to vela 4.4.0 (#14293)
pytorchbot Sep 13, 2025
09f5beb
Add support for fusing Conv+ReLU
mcremon-meta Sep 13, 2025
b5523cd
pyre-fix
metascroy Sep 13, 2025
897b0d5
Reapply "[Windows] Run native unit tests in CI (#13923)"
GregoryComer Sep 13, 2025
0b3227f
Add support for conv1d
ethansfng Sep 13, 2025
0b4fe31
Support qwen phi gemma whisper (#14294)
neuropilot-captain Sep 13, 2025
79c8e49
Remove non-per-tensor quantized add and replace with per-tensor variant
DrJessop Sep 14, 2025
e9903b8
Add int8/uint8 specialized variants of quantized_add_per_tensor
DrJessop Sep 14, 2025
eec95d0
Support custom quantized_matmul + variants
DrJessop Sep 14, 2025
59008b5
Arm backend: Fix incorrect tag name (#14298)
mansnils Sep 15, 2025
dfd7f2a
NXP backend: Move optimization in keep_one_empty_buffer.py to Model b…
roman-janik-nxp Sep 15, 2025
957915f
Arm Backend: Add visualization script for Arm models (#14257)
jmahbs Sep 15, 2025
41c2a62
Add -x to test_torchao_huggingface_checkpoints.sh (#14232)
swolchok Sep 15, 2025
365cab3
Clean up test type dispatches (#14228)
ethansfng Sep 15, 2025
76692f5
Arm backend: Add --enable_debug_mode to AOT compiler (Try 2)
mergennachin Sep 15, 2025
7edb278
Support qwen phi gemma whisper (#14296)
neuropilot-captain Sep 15, 2025
eaad1c2
Arm backend: Add INT16 support to rescale operation (#14301)
lucylq Sep 15, 2025
0447ebd
swap arg init in text_llm_runner (#14304)
lucylq Sep 15, 2025
0f066e0
Arm backend: use ArmCompileSpec in backend
Erik-Lundell Sep 15, 2025
750cba7
Revert "Add EXECUTORCH_THREADPOOL_SIZE options, default to u… (#14307)
GregoryComer Sep 15, 2025
30a904b
Arm backend: Add support for ETDump of outputs
zingo Sep 15, 2025
5a1c117
Temporarily disable OpenVINO CI job (#14315)
GregoryComer Sep 15, 2025
ea4f004
Update cache position population and arg order for multimodal runner …
kirklandsign Sep 15, 2025
4a4f5a0
custom fbcode serialize stage to run FVP internally on arm ops tests …
3l1 Sep 15, 2025
e0dda90
[Backend Tester] Skip in-place activation tests due to lack of suppor…
GregoryComer Sep 16, 2025
2 changes: 1 addition & 1 deletion .ci/scripts/setup-windows.ps1
@@ -1,5 +1,5 @@
param (
[string]$editable = $false
[string]$editable = "false"
)

conda create --yes --quiet -n et python=3.12
2 changes: 1 addition & 1 deletion .ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
set -euo pipefail
set -euxo pipefail

# -------------------------
# Args / flags
33 changes: 28 additions & 5 deletions .ci/scripts/unittest-windows.ps1
@@ -1,15 +1,38 @@
param (
[string]$editable = $false
[string]$buildMode = "Release"
)

Set-PSDebug -Trace 1
$ErrorActionPreference = 'Stop'
$PSNativeCommandUseErrorActionPreference = $true

# Run pytest with coverage
# pytest -n auto --cov=./ --cov-report=xml
pytest -v --full-trace -c pytest-windows.ini
# Run native unit tests (via ctest)
New-Item -Path "test-build" -ItemType Directory
cd "test-build"

cmake .. --preset windows -B . -DEXECUTORCH_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=$buildMode
if ($LASTEXITCODE -ne 0) {
Write-Host "Pytest invocation was unsuccessful. Exit code: $LASTEXITCODE."
Write-Host "CMake configuration was unsuccessful. Exit code: $LASTEXITCODE."
exit $LASTEXITCODE
}

cmake --build . -j8 --config $buildMode --verbose
if ($LASTEXITCODE -ne 0) {
Write-Host "CMake build was unsuccessful. Exit code: $LASTEXITCODE."
exit $LASTEXITCODE
}

ctest -j8 . --build-config $buildMode --output-on-failure -E "method_test|tensor_parser_test"
if ($LASTEXITCODE -ne 0) {
Write-Host "CTest run was unsuccessful. Exit code: $LASTEXITCODE."
exit $LASTEXITCODE
}

cd ..

# Run pytest
pytest -v -c pytest-windows.ini
if ($LASTEXITCODE -ne 0) {
Write-Host "Pytest invocation was unsuccessful. Exit code: $LASTEXITCODE."
exit $LASTEXITCODE
}
12 changes: 10 additions & 2 deletions .github/workflows/_unittest.yml
@@ -69,7 +69,15 @@ jobs:
\$ErrorActionPreference = 'Stop'
\$PSNativeCommandUseErrorActionPreference = \$true

.ci/scripts/setup-windows.ps1
.ci/scripts/setup-windows.ps1 -editable "${{ inputs.editable }}"
if (\$LASTEXITCODE -ne 0) {
Write-Host "Setup failed. Exit code: \$LASTEXITCODE."
exit \$LASTEXITCODE
}

powershell .ci/scripts/unittest-windows.ps1 -editable "${{ inputs.editable }}"
.ci/scripts/unittest-windows.ps1 -buildMode "${{ inputs.build-mode }}"
if (\$LASTEXITCODE -ne 0) {
Write-Host "Unit tests failed. Exit code: \$LASTEXITCODE."
exit \$LASTEXITCODE
}
}"
1 change: 1 addition & 0 deletions .github/workflows/pull.yml
@@ -779,6 +779,7 @@ jobs:
contents: read
strategy:
fail-fast: false
if: false # TODO Re-enable after fixing timeouts (#14314)
with:
runner: linux.2xlarge
docker-image: ci-image:executorch-ubuntu-22.04-gcc9
2 changes: 1 addition & 1 deletion .github/workflows/trunk.yml
@@ -1032,5 +1032,5 @@ jobs:

.ci/scripts/setup-windows.ps1

powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}
.ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}
}"
10 changes: 7 additions & 3 deletions CMakeLists.txt
@@ -143,9 +143,13 @@ endif()

# -ffunction-sections -fdata-sections: breaks function and data into sections so
# they can be properly gc'd. -s: strip symbol.
set(CMAKE_CXX_FLAGS_RELEASE
"-ffunction-sections -fdata-sections ${CMAKE_CXX_FLAGS_RELEASE}"
)
if(WIN32)
set(CMAKE_CXX_FLAGS_RELEASE "/Gy /Gw ${CMAKE_CXX_FLAGS_RELEASE}")
else()
set(CMAKE_CXX_FLAGS_RELEASE
"-ffunction-sections -fdata-sections ${CMAKE_CXX_FLAGS_RELEASE}"
)
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
endif()
12 changes: 4 additions & 8 deletions backends/arm/TARGETS
@@ -22,8 +22,7 @@ runtime.python_library(
"common/debug.py",
],
deps = [
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer",
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer",
"fbsource//third-party/tosa_tools:serializer",
"//caffe2:torch",
"//executorch/exir:lib",
],
@@ -37,10 +36,8 @@
deps = [
"fbsource//third-party/pypi/flatbuffers:flatbuffers",
"fbsource//third-party/pypi/ml-dtypes:ml-dtypes",
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer",
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer",
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa",
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa",
"fbsource//third-party/tosa_tools:serializer",
"fbsource//third-party/tosa_tools:tosa",
":process_node",
"//executorch/exir/backend:compile_spec_schema",
"//executorch/backends/arm/operators:lib",
@@ -83,8 +80,7 @@ runtime.python_library(
name = "process_node",
srcs = ["process_node.py"],
deps = [
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa",
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa",
"fbsource//third-party/tosa_tools:tosa",
"//executorch/backends/arm/operators:node_visitor",
"//executorch/backends/arm/tosa:mapping",
"//executorch/backends/arm/tosa:quant_utils",
8 changes: 5 additions & 3 deletions backends/arm/arm_vela.py
@@ -25,17 +25,19 @@
# per-io structs to simplify runtime use.
def vela_bin_pack_io(prefix, data):
vela_input_shapes = data[prefix + "_shape"]
# Vela input/output shape is fixed to 6D
vela_io_shape_dims = 6

ios = struct.pack("<i", len(vela_input_shapes))
for i in range(len(vela_input_shapes)):
io_shape = vela_input_shapes[i]
io_elem_size = data[prefix + "_elem_size"][i]
io_offset = data[prefix + "_offset"][i]
io_region = data[prefix + "_region"][i]
assert len(io_shape) <= 4
inp_pad = io_shape.tolist() + [0] * (4 - len(io_shape))
assert len(io_shape) == vela_io_shape_dims
inp_pad = io_shape.tolist()
io_struct = struct.pack(
"<iiiiiii", *inp_pad, io_elem_size, io_offset, io_region
"<iiiiiiiii", *inp_pad, io_elem_size, io_offset, io_region
)
ios += io_struct
return ios
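Note on the hunk above: with the fixed 6-D layout each packed descriptor is nine little-endian int32 values (six shape dims plus elem_size, offset and region) instead of the previous seven. A minimal sketch, not the repository code, assuming hypothetical shape and metadata values:

```python
import struct

# Hypothetical 6-D shape and metadata, just to show the layout.
io_shape = [1, 1, 1, 16, 16, 8]
elem_size, offset, region = 1, 0, 1

assert len(io_shape) == 6  # Vela now reports fixed 6-D shapes
packed = struct.pack("<iiiiiiiii", *io_shape, elem_size, offset, region)
assert len(packed) == 9 * 4  # 36 bytes per descriptor
```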
2 changes: 1 addition & 1 deletion backends/arm/debug/TARGETS
@@ -8,7 +8,7 @@ runtime.python_library(
"schema.py",
],
deps = [
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer",
"fbsource//third-party/tosa_tools:serializer",
"//caffe2:torch",
],
)
13 changes: 6 additions & 7 deletions backends/arm/ethosu/backend.py
@@ -15,6 +15,7 @@
from typing import final, List

from executorch.backends.arm.arm_vela import vela_compile
from executorch.backends.arm.ethosu.compile_spec import EthosUCompileSpec

from executorch.backends.arm.tosa.backend import TOSABackend
from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult
@@ -35,16 +36,13 @@ class EthosUBackend(BackendDetails):

@staticmethod
def _compile_tosa_flatbuffer(
tosa_flatbuffer: bytes, compile_spec: List[CompileSpec]
tosa_flatbuffer: bytes, compile_spec: EthosUCompileSpec
) -> bytes:
"""
Static helper method to do the compilation of the TOSA flatbuffer
representation to a target specific binary stream.
"""
compile_flags = []
for spec in compile_spec:
if spec.key == "compile_flags":
compile_flags.append(spec.value.decode())
compile_flags = compile_spec.compiler_flags

if len(compile_flags) == 0:
# Not testing for compile_flags correctness here, just that they are
@@ -64,10 +62,11 @@ def _compile_tosa_flatbuffer(
@staticmethod
def preprocess(
edge_program: ExportedProgram,
compile_spec: List[CompileSpec],
compile_specs: List[CompileSpec],
) -> PreprocessResult:
logger.info(f"{EthosUBackend.__name__} preprocess")

compile_spec = EthosUCompileSpec.from_list(compile_specs)
# deduce TOSA compile_spec from Ethos-U compile spec. We get a new
# compile spec list, containing only elements relevant for the
# TOSABackend.
@@ -77,7 +76,7 @@ def preprocess(
# ('All backend implementation are final...'), so use composition instead.
# preprocess returns the serialized TOSA flatbuffer in .processed_bytes,
# which can be passed on to next compilation step.
tosa_preprocess = TOSABackend.preprocess(edge_program, tosa_compile_spec)
tosa_preprocess = TOSABackend._preprocess(edge_program, tosa_compile_spec)

binary = EthosUBackend._compile_tosa_flatbuffer(
tosa_preprocess.processed_bytes, compile_spec
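The hunks above replace the per-entry scan of `List[CompileSpec]` with a typed `EthosUCompileSpec` whose `compiler_flags` the backend reads directly. A hedged sketch of the idea follows (the real class lives in backends/arm/ethosu/compile_spec.py; the name below is hypothetical and only illustrates how a `from_list` constructor can fold the old key/value scan into one place):

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class HypotheticalEthosUCompileSpec:
    compiler_flags: List[str] = field(default_factory=list)

    @classmethod
    def from_list(cls, compile_specs):
        # compile_specs: iterable of CompileSpec-like objects with .key / .value (bytes)
        flags = [s.value.decode() for s in compile_specs if s.key == "compile_flags"]
        return cls(compiler_flags=flags)
```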
3 changes: 1 addition & 2 deletions backends/arm/operators/TARGETS
@@ -20,8 +20,7 @@ runtime.python_library(
name = "ops",
srcs = glob(["op_*.py", "ops_*.py"]),
deps = [
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa",
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa",
"fbsource//third-party/tosa_tools:tosa",
":node_visitor",
":operator_validation_utils",
"//executorch/backends/arm/tosa:mapping",
4 changes: 3 additions & 1 deletion backends/arm/operators/op_abs.py
@@ -73,7 +73,9 @@ def define_node(
abs_output = output

# Do the INT32 Abs
tosa_graph.addOperator(
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().ABS,
[
rescaled_inputs[0].name,
15 changes: 11 additions & 4 deletions backends/arm/operators/op_rescale.py
@@ -46,13 +46,20 @@ def define_node(
input_zp = cast(int, node.args[3])
output_zp = cast(int, node.args[4])

if input_dtype != map_dtype(torch.int8, self.tosa_spec) and input_zp != 0:
if (
input_dtype
not in [
map_dtype(torch.int8, self.tosa_spec),
map_dtype(torch.int16, self.tosa_spec),
]
and input_zp != 0
):
raise ValueError(
f"If input dtype is not int8, input_zp must be 0. Got input_dtype{input_dtype=}, {input_zp=}"
f"If input dtype is not int8 or int16, input_zp must be 0. Got input_dtype{input_dtype=}, {input_zp=}"
)
if output_dtype != torch.int8 and output_zp != 0:
if output_dtype not in [torch.int8, torch.int16] and output_zp != 0:
raise ValueError(
f"If output dtype is not int8, output_zp must be 0. Got {ts.DTypeNames[output_dtype]}, {output_zp=}"
f"If output dtype is not int8 or int16, output_zp must be 0. Got {ts.DTypeNames[output_dtype]}, {output_zp=}"
)

build_rescale(
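The widened precondition is easier to read in isolation; below is a simplified sketch (using torch dtypes directly rather than the TOSA dtype mapping used in the hunk):

```python
import torch


def check_zero_point(dtype: torch.dtype, zp: int, role: str) -> None:
    # int8 and int16 may carry a non-zero zero-point; all other dtypes must not.
    if dtype not in (torch.int8, torch.int16) and zp != 0:
        raise ValueError(
            f"If {role} dtype is not int8 or int16, {role}_zp must be 0. Got {dtype=}, {zp=}"
        )


check_zero_point(torch.int16, 128, "input")   # now accepted
check_zero_point(torch.int32, 0, "output")    # fine, zero-point is 0
# check_zero_point(torch.int32, 5, "output")  # raises ValueError
```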
8 changes: 6 additions & 2 deletions backends/arm/operators/op_sum.py
@@ -67,7 +67,9 @@ def define_node(
dtype=ts.DType.INT32,
)

tosa_graph.addOperator(
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().REDUCE_SUM,
[rescaled_inputs[0].name],
[intermediate.name],
@@ -111,7 +113,9 @@ def define_node(
attr = ts.TosaSerializerAttribute()
attr.ReduceSumAttribute(tensor.dim_order.index(dim))

tosa_graph.addOperator(
self._serialize_operator(
node,
tosa_graph,
ts.TosaOp.Op().REDUCE_SUM,
[tensor.name],
[output.name],
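Both the op_abs.py and op_sum.py hunks route serialization through `self._serialize_operator` instead of calling `tosa_graph.addOperator` directly. The wrapper's body is not part of this diff; the sketch below is only a guess at its intent (every name except `addOperator` is hypothetical): keep the call in one place so the originating `torch.fx` node can be recorded alongside the emitted TOSA operator, possibly for the debug/visualization tooling touched elsewhere in this PR.

```python
def _serialize_operator(self, node, tosa_graph, tosa_op, inputs, outputs, attr=None):
    # Assumed debug hook: remember which fx node produced this TOSA operator.
    if getattr(self, "debug_hook", None) is not None:
        self.debug_hook.record(node, tosa_op, outputs)
    # Forward to the serializer exactly as the old call sites did.
    tosa_graph.addOperator(tosa_op, inputs, outputs, attr)
```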
2 changes: 1 addition & 1 deletion backends/arm/requirements-arm-ethos-u.txt
@@ -3,4 +3,4 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

ethos-u-vela @ git+https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela@d37febc1715edf0d236c2ff555739a8a9aadcf9a
ethos-u-vela == 4.4.0
2 changes: 2 additions & 0 deletions backends/arm/requirements-arm-tosa.txt
@@ -5,5 +5,7 @@

ml_dtypes == 0.5.1
flatbuffers == 24.3.25
tosa-adapter-model-explorer == 0.0.1
ai-edge-model-explorer >= 0.1.16

tosa-tools @ git+https://git.gitlab.arm.com/tosa/[email protected]
4 changes: 2 additions & 2 deletions backends/arm/runtime/EthosUBackend.cpp
@@ -383,8 +383,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
*tensor_count = *tensor_count * tensor.size(i);
}

// The VelaIO type has a shape of fixed size 4
for (int i = 0; i < 4; i++) {
// The VelaIO type has a shape of fixed size 6
for (int i = 0; i < shapeDim; i++) {
*io_count = *io_count * io->shape[i];
}
}
4 changes: 3 additions & 1 deletion backends/arm/runtime/VelaBinStream.h
@@ -34,9 +34,11 @@ typedef struct {
char data[]; // block.name specific format data
} VelaBinBlock;

constexpr int shapeDim = 6; // Number of dimensions in VelaIO

// A Vela input or output descriptor in the binary stream
typedef struct {
int shape[4]; // Up to 4D shape of input or output
int shape[shapeDim]; // Shape of input or output
int elem_size; // Element sizeof in bytes
int offset; // Offset in bytes within SRAM working data
int region; // Scratch region this belongs to
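For reference, a Python-side mirror of this layout (a sketch for illustration, not code in the tree) reads the stream written by `vela_bin_pack_io` back as a count followed by nine int32 fields per descriptor, matching `shape[shapeDim]` plus `elem_size`, `offset` and `region`:

```python
import struct


def unpack_vela_ios(blob: bytes):
    # Sketch only: parse <count><9 x int32 per descriptor>, little-endian.
    (count,) = struct.unpack_from("<i", blob, 0)
    ios, pos = [], 4
    for _ in range(count):
        fields = struct.unpack_from("<9i", blob, pos)
        ios.append(
            {
                "shape": list(fields[:6]),
                "elem_size": fields[6],
                "offset": fields[7],
                "region": fields[8],
            }
        )
        pos += 9 * 4
    return ios
```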
4 changes: 2 additions & 2 deletions backends/arm/scripts/build_executor_runner.sh
@@ -44,7 +44,7 @@ help() {
echo " --memory_mode=<CONFIG> Vela memory mode, used for setting the Timing Adapter parameters of the Corstone platforms."
echo " Valid values are Shared_Sram(for Ethos-U55, Ethos-U65, Ethos-85), Sram_Only(for Ethos-U55, Ethos-U65, Ethos-U85) or Dedicated_Sram(for Ethos-U65, Ethos-U85)."
echo " Default: Shared_Sram for the Ethos-U55 and Sram_Only for the Ethos-U85"
echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
echo " --etdump Adds Devtools etdump support to track timing and output, etdump area will be base64 encoded in the log"
echo " --extra_build_flags=<FLAGS> Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none "
echo " --output=<FOLDER> Output folder Default: <MODEL>/<MODEL>_<TARGET INFO>.pte"
echo " --et_build_root=<FOLDER> Build output root folder to use, defaults to ${et_build_root}"
@@ -161,7 +161,7 @@ if [ "$bundleio" = true ] ; then
fi

if [ "$build_with_etdump" = true ] ; then
build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON "
build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON -DET_DUMP_INTERMEDIATE_OUTPUTS=ON "
fi

echo "Building with BundleIO/etdump/extra flags: ${build_bundleio_flags} ${build_with_etdump_flags} ${extra_build_flags}"
2 changes: 1 addition & 1 deletion backends/arm/scripts/mlsdk_utils.sh
@@ -7,7 +7,7 @@
set -euo pipefail

mlsdk_manifest_url="https://github.com/arm/ai-ml-sdk-manifest.git"
mlsdk_manifest_tag="dev-snapshot-2025-09-12"
mlsdk_manifest_tag="refs/tags/dev-snapshot-2025-09-12"

script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
