pytorch
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 53 additions & 7 deletions b/‎.github/workflows/trunk.yml‎
Lines changed: 53 additions & 7 deletions
diff --git a/‎backends/arm/_passes/decompose_avg_pool2d.py‎
Lines changed: 12 additions & 2 deletions b/‎backends/arm/_passes/decompose_avg_pool2d.py‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎backends/arm/_passes/decompose_grouped_conv.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/decompose_grouped_conv.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py‎
Lines changed: 2 additions & 2 deletions b/‎backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/arm/_passes/fuse_quantized_activation_pass.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/fuse_quantized_activation_pass.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/insert_rescales_pass.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/insert_rescales_pass.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/insert_table_ops.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/insert_table_ops.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/quant_args.py‎
Lines changed: 125 additions & 0 deletions b/‎backends/arm/_passes/quant_args.py‎
Lines changed: 125 additions & 0 deletions
diff --git a/‎backends/arm/operator_support/ethos_u55_support.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/arm/operator_support/ethos_u55_support.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/runtime/VGFBackend.cpp‎
Lines changed: 50 additions & 5 deletions b/‎backends/arm/runtime/VGFBackend.cpp‎
Lines changed: 50 additions & 5 deletions
@@ -78,25 +78,71 @@ jobs:
         mkdir -p zephyr_scratch/
         cd zephyr_scratch
         export ZEPHYR_PROJ_ROOT=$(realpath $(pwd))
+        export ARM_FVP_TUTORIALS_ROOT=$ZEPHYR_PROJ_ROOT/zephyr/samples/modules/executorch/arm-fvp-tutorials
 
+        # TODO @Bujji: Should see if this can be moved into the docker image itself
         download_arm_zephyr_sdk
         ./zephyr-sdk-0.16.0/setup.sh -c -t arm-zephyr-eabi
-
         cd $ZEPHYR_PROJ_ROOT
         setup_zephyr_et_module
 
+        # Run setup scripts for Arm FVP and Arm AOT Compilation
         cd $ZEPHYR_PROJ_ROOT/modules/lib/executorch
         install_executorch "--use-pt-pinned-commit"
         .ci/scripts/setup-arm-baremetal-tools.sh --target-toolchain zephyr
         source examples/arm/ethos-u-scratch/setup_path.sh
         source $ZEPHYR_PROJ_ROOT/zephyr/zephyr-env.sh
-        cd $ZEPHYR_PROJ_ROOT/zephyr/samples/modules/executorch/arm/hello_world
-        west build -p always -b mps3/corstone300/fvp
-        FVP_Corstone_SSE-300_Ethos-U55 -a build/zephyr/zephyr.elf -C mps3_board.visualisation.disable-visualisation=1 -C mps3_board.telnetterminal0.start_telnet=0 -C mps3_board.uart0.out_file='sim.out'  -C cpu0.CFGITCMSZ=15 -C cpu0.CFGDTCMSZ=15 --simlimit 120
 
-        grep -qF "Output[0][0]: (float) 2.000000" sim.out
-        exit_status=$? #store 0 if found (success), 1 if not (failure)
-        exit $exit_status
+        # Get the model as PTE
+        python -m examples.arm.aot_arm_compiler \
+            --model_name="${MODEL_NAME}" \
+            --output="${MODEL_NAME}.pte"
+
+        # Generate the C-style header
+        cd $ARM_FVP_TUTORIALS_ROOT
+        python build_model.py \
+            --executorch-root $ZEPHYR_PROJ_ROOT/modules/lib/executorch \
+            --pte-file $ZEPHYR_PROJ_ROOT/modules/lib/executorch/${MODEL_NAME}.pte \
+            --output-path $ARM_FVP_TUTORIALS_ROOT/models/${MODEL_NAME}/src/
+
+        cd $ARM_FVP_TUTORIALS_ROOT/models/${MODEL_NAME}/
+
+        # Build the zephyr elf
+        west build -p always -b mps3/corstone300/fvp -- \
+            -DET_PTE_FILE_PATH_FOR_SELECTIVE_BUILD=$ZEPHYR_PROJ_ROOT/modules/lib/executorch/${MODEL_NAME}.pte
+
+        # Run the simulation
+        FVP_Corstone_SSE-300_Ethos-U55 -a build/zephyr/zephyr.elf \
+            -C mps3_board.visualisation.disable-visualisation=1 \
+            -C mps3_board.telnetterminal0.start_telnet=0 \
+            -C mps3_board.uart0.out_file='sim.out'  \
+            -C cpu0.CFGITCMSZ=15 \
+            -C cpu0.CFGDTCMSZ=15 \
+            --simlimit 120
+
+        # Disable exit on error
+        set +e
+        # Report failure if any of the output verification checks fail
+        # store 0 if found (failure), 1 if not (success)
+        grep -qF "ERROR" sim.out
+        exit_status=$?
+        if [[ "$exit_status" -eq "0" ]]; then
+            cat sim.out
+            set -e
+            exit 1
+        fi
+
+        # Report fail if simulation does not complete successfully
+        # store 0 if found (success), 1 if not (failure)
+        grep -qF "SUCCESS: Program complete, exiting." sim.out
+        exit_status=$?
+        if [[ "$exit_status" -eq "1" ]]; then
+            cat sim.out
+            set -e
+            exit 1
+        fi
+        # Re-enable exit on error
+        set -e
 
   test-models-linux-aarch64:
     name: test-models-linux-aarch64
 
@@ -45,7 +45,10 @@ def call_operator(self, op, args, kwargs, meta):
         x = args[0]
         kernel_h, kernel_w = args[1]
         kernel_size = kernel_h * kernel_w
-        stride_h, stride_w = args[2]
+        if len(args) > 2 and args[2] is not None:
+            stride_h, stride_w = args[2]
+        else:
+            stride_h, stride_w = kernel_h, kernel_w
         pad_h, pad_w = new_pad_h, new_pad_w = args[3] if len(args) > 3 else (0, 0)
         ceil_mode = args[4] if len(args) > 4 else False
         count_include_pad = args[5] if len(args) > 5 else True
@@ -108,7 +111,14 @@ def call_operator(self, op, args, kwargs, meta):
             x = super().call_operator(cat_op, (cat_nodes, 2), kwargs, meta)
             new_pad_h = 0
 
-        avgpool_args = (x, args[1], args[2], [new_pad_h, new_pad_w], ceil_mode, False)
+        avgpool_args = (
+            x,
+            args[1],
+            [stride_h, stride_w],
+            [new_pad_h, new_pad_w],
+            ceil_mode,
+            False,
+        )
         x = super().call_operator(avgpool_op, avgpool_args, kwargs, meta)
 
         # Multiply by factor (kernel_size / divisor_override) if divisor_override
 
@@ -6,7 +6,7 @@
 from copy import copy
 
 import torch
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
+from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 
 
@@ -15,9 +15,9 @@
     get_param_tensor,
     is_param_node,
 )
-from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
+from executorch.backends.arm._passes.quant_args import QuantArgs
+from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 
@@ -6,8 +6,8 @@
 # pyre-unsafe
 
 import torch
+from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.backends.arm.constants import Q_OPS
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx import Node
 
@@ -9,8 +9,8 @@
 
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.backends.arm.constants import DQ_OPS, Q_OPS
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch import Tensor
 from torch.fx import GraphModule, Node
 
@@ -10,7 +10,7 @@
 
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import create_node
-from executorch.backends.arm.tosa_quant_utils import QuantArgs
+from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.exir import ExportedProgram
 
 from executorch.exir.dialects._ops import ops as exir_ops
 
@@ -0,0 +1,125 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Any, cast, NamedTuple
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+
+exir_ops = cast(Any, exir_ops)
+from executorch.backends.arm.constants import PER_CHANNEL_QDQ_OPS, PER_TENSOR_QDQ_OPS
+from torch import Tensor
+
+
+class QuantArgs(NamedTuple):
+    """Arguments of a per-tensor or per-channel quantize/dequantize operator.
+
+    Instances can be built from an operator's argument list with
+    ``from_operator`` and used to quantize or dequantize values with the
+    matching ``quantized_decomposed`` edge op.
+    """
+
+    # A single float for per-tensor use, a list of floats for per-channel use.
+    scale: list[float] | float
+    # A single int for per-tensor use, a list of ints for per-channel use.
+    zp: list[int] | int
+    # qmin/qmax/dtype are forwarded unchanged to the q/dq ops.
+    qmin: int
+    qmax: int
+    dtype: torch.dtype
+    # Axis argument of the per-channel ops; not passed to the per-tensor ops.
+    axis: int = 0
+    # Selects the per-channel ops (True) or the per-tensor ops (False).
+    per_channel: bool = False
+
+    def quantize_value(self, x: torch.Tensor | float) -> Tensor:
+        """Quantize ``x`` with the stored quantization arguments.
+
+        A non-tensor input is first wrapped in a one-element tensor, and the
+        input is always cast to float32 before the op is called. If
+        ``self.per_channel`` is True, ``quantize_per_channel`` is used with
+        ``scale`` and ``zp`` converted to tensors; otherwise
+        ``quantize_per_tensor`` is used with ``scale`` and ``zp`` passed as-is.
+
+        Returns:
+            The result of the selected quantize op.
+        """
+        if not isinstance(x, torch.Tensor):
+            x = torch.Tensor([x])
+        x = x.to(torch.float32)
+        if self.per_channel:
+            q_op = exir_ops.edge.quantized_decomposed.quantize_per_channel.default
+            args = (
+                x,
+                torch.tensor(self.scale),
+                torch.tensor(self.zp),
+                self.axis,
+                self.qmin,
+                self.qmax,
+                self.dtype,
+            )
+        else:
+            q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
+            args = (x, self.scale, self.zp, self.qmin, self.qmax, self.dtype)  # type: ignore[assignment]
+        return q_op(*args)
+
+    def dequantize_value(self, qx: torch.Tensor) -> torch.Tensor:
+        """Dequantize ``qx`` with the stored quantization arguments.
+
+        Unlike ``quantize_value``, the input is passed to the op as-is: it is
+        neither wrapped in a tensor nor cast. If ``self.per_channel`` is True,
+        ``dequantize_per_channel`` is used with ``scale`` and ``zp`` converted
+        to tensors; otherwise ``dequantize_per_tensor`` is used.
+
+        Returns:
+            The result of the selected dequantize op.
+        """
+        if self.per_channel:
+            dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
+            args = (
+                qx,
+                torch.tensor(self.scale),
+                torch.tensor(self.zp),
+                self.axis,
+                self.qmin,
+                self.qmax,
+                self.dtype,
+            )
+        else:
+            dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
+            args = (qx, self.scale, self.zp, self.qmin, self.qmax, self.dtype)  # type: ignore[assignment]
+        return dq_op(*args)
+
+    @classmethod
+    def from_operator(cls, op, args):
+        """Build a ``QuantArgs`` from a q/dq operator and its argument list.
+
+        ``args[0]`` is not read. For ops in ``PER_TENSOR_QDQ_OPS`` the layout
+        is ``(_, scale, zp, qmin, qmax, dtype)``. For ops in
+        ``PER_CHANNEL_QDQ_OPS`` it is ``(_, scale, zp, axis, qmin, qmax, dtype)``,
+        where ``scale`` and ``zp`` must provide ``tolist()``.
+
+        Raises:
+            NotImplementedError: If ``op`` is in neither op set.
+        """
+        if op in PER_TENSOR_QDQ_OPS:
+            return cls(
+                scale=cast(float, args[1]),
+                zp=cast(int, args[2]),
+                qmin=cast(int, args[3]),
+                qmax=cast(int, args[4]),
+                dtype=cast(torch.dtype, args[5]),
+                axis=0,
+                per_channel=False,
+            )
+        elif op in PER_CHANNEL_QDQ_OPS:
+            # scale and zp are converted to plain Python lists here.
+            return cls(
+                scale=cast(list[float], args[1].tolist()),
+                zp=cast(list[int], args[2].tolist()),
+                axis=cast(int, args[3]),
+                qmin=cast(int, args[4]),
+                qmax=cast(int, args[5]),
+                dtype=cast(torch.dtype, args[6]),
+                per_channel=True,
+            )
+        else:
+            # We're only handling per tensor and per channel quantization
+            raise NotImplementedError(f"Unsupported quantization operation: {op}")
+
+    def get_scale_per_tensor(self) -> float:
+        """Return ``scale`` as a float.
+
+        Raises:
+            TypeError: If ``scale`` is not a ``float`` instance (an ``int``
+                scale is rejected as well).
+        """
+        if not isinstance(self.scale, float):
+            raise TypeError(
+                f"Expected scale {self.scale} to be a float but found scale of "
+                f"type {type(self.scale)}"
+            )
+        return self.scale
+
+    def get_zp_per_tensor(self) -> int:
+        """Return ``zp`` as an int.
+
+        Raises:
+            TypeError: If ``zp`` is not an ``int`` instance.
+        """
+        if not isinstance(self.zp, int):
+            raise TypeError(
+                f"Expected zero point {self.zp} to be an int but found zp of "
+                f"type {type(self.zp)}"
+            )
+        return self.zp
+
+    def get_scale_per_channel(self) -> list[float]:
+        """Return ``scale`` as a list.
+
+        Raises:
+            TypeError: If ``scale`` is not a ``list`` instance.
+        """
+        if not isinstance(self.scale, list):
+            raise TypeError(
+                f"Expected scale {self.scale} to be a list but found scale of "
+                f"type {type(self.scale)}"
+            )
+        return self.scale
+
+    def get_zp_per_channel(self) -> list[int]:
+        """Return ``zp`` as a list.
+
+        Raises:
+            TypeError: If ``zp`` is not a ``list`` instance.
+        """
+        if not isinstance(self.zp, list):
+            raise TypeError(
+                f"Expected zero point {self.zp} to be a list but found zp of "
+                f"type {type(self.zp)}"
+            )
+        return self.zp
@@ -149,6 +149,8 @@ class EthosU55NotSupported(OperatorSupportBase):
         exir_ops.edge.aten.ne.Scalar,
         exir_ops.edge.aten.flip.default,  # REVERSE
         exir_ops.edge.aten.grid_sampler_2d,  # GATHER
+        exir_ops.edge.aten.index.Tensor,  # GATHER
+        exir_ops.edge.aten.index_select.default,  # GATHER
         exir_ops.edge.aten.scatter.src,
         exir_ops.edge.aten.scatter.value,
         exir_ops.edge.aten.select_scatter.default,
 
@@ -264,15 +264,60 @@ VkResult vkml_allocate_basics(
       .engineVersion = 0,
       .apiVersion = VK_API_VERSION_1_3,
   };
+
+  std::vector<const char*> requested_extensions;
+  VkInstanceCreateFlags instance_flags = 0;
+
+#ifdef __APPLE__
+  instance_flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR;
+
+  uint32_t extension_count = 0;
+  result = vkEnumerateInstanceExtensionProperties(
+      nullptr, &extension_count, nullptr);
+
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to enumerate instance extensions");
+    return result;
+  }
+
+  std::vector<VkExtensionProperties> extension_properties(extension_count);
+  result = vkEnumerateInstanceExtensionProperties(
+      nullptr, &extension_count, extension_properties.data());
+
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to enumerate instance extensions");
+    return result;
+  }
+
+  if (std::any_of(
+          extension_properties.begin(),
+          extension_properties.end(),
+          [](const auto& extension) {
+            return strcmp(
+                       VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME,
+                       extension.extensionName) == 0;
+          })) {
+    requested_extensions.push_back(
+        VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME);
+  }
+
+  if (requested_extensions.empty()) {
+    ET_LOG(Error, "VK_KHR_portability_enumeration not found");
+  }
+
+#endif
+
   VkInstanceCreateInfo instance_info{
       .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
       .pNext = nullptr,
-      .flags = 0,
+      .flags = instance_flags,
       .pApplicationInfo = &app_info,
-      0,
-      nullptr,
-      0,
-      nullptr};
+      .enabledLayerCount = 0,
+      .ppEnabledLayerNames = nullptr,
+      .enabledExtensionCount =
+          static_cast<uint32_t>(requested_extensions.size()),
+      .ppEnabledExtensionNames = requested_extensions.data(),
+  };
   result = vkCreateInstance(&instance_info, nullptr, instance);
   if (result != VK_SUCCESS) {
     ET_LOG(Error, "Failed to create VkInstance");