pytorch
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
Lines changed: 8 additions & 5 deletions
diff --git a/.ci/scripts/wheel/test_macos.py b/.ci/scripts/wheel/test_macos.py
Lines changed: 4 additions & 6 deletions
diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml
Lines changed: 3 additions & 0 deletions
diff --git a/.github/workflows/build-wheels-macos.yml b/.github/workflows/build-wheels-macos.yml
Lines changed: 6 additions & 1 deletion
diff --git a/.gitmodules b/.gitmodules
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/README.md b/backends/arm/README.md
Lines changed: 0 additions & 18 deletions
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
Lines changed: 48 additions & 0 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 26 additions & 61 deletions
diff --git a/backends/arm/_passes/cast_int64_pass.py b/backends/arm/_passes/cast_int64_pass.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/_passes/cast_to_int32_pass.py b/backends/arm/_passes/cast_to_int32_pass.py
Lines changed: 54 additions & 0 deletions
@@ -96,15 +96,15 @@ test_model() {
       bash examples/models/llama/install_requirements.sh
       # Test export_llama script: python3 -m examples.models.llama.export_llama.
       # Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
-      "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/qwen2_5/1_5b_config.json
+      "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/qwen2_5/1_5b_config.json
       rm "./${MODEL_NAME}.pte"
       return  # Skip running with portable executor runner since portable doesn't support Qwen's biased linears.
   fi
   if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
       # Install requirements for export_llama
       bash examples/models/llama/install_requirements.sh
       # Test export_llama script: python3 -m examples.models.llama.export_llama.
-      "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi_4_mini/config.json
+      "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -p examples/models/phi_4_mini/config.json
       run_portable_executor_runner
       rm "./${MODEL_NAME}.pte"
       return
@@ -224,19 +224,22 @@ test_model_with_coreml() {
 
   "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" --compute_precision "${DTYPE}"
   EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit)
-  # TODO:
+
   if [ -n "$EXPORTED_MODEL" ]; then
     EXPORTED_MODEL_WITH_DTYPE="${EXPORTED_MODEL%.pte}_${DTYPE}.pte"
     mv "$EXPORTED_MODEL" "$EXPORTED_MODEL_WITH_DTYPE"
     EXPORTED_MODEL="$EXPORTED_MODEL_WITH_DTYPE"
-    echo "Renamed file path: $EXPORTED_MODEL"
+    echo "OK exported model: $EXPORTED_MODEL"
   else
-    echo "No .pte file found"
+    echo "[error] failed to export model: no .pte file found"
     exit 1
   fi
 
   # Run the model
   if [ "${should_test}" = true ]; then
+    echo "Installing requirements needed to build coreml_executor_runner..."
+    backends/apple/coreml/scripts/install_requirements.sh
+
     echo "Testing exported model with coreml_executor_runner..."
     local out_dir=$(mktemp -d)
     COREML_EXECUTOR_RUNNER_OUT_DIR="${out_dir}" examples/apple/coreml/scripts/build_executor_runner.sh
 
@@ -15,11 +15,9 @@
                 model=Model.Mv3,
                 backend=Backend.XnnpackQuantizationDelegation,
             ),
-            # Enable this once CoreML is supported out-of-the-box
-            # https://github.com/pytorch/executorch/issues/9019
-            # test_base.ModelTest(
-            #     model=Model.Mv3,
-            #     backend=Backend.CoreMlTest,
-            # )
+            test_base.ModelTest(
+                model=Model.Mv3,
+                backend=Backend.CoreMlTest,
+            ),
         ]
     )
@@ -6,6 +6,9 @@ on:
     paths:
       - .ci/**/*
       - .github/workflows/build-wheels-linux.yml
+      - examples/**/*
+      - pyproject.toml
+      - setup.py
   push:
     branches:
       - nightly
 
@@ -6,6 +6,9 @@ on:
     paths:
       - .ci/**/*
       - .github/workflows/build-wheels-macos.yml
+      - examples/**/*
+      - pyproject.toml
+      - setup.py
   push:
     branches:
       - nightly
@@ -57,6 +60,8 @@ jobs:
       pre-script: ${{ matrix.pre-script }}
       post-script: ${{ matrix.post-script }}
       package-name: ${{ matrix.package-name }}
-      runner-type: macos-m1-stable
+      # Meta's macOS runners do not have Xcode, so use GitHub's runners.
+      runner-type: macos-latest-xlarge
+      setup-miniconda: true
       smoke-test-script: ${{ matrix.smoke-test-script }}
       trigger-event: ${{ github.event_name }}
@@ -1,9 +1,9 @@
 [submodule "backends/arm/third-party/ethos-u-core-driver"]
 	path = backends/arm/third-party/ethos-u-core-driver
-	url = https://github.com/pytorch-labs/ethos-u-core-driver-mirror
+	url = https://git.gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-core-driver.git
 [submodule "backends/arm/third-party/serialization_lib"]
 	path = backends/arm/third-party/serialization_lib
-	url = https://github.com/pytorch-labs/tosa_serialization_lib-mirror
+	url = https://git.gitlab.arm.com/tosa/tosa-serialization.git
 [submodule "backends/vulkan/third-party/Vulkan-Headers"]
 	path = backends/vulkan/third-party/Vulkan-Headers
 	url = https://github.com/KhronosGroup/Vulkan-Headers
 
@@ -79,30 +79,12 @@ test                            #  Root test folder
 
 Some example commands to run these tests follow. Run a single test:
 
-```
-python -m unittest backends.arm.test.ops.test_add.TestSimpleAdd -k test_add2_tosa_BI
-```
-
-or with pytest
-
 ```
 pytest -c /dev/null -v -n auto backends/arm/test/ops/test_add.py -k test_add2_tosa_BI
 ```
 
-Or all tests in "TestSimpleAdd":
-
-```
-python -m unittest backends.arm.test.ops.test_add.TestSimpleAdd
-```
-
 Or discover and run many tests:
 
-```
-python -m unittest discover -s backends/arm/test/ops/
-```
-
-or with pytest
-
 ```
 pytest -c /dev/null -v -n auto backends/arm/test/ops/
 ```
 
@@ -0,0 +1,48 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from . import arm_pass_utils  # noqa
+from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
+from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
+from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
+from .cast_to_int32_pass import CastToInt32Pass  # noqa
+from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa
+from .convert_any_default_dim_dims_pass import ConvertAnyDefaultDimDimsPass  # noqa
+from .convert_expand_copy_to_repeat import ConvertExpandCopyToRepeatPass  # noqa
+from .convert_full_like_to_full_pass import ConvertFullLikeToFullPass  # noqa
+from .convert_minmax_pass import ConvertMinMaxPass  # noqa
+from .convert_split_to_slice import ConvertSplitToSlicePass  # noqa
+from .convert_squeezes_to_view import ConvertSqueezesToViewPass  # noqa
+from .convert_to_clamp import ConvertToClampPass  # noqa
+from .decompose_batchnorm_pass import DecomposeBatchNormPass  # noqa
+from .decompose_div_pass import DecomposeDivPass  # noqa
+from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
+from .decompose_linear_pass import DecomposeLinearPass  # noqa
+from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
+from .decompose_select import DecomposeSelectPass  # noqa
+from .decompose_softmax_pass import DecomposeSoftmaxPass  # noqa
+from .decompose_softmax_unstable_pass import DecomposeSoftmaxUnstablePass  # noqa
+from .decompose_var_pass import DecomposeVarPass  # noqa
+from .fold_qdq_with_annotated_qparams_pass import (  # noqa
+    FoldAndAnnotateQParamsPass,
+    QuantizeOperatorArguments,
+    RetraceFoldedDtypesPass,
+)
+from .fuse_batchnorm2d_pass import FuseBatchnorm2DPass  # noqa
+from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass  # noqa
+from .fuse_quantized_activation_pass import FuseQuantizedActivationPass  # noqa
+from .insert_rescales_pass import InsertRescalePass  # noqa
+from .insert_table_ops import InsertTableOpsPass  # noqa
+from .keep_dims_false_to_squeeze_pass import KeepDimsFalseToSqueezePass  # noqa
+from .match_arg_ranks_pass import MatchArgRanksPass  # noqa
+from .meandim_to_averagepool_pass import ConvertMeanDimToAveragePoolPass  # noqa
+from .mm_to_bmm_pass import ConvertMmToBmmPass  # noqa
+from .remove_clone_pass import RemoveClonePass  # noqa
+from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
+from .size_adjust_conv2d_pass import SizeAdjustConv2DPass  # noqa
+from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass  # noqa
+from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass  # noqa
+from .arm_pass_manager import ArmPassManager  # noqa  # usort: skip
@@ -7,82 +7,45 @@
 
 # pyre-unsafe
 
-from executorch.backends.arm._passes.annotate_channels_last_dim_order_pass import (
+from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
-)
-from executorch.backends.arm._passes.annotate_decomposed_matmul import (
     AnnotateDecomposedMatmulPass,
-)
-from executorch.backends.arm._passes.cast_int64_pass import CastInt64ToInt32Pass
-from executorch.backends.arm._passes.conv1d_unsqueeze_pass import Conv1dUnsqueezePass
-from executorch.backends.arm._passes.convert_any_default_dim_dims_pass import (
+    CastInt64BuffersToInt32Pass,
+    CastToInt32Pass,
+    ComputeConstantOpsAOT,
+    Conv1dUnsqueezePass,
     ConvertAnyDefaultDimDimsPass,
-)
-from executorch.backends.arm._passes.convert_expand_copy_to_repeat import (
     ConvertExpandCopyToRepeatPass,
-)
-from executorch.backends.arm._passes.convert_full_like_to_full_pass import (
     ConvertFullLikeToFullPass,
-)
-from executorch.backends.arm._passes.convert_minmax_pass import ConvertMinMaxPass
-from executorch.backends.arm._passes.convert_split_to_slice import (
+    ConvertMeanDimToAveragePoolPass,
+    ConvertMinMaxPass,
+    ConvertMmToBmmPass,
     ConvertSplitToSlicePass,
-)
-from executorch.backends.arm._passes.convert_squeezes_to_view import (  # type: ignore[import-not-found]
     ConvertSqueezesToViewPass,
-)
-from executorch.backends.arm._passes.convert_to_clamp import ConvertToClampPass
-from executorch.backends.arm._passes.decompose_batchnorm_pass import (
+    ConvertToClampPass,
     DecomposeBatchNormPass,
-)
-from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass
-from executorch.backends.arm._passes.decompose_layernorm_pass import (
+    DecomposeDivPass,
     DecomposeLayerNormPass,
-)
-from executorch.backends.arm._passes.decompose_linear_pass import DecomposeLinearPass
-from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass
-from executorch.backends.arm._passes.decompose_select import (  # type: ignore[import-not-found]
+    DecomposeLinearPass,
+    DecomposeMeanDimPass,
     DecomposeSelectPass,
-)
-from executorch.backends.arm._passes.decompose_softmax_pass import DecomposeSoftmaxPass
-from executorch.backends.arm._passes.decompose_softmax_unstable_pass import (
+    DecomposeSoftmaxPass,
     DecomposeSoftmaxUnstablePass,
-)
-from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
+    DecomposeVarPass,
     FoldAndAnnotateQParamsPass,
-    QuantizeOperatorArguments,
-    RetraceFoldedDtypesPass,
-)
-from executorch.backends.arm._passes.fuse_batchnorm2d_pass import FuseBatchnorm2DPass
-from executorch.backends.arm._passes.fuse_constant_ops_pass import (
-    ComputeConstantOpsAOT,
+    FuseBatchnorm2DPass,
     FuseConstantArgsPass,
-)
-from executorch.backends.arm._passes.fuse_quantized_activation_pass import (  # type: ignore[import-not-found]
     FuseQuantizedActivationPass,
-)
-from executorch.backends.arm._passes.insert_rescales_pass import InsertRescalePass
-from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
-from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
+    InsertRescalePass,
+    InsertTableOpsPass,
     KeepDimsFalseToSqueezePass,
-)
-from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
-from executorch.backends.arm._passes.meandim_to_averagepool_pass import (  # type: ignore[attr-defined]
-    ConvertMeanDimToAveragePoolPass,
-)
-from executorch.backends.arm._passes.mm_to_bmm_pass import (  # type: ignore[import-not-found]
-    ConvertMmToBmmPass,
-)
-from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass
-from executorch.backends.arm._passes.scalars_to_attribute_pass import (
+    MatchArgRanksPass,
+    QuantizeOperatorArguments,
+    RemoveClonePass,
+    RetraceFoldedDtypesPass,
     ScalarsToAttributePass,
-)
-from executorch.backends.arm._passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass
-from executorch.backends.arm._passes.unsqueeze_before_repeat_pass import (
+    SizeAdjustConv2DPass,
     UnsqueezeBeforeRepeatPass,
-)
-from executorch.backends.arm._passes.unsqueeze_scalar_placeholders_pass import (
     UnsqueezeScalarPlaceholdersPass,
 )
 from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
@@ -118,6 +81,8 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
         self.add_pass(ConvertAnyDefaultDimDimsPass())
+        if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
+            self.add_pass(CastToInt32Pass())
 
         self.add_pass(ReplaceScalarWithTensorArgPass())
         self.add_pass(AnnotateDecomposedMatmulPass())
@@ -132,7 +97,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
-        self.add_pass(CastInt64ToInt32Pass(exported_program))
+        self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(KeepDimsFalseToSqueezePass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
         self.add_pass(DecomposeSelectPass())
@@ -179,7 +144,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
-        self.add_pass(CastInt64ToInt32Pass(exported_program))
+        self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(KeepDimsFalseToSqueezePass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
         self.add_pass(DecomposeSelectPass())
 
@@ -15,13 +15,13 @@
 logger.setLevel(logging.WARNING)
 
 
-class CastInt64ToInt32Pass(ExportPass):
+class CastInt64BuffersToInt32Pass(ExportPass):
     """
     Cast int64 buffers to int32 if the int64 data is in int32 range.
     """
 
     def __init__(self, exported_program: torch.export.ExportedProgram):
-        super(CastInt64ToInt32Pass, self).__init__()
+        super(CastInt64BuffersToInt32Pass, self).__init__()
         self.exported_program = exported_program
 
     def _assert_within_int32(self, tensor: torch.Tensor, node: torch.fx.Node):
 
@@ -0,0 +1,54 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+class CastToInt32Pass(ExportPass):  # Export pass that wraps the targeted shift ops in int32 casts.
+    """Casts the input to int32 if it is not already and casts back the output to the original input dtype."""
+
+    targeted_ops = {  # Only these edge ops are rewritten; every other op is forwarded unchanged.
+        exir_ops.edge.aten.bitwise_left_shift.Tensor,
+        exir_ops.edge.aten.bitwise_right_shift.Tensor,
+    }
+
+    def call_operator(self, op, args, kwargs, meta):  # Returns the (possibly re-wrapped) result of emitting `op`.
+        if op not in self.targeted_ops:  # Guard clause: non-targeted ops pass straight through.
+            return super().call_operator(op, args, kwargs, meta)
+
+        new_args: list = []  # Arguments for the re-emitted op, in the original positional order.
+        did_cast = False  # Becomes True as soon as any argument needed an int32 cast.
+        for arg in args:  # Assumes every positional arg exposes `.data.dtype` - TODO confirm for non-tensor operands.
+            if arg.data.dtype != torch.int32:
+                new_args.append(  # Insert a dtype-converting copy in front of the op.
+                    super().call_operator(
+                        exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
+                        (arg,),
+                        {"dtype": torch.int32},
+                        meta,  # The cast reuses the meta of the op being rewritten.
+                    )
+                )
+                did_cast = True
+            else:
+                new_args.append(arg)  # Already int32: forwarded as-is.
+
+        output = super().call_operator(  # Re-emit the shift op on the int32 arguments.
+            op,
+            tuple(new_args),
+            {},  # NOTE(review): the incoming `kwargs` are not forwarded here - confirm this is intended.
+            meta,
+        )
+
+        if did_cast:  # Cast back whenever any argument was cast, not only when the first one was.
+            output = super().call_operator(
+                exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
+                (output,),
+                {"dtype": args[0].data.dtype},  # Target dtype is taken from the original first argument only.
+                meta,
+            )
+        return output