Skip to content

Commit ce8c770

Browse files
committed
Update on "Reuse GELU implementation from PyTorch core"
kernels/optimized doesn't need to support embedded systems, so it can just take a header-only dep on PyTorch. Note that, because we will pick up Sleef internally and ignore it externally thanks to ATen vec, this PR gets to enable optimized GELU in OSS. Testing: CI to make sure this doesn't break mobile build modes; happy to take advice on anything not currently covered that might break. Differential Revision: [D66335522](https://our.internmc.facebook.com/intern/diff/D66335522/) [ghstack-poisoned]
2 parents 732b825 + 283798c commit ce8c770

File tree

137 files changed

+2511
-1177
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

137 files changed

+2511
-1177
lines changed

.ci/docker/requirements-ci.txt

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
11
mpmath==1.3.0
2-
numpy==1.21.3; python_version == '3.10'
3-
numpy==1.23.2; python_version == '3.11'
4-
numpy; python_version >= '3.12'
2+
numpy==2.0.0; python_version >= '3.10'
53
PyYAML==6.0.1
64
ruamel.yaml==0.17.32
75
sympy==1.12
86
timm==0.6.13
97
tomli==2.0.1
108
torchsr==1.0.4
11-
transformers==4.38.0
9+
transformers==4.47.1
1210
zstd==1.5.5.1
13-
pandas==2.0.3; python_version == '3.10'
14-
pandas; python_version >= '3.11'
11+
pandas==2.2.2; python_version >= '3.10'
1512
pytest==7.2.0
1613
pytest-cov==4.1.0
1714
expecttest==0.1.6
@@ -24,7 +21,7 @@ sphinx-gallery==0.14.0
2421
breathe==4.34.0
2522
exhale==0.2.3
2623
docutils==0.16
27-
matplotlib==3.7.2
24+
matplotlib==3.9.4
2825
# PyTorch Theme
2926
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
3027
myst-parser==0.18.1

.ci/scripts/build-qnn-sdk.sh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/bin/bash
22
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# Copyright 2025 Arm Limited and/or its affiliates.
34
# All rights reserved.
45
#
56
# This source code is licensed under the BSD-style license found in the
@@ -11,10 +12,16 @@ set -o xtrace
1112
build_qnn_backend() {
1213
echo "Start building qnn backend."
1314
export ANDROID_NDK_ROOT=/opt/ndk
14-
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
15+
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
1516
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
1617

17-
bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release
18+
# Workaround to avoid issues around missing flatccrt library (depending on the
19+
# number of jobs used), see issue #7300:
20+
# Build twice (second time with `--no_clean`) to make sure libflatccrt.a is
21+
# available.
22+
# TODO: Remove this workaround once the underlying issue is fixed.
23+
bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release || \
24+
bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release --no_clean
1825
}
1926

2027
set_up_aot() {

.ci/scripts/setup-qnn-deps.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ install_qnn() {
1616
QNN_INSTALLATION_DIR=/tmp/qnn
1717
mkdir -p "${QNN_INSTALLATION_DIR}"
1818

19-
curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip"
19+
curl -Lo /tmp/v2.28.0.24.10.29.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.28.0.241029.zip"
2020
echo "Finishing downloading qnn sdk."
21-
unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp
21+
unzip -qo /tmp/v2.28.0.24.10.29.zip -d /tmp
2222
echo "Finishing unzip qnn sdk."
2323

2424

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ echo "COREML option ${COREML}"
121121
if [[ "${MODE}" =~ .*qnn.* ]]; then
122122
QNN=ON
123123
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
124-
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
124+
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
125125
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
126126
export PYTHONPATH=".."
127127
cp schema/program.fbs exir/_serialize/program.fbs

backends/apple/coreml/scripts/install_requirements.sh

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,7 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel
4747

4848
echo "${green}ExecuTorch: Installing coremltools."
4949
pip install "$COREMLTOOLS_DIR_PATH"
50-
# CoreMLTools have started supporting numpy 2.0,
51-
# but ExecuTorch example model test env is still using older transformers,
52-
# so for now we will need to downgrade numpy to 1.x
53-
# TODO: Remove this numpy downgrade once later transformers starts to be used
54-
pip install numpy==1.26.4
50+
5551
STATUS=$?
5652
if [ $STATUS -ne 0 ]; then
5753
echo "${red}ExecuTorch: Failed to install coremltools."

backends/arm/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ backends/arm/test/setup_testing.sh
119119
Then you can run the tests with
120120

121121
```
122-
pytest -c /dev/null -v -n auto backends/arm/test --arm_quantize_io --arm_run_corstoneFVP
122+
pytest -c /dev/null -v -n auto backends/arm/test --arm_run_corstoneFVP
123123
```
124124

125125
### Code coverage

backends/arm/_passes/arm_pass_manager.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
)
2929
from executorch.backends.arm._passes.decompose_linear_pass import DecomposeLinearPass
3030
from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass
31+
from executorch.backends.arm._passes.decompose_select import DecomposeSelectPass
3132
from executorch.backends.arm._passes.decompose_softmaxes_pass import (
3233
DecomposeSoftmaxesPass,
3334
)
@@ -62,7 +63,6 @@
6263
)
6364
from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
6465
from executorch.exir import ExportedProgram
65-
from executorch.exir.backend.compile_spec_schema import CompileSpec
6666
from executorch.exir.dialects._ops import ops as exir_ops
6767
from executorch.exir.pass_manager import PassManager
6868

@@ -72,9 +72,7 @@ class ArmPassManager(PassManager):
7272
def _transform(self, graph_module: torch.fx.GraphModule):
7373
return self(graph_module).graph_module
7474

75-
def transform_to_backend_pipeline(
76-
self, exported_program: ExportedProgram, compile_spec: list[CompileSpec]
77-
):
75+
def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
7876
"""Apply passes before transforming program to backend"""
7977
self.add_pass(FuseQuantizedActivationPass())
8078
self.add_pass(DecomposeLinearPass())
@@ -137,11 +135,8 @@ def transform_to_backend_pipeline(
137135
self.add_pass(KeepDimsFalseToSqueezePass())
138136
self.add_pass(Conv1dUnsqueezePass(exported_program))
139137
self.add_pass(DecomposeSoftmaxesPass())
140-
for spec in compile_spec:
141-
if spec.key == "permute_memory_format":
142-
memory_format = spec.value.decode()
143-
if memory_format == "nhwc":
144-
self.add_pass(AnnotateChannelsLastDimOrder())
138+
self.add_pass(DecomposeSelectPass())
139+
self.add_pass(AnnotateChannelsLastDimOrder())
145140

146141
return self._transform(exported_program.graph_module)
147142

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# pyre-unsafe
8+
9+
import torch
10+
from executorch.backends.arm._passes.arm_pass_utils import create_node
11+
from executorch.exir.dialects._ops import ops as exir_ops
12+
from executorch.exir.pass_base import ExportPass, PassResult
13+
14+
15+
class DecomposeSelectPass(ExportPass):
16+
"""
17+
This pass decomposes select into slice + squeeze to ensure that Aten and TOSA outputs have the same rank (input rank -1)
18+
"""
19+
20+
def call(self, graph_module: torch.fx.GraphModule):
21+
for node in graph_module.graph.nodes:
22+
23+
if node.op != "call_function":
24+
continue
25+
26+
if node.target in (
27+
exir_ops.edge.aten.select.int,
28+
exir_ops.edge.aten.select_copy.int,
29+
):
30+
slice_op = exir_ops.edge.aten.slice_copy.Tensor
31+
squeeze_op = exir_ops.edge.aten.squeeze_copy.dims
32+
else:
33+
continue
34+
35+
input_node, dim, index = node.args
36+
37+
rank = len(input_node.meta["val"].size())
38+
dim = dim % rank if dim < 0 else dim
39+
index = index % rank if index < 0 else index
40+
dim_list = list(range(rank))
41+
42+
with graph_module.graph.inserting_before(node):
43+
slice_node = create_node(
44+
graph_module.graph, slice_op, (input_node, dim, index, index + 1)
45+
)
46+
squeeze_node = create_node(
47+
graph_module.graph, squeeze_op, (slice_node, dim_list)
48+
)
49+
50+
node.replace_all_uses_with(squeeze_node)
51+
graph_module.graph.erase_node(node)
52+
53+
graph_module.graph.eliminate_dead_code()
54+
graph_module.recompile()
55+
graph_module = super().call(graph_module).graph_module
56+
return PassResult(graph_module, True)

backends/arm/_passes/tag_io_quant_pass.py

Lines changed: 0 additions & 51 deletions
This file was deleted.

backends/arm/arm_backend.py

Lines changed: 16 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023-2024 Arm Limited and/or its affiliates.
1+
# Copyright 2023-2025 Arm Limited and/or its affiliates.
22
#
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
@@ -49,8 +49,6 @@ def __init__(self):
4949
self.compiler_flags = []
5050
self.output_format = None
5151
self.path_for_intermediates = None
52-
# TODO MLETORCH-265 Remove permute_nhwc flag
53-
self.permute_nhwc = False
5452
self.quantize_io = False
5553
self.tosa_version = None
5654
self.input_order = None
@@ -118,16 +116,6 @@ def dump_intermediate_artifacts_to(
118116
self.path_for_intermediates = output_path
119117
return self
120118

121-
def set_permute_memory_format(
122-
self, set_nhwc_permutation: bool = True
123-
) -> "ArmCompileSpecBuilder":
124-
"""
125-
Permute to channel last in compiler and runtime. Compilation and
126-
runtime will convert rank 4 inputs to channel last for each sub-graph.
127-
"""
128-
self.permute_nhwc = set_nhwc_permutation
129-
return self
130-
131119
def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder":
132120
"""
133121
Quantization of inputs and dequantization of outputs for cases where
@@ -170,11 +158,6 @@ def build(self) -> List[CompileSpec]:
170158
CompileSpec("debug_artifact_path", self.path_for_intermediates.encode())
171159
)
172160

173-
if self.permute_nhwc:
174-
self.compile_spec.append(
175-
CompileSpec("permute_memory_format", "nhwc".encode())
176-
)
177-
178161
if self.input_order:
179162
self.compile_spec.append(
180163
CompileSpec(
@@ -188,20 +171,27 @@ def build(self) -> List[CompileSpec]:
188171
return self.compile_spec
189172

190173

191-
def is_permute_memory(compile_spec: List[CompileSpec]) -> bool:
192-
for spec in compile_spec:
193-
if spec.key == "permute_memory_format":
194-
return spec.value.decode() == "nhwc"
195-
return False
196-
197-
198174
def is_tosa(compile_spec: List[CompileSpec]) -> bool:
199175
for spec in compile_spec:
200176
if spec.key == "output_format":
201177
return spec.value.decode() == "tosa"
202178
return False
203179

204180

181+
def is_quantize_io(compile_specs: List[CompileSpec]) -> bool:
182+
for spec in compile_specs:
183+
if spec.key == "quantize_io" and spec.value.decode() == "True":
184+
return True
185+
return False
186+
187+
188+
def get_tosa_version(compile_spec: List[CompileSpec]) -> TosaSpecification:
189+
for spec in compile_spec:
190+
if spec.key == "tosa_version":
191+
return TosaSpecification.create_from_string(spec.value.decode())
192+
raise RuntimeError("Could not find TOSA version in CompileSpec")
193+
194+
205195
def get_intermediate_path(compile_spec: List[CompileSpec]) -> Optional[str]:
206196
for spec in compile_spec:
207197
if spec.key == "debug_artifact_path":
@@ -264,7 +254,7 @@ def preprocess( # noqa: C901
264254
# const data directly. Path created and data written only in debug builds.
265255
tosa_graph = ts.TosaSerializer(artifact_path)
266256
graph_module = ArmPassManager().transform_to_backend_pipeline(
267-
exported_program=edge_program, compile_spec=compile_spec
257+
exported_program=edge_program
268258
)
269259

270260
node_visitors = get_node_visitors(edge_program, tosa_spec)

0 commit comments

Comments
 (0)