pytorch
diff --git a/‎.ci/docker/ci_commit_pins/pytorch.txt‎
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/ci_commit_pins/pytorch.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.ci/scripts/test_qnn_static_llm.sh‎
Lines changed: 1 addition & 1 deletion b/‎.ci/scripts/test_qnn_static_llm.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/trunk.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/_passes/convert_expand_copy_to_repeat.py‎
Lines changed: 4 additions & 3 deletions b/‎backends/arm/_passes/convert_expand_copy_to_repeat.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎backends/arm/_passes/convert_permute_singleton_to_view_pass.py‎
Lines changed: 62 additions & 0 deletions b/‎backends/arm/_passes/convert_permute_singleton_to_view_pass.py‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎backends/arm/_passes/decompose_embedding_pass.py‎
Lines changed: 0 additions & 1 deletion b/‎backends/arm/_passes/decompose_embedding_pass.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/ethosu/backend.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/ethosu/backend.py‎
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-e6f766c7d750d40603eee3f66c5915bac606b3ea
+556fc09a9f67f24ca5591ec049c5d0c347c5f62a
@@ -81,7 +81,7 @@ elif [[ "${TASK_NAME}" == "stories_260k_bc" ]]; then
     fi
 
 elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then
-    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_smollm2 --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64
+    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_llm_model --model_name smollm2_135m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64
     exit_code1=$?
     if [ $exit_code1 -ne 0 ]; then
         exit 1
 
@@ -347,7 +347,7 @@ jobs:
         elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
           setup_script_args="--target-toolchain zephyr"
           toolchain_prefix=arm-zephyr-eabi-
-          threshold="135240" # 132 KiB
+          threshold="135656" # 132 KiB
           toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
         else
           echo "Fail unsupport OS selection ${{ matrix.os }}"
 
@@ -21,6 +21,9 @@
 from .convert_int64_output_ops_to_int32 import ConvertInt64OutputOpsToInt32Pass  # noqa
 from .convert_int_pow_to_mul import ConvertIntPowToMuls  # noqa
 from .convert_minmax_pass import ConvertMinMaxPass  # noqa
+from .convert_permute_singleton_to_view_pass import (  # noqa
+    ConvertPermuteSingletonToViewPass,
+)
 from .convert_split_to_slice import ConvertSplitToSlicePass  # noqa
 from .convert_squeezes_to_view import ConvertSqueezesToViewPass  # noqa
 from .convert_to_clamp import ConvertToClampPass  # noqa
 
@@ -27,6 +27,7 @@
     ConvertIntPowToMuls,
     ConvertMinMaxPass,
     ConvertMmToBmmPass,
+    ConvertPermuteSingletonToViewPass,
     ConvertSplitToSlicePass,
     ConvertSqueezesToViewPass,
     ConvertToClampPass,
@@ -234,6 +235,7 @@ def _tosa_pipeline(
         self.add_pass(CastToInt32Pass())
         self.add_pass(BroadcastArgsPass())
 
+        self.add_pass(ConvertPermuteSingletonToViewPass())
         self.add_pass(FuseViewCopyTransform())
         self.add_pass(FuseConstantArgsPass(exported_program))
         self.add_pass(DecomposeConv2dWithInt16ActivationPass())
 
@@ -20,6 +20,7 @@
 
 
 def calculate_multiples(args):
+    """Returns expand args converted to repeat args, and whether the expand changes the rank"""
     input_node_or_tensor = args[0]
 
     if isinstance(input_node_or_tensor, torch.fx.node.Node):
@@ -45,7 +46,7 @@ def calculate_multiples(args):
         multiples[i] if multiples[i] != -1 and extended_shape[i] == 1 else 1
         for i in range(expanded_rank)
     ]
-    return multiples
+    return multiples, expanded_rank != len(input_shape)
 
 
 class ConvertExpandCopyToRepeatPass(ArmPass):
@@ -62,9 +63,9 @@ def call_operator(self, op, args, kwargs, meta):
         if op != self.expand_copy:
             return super().call_operator(op, args, kwargs, meta)
 
-        multiples = calculate_multiples(args)
+        multiples, changes_rank = calculate_multiples(args)
 
-        if all((x == 1 for x in multiples)):
+        if all((x == 1 for x in multiples)) and not changes_rank:
             # All dimensions/repetitions occur only once. Remove node
             # altogether since it's in practice just a copy.
             logger.warning("Found redundant expand node (no-op). Removing it.")
 
@@ -0,0 +1,62 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Sequence, Set, Tuple, Type
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+from torch._ops import OpOverload
+
+
+_PERMUTE_TARGETS: Tuple[OpOverload, ...] = (
+    exir_ops.edge.aten.permute.default,
+    exir_ops.edge.aten.permute_copy.default,
+)
+
+
+class ConvertPermuteSingletonToViewPass(ExportPass):
+    """Replace permutes that only move singleton (size-1) axes with a view_copy.
+
+    Only the ops in _PERMUTE_TARGETS are rewritten; the view shape is read from
+    meta["val"]. Example:
+    x = rand(1,1,1,4)
+    y = permute(x, (0,3,1,2))
+    becomes:
+    x = rand(1,1,1,4)
+    y = view_copy(x, (1,4,1,1))
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in _PERMUTE_TARGETS:
+            return super().call_operator(op, args, kwargs, meta)
+
+        input_tensor = args[0].data
+        permutation = args[1]
+        if not is_singleton_permutation(input_tensor.shape, permutation):
+            return super().call_operator(op, args, kwargs, meta)
+
+        output_shape = meta["val"].shape
+        view_args = (args[0], output_shape)
+        return super().call_operator(
+            exir_ops.edge.aten.view_copy.default, view_args, kwargs, meta
+        )
+
+
+def is_singleton_permutation(shape: Sequence[int], permutation: Sequence[int]) -> bool:
+    """Return True when `permutation` keeps the non-singleton axes of `shape` in
+    their original relative order, so that only size-1 axes change position.
+    Negative permutation entries are normalized modulo the rank first.
+    """
+    rank = len(shape)
+    normalized_perm = [d % rank for d in permutation]
+
+    non_singleton_axes = [i for i, size in enumerate(shape) if size != 1]
+    permuted_non_singleton_axes = [axis for axis in normalized_perm if shape[axis] != 1]
+
+    return permuted_non_singleton_axes == non_singleton_axes
@@ -17,7 +17,6 @@
 from .arm_pass_utils import create_node, get_first_fake_tensor
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
 
 
 class DecomposeEmbeddingPass(ArmPass):
 
@@ -36,6 +36,8 @@ class InsertInt32CastsAfterInt64PlaceholdersPass(ArmPass):
     # Key: op overload; Value: zero-based indices of positional args that must be i64.
     I64_INPUT_ARG_POSITIONS = {
         torch.ops.aten.one_hot.default: (0,),
+        torch.ops.aten.index_copy_.default: (2,),
+        torch.ops.aten.index_copy.default: (2,),
     }
 
     def _insert_callsite_i32_to_i64_casts(self, graph_module: torch.fx.GraphModule):
 
@@ -63,7 +63,7 @@ def _compile_tosa_flatbuffer(
         binary = vela_compile(
             tosa_flatbuffer,
             compile_flags,
-            verbose=logger.getEffectiveLevel() == logging.INFO,
+            verbose=logger.getEffectiveLevel() <= logging.INFO,
             intermediate_path=compile_spec.get_intermediate_path(),
         )
         return binary
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-e6f766c7d750d40603eee3f66c5915bac606b3ea`
	`1`	`+556fc09a9f67f24ca5591ec049c5d0c347c5f62a`
Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,8 @@ class InsertInt32CastsAfterInt64PlaceholdersPass(ArmPass):`
`36`	`36`	`# Key: op overload; Value: zero-based indices of positional args that must be i64.`
`37`	`37`	`I64_INPUT_ARG_POSITIONS = {`
`38`	`38`	`torch.ops.aten.one_hot.default: (0,),`
	`39`	`+ torch.ops.aten.index_copy_.default: (2,),`
	`40`	`+ torch.ops.aten.index_copy.default: (2,),`
`39`	`41`	`}`
`40`	`42`
`41`	`43`	`def _insert_callsite_i32_to_i64_casts(self, graph_module: torch.fx.GraphModule):`
Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,7 @@ def _compile_tosa_flatbuffer(`
`63`	`63`	`binary = vela_compile(`
`64`	`64`	`tosa_flatbuffer,`
`65`	`65`	`compile_flags,`
`66`		`- verbose=logger.getEffectiveLevel() == logging.INFO,`
	`66`	`+ verbose=logger.getEffectiveLevel() <= logging.INFO,`
`67`	`67`	`intermediate_path=compile_spec.get_intermediate_path(),`
`68`	`68`	`)`
`69`	`69`	`return binary`