
Commit 6d969bf

Merge branch 'main' into use_xnnpack_defines_unary_binary
2 parents 309eef4 + 8cfa858

222 files changed: +6568 -1260 lines changed

.github/workflows/android-perf-private-device-experiment.yml

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -34,7 +34,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'Qwen/Qwen3-0.6B' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
      devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}
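
The `models` expression above (the same pattern recurs in the other perf workflows in this commit) chains `||` and `&&`: a manually supplied `inputs.models` always wins, scheduled runs get the full model list, and every other trigger falls back to the single Qwen model. A minimal Python sketch of that resolution logic, assuming GitHub Actions expressions short-circuit and return operand values the way JavaScript's `&&`/`||` do (Python's `and`/`or` behave the same way):

```python
SCHEDULED_MODELS = (
    "Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,"
    "meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf"
)
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"


def resolve_models(inputs_models: str, event_name: str) -> str:
    # Mirrors: inputs.models || github.event_name == 'schedule' && LONG || SHORT
    return inputs_models or (
        (event_name == "schedule" and SCHEDULED_MODELS) or DEFAULT_MODEL
    )


assert resolve_models("", "schedule") == SCHEDULED_MODELS    # cron: full sweep
assert resolve_models("", "pull_request") == DEFAULT_MODEL   # other triggers
assert resolve_models("my/model", "schedule") == "my/model"  # manual input wins
```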

.github/workflows/android-perf.yml

Lines changed: 3 additions & 3 deletions

@@ -22,7 +22,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -38,7 +38,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -72,7 +72,7 @@ jobs:
         # Separate default values from the workflow dispatch. To ensure defaults are accessible
         # during scheduled runs and to provide flexibility for different defaults between
         # on-demand and periodic benchmarking.
-        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'llama' }}
+        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
         CRON_DEFAULT_DEVICES: samsung_galaxy_s22
       run: |
         set -eux

.github/workflows/apple-perf-private-device-experiment.yml

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -34,7 +34,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'Qwen/Qwen3-0.6B' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
       devices: apple_iphone_15_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/apple-perf.yml

Lines changed: 3 additions & 3 deletions

@@ -22,7 +22,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -38,7 +38,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -72,7 +72,7 @@ jobs:
         # Separate default values from the workflow dispatch. To ensure defaults are accessible
         # during scheduled runs and to provide flexibility for different defaults between
         # on-demand and periodic benchmarking.
-        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'llama' }}
+        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
         CRON_DEFAULT_DEVICES: apple_iphone_15
       run: |
         set -eux

.github/workflows/trunk.yml

Lines changed: 0 additions & 29 deletions

@@ -693,32 +693,3 @@ jobs:
       build-mode: Release
       build-tool: cmake
       docker-image: executorch-ubuntu-22.04-clang12
-
-  unittest-nxp-neutron:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    permissions:
-      id-token: write
-      contents: read
-    with:
-      runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-clang12
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        set -eux
-
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Build and install Executorch
-        PYTHON_EXECUTABLE=python \
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
-        .ci/scripts/setup-linux.sh --build-tool "cmake"
-
-        # Install test requirements
-        pip install -r backends/nxp/requirements-tests.txt
-
-        # Run pytest
-        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

backends/arm/README.md

Lines changed: 6 additions & 0 deletions

@@ -187,3 +187,9 @@ It is possible to control the compilation flow to aid in development and debug o
 Configuration of the EthosUBackend export flow is controlled by CompileSpec information (essentially used as compilation flags) to determine which of these outputs is produced. In particular this allows for use of the tosa_reference_model to run intermediate output to check for correctness and quantization accuracy without a full loop via hardware implementation.

 As this is in active development see the EthosUBackend for accurate information on [compilation flags](https://github.com/pytorch/executorch/blob/29f6dc9353e90951ed3fae3c57ae416de0520067/backends/arm/arm_backend.py#L319-L324)
+
+## Model-specific and optional passes
+The current TOSA version does not support int64. For LLMs such as Llama, aten.embedding is often the first operator, and it requires int64 indices.
+To lower it to TOSA, an int64->int32 cast needs to be injected. This pass needs to run very early in the lowering process and can be passed to the to_edge_transform_and_lower() function call as an optional parameter. See the example in backends/arm/test/models/test_llama.py.
+With the cast in place, aten.embedding is decomposed into aten.index_select, which can handle int32 indices.
+Note that this additional step is only needed for pure float models. With quantization this is handled automatically during annotation, before the export stage.
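
To make the README addition concrete, here is a rough sketch of wiring the cast pass into an export flow, modeled loosely on backends/arm/test/models/test_llama.py. It assumes `to_edge_transform_and_lower()` accepts the extra passes via its `transform_passes` parameter and that a float (non-quantized) model is being lowered; `TinyEmbedding` is a made-up toy module:

```python
import torch
from executorch.backends.arm._passes import InsertCastForOpsWithInt64InputPass
from executorch.exir import to_edge_transform_and_lower


class TinyEmbedding(torch.nn.Module):
    """Toy stand-in for an LLM front end: embedding wants int64 indices."""

    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(num_embeddings=128, embedding_dim=16)

    def forward(self, tokens):
        return self.emb(tokens)


tokens = torch.zeros(1, 8, dtype=torch.long)  # int64, unsupported by TOSA
exported = torch.export.export(TinyEmbedding(), (tokens,))

# Inject the int64 -> int32 cast early so aten.embedding can later be
# decomposed into aten.index_select with int32 indices.
edge = to_edge_transform_and_lower(
    exported,
    transform_passes=[InsertCastForOpsWithInt64InputPass()],
)
```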

backends/arm/_passes/__init__.py

Lines changed: 4 additions & 0 deletions

@@ -22,6 +22,7 @@
 from .convert_to_clamp import ConvertToClampPass  # noqa
 from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
+from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_groupnorm_pass import DecomposeGroupNormPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
@@ -46,6 +47,9 @@
 from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass  # noqa
 from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass  # noqa
 from .fuse_quantized_activation_pass import FuseQuantizedActivationPass  # noqa
+from .insert_int64_input_cast_pass import (  # noqa
+    InsertCastForOpsWithInt64InputPass,
+)
 from .insert_rescales_pass import InsertRescalePass  # noqa
 from .insert_table_ops import InsertTableOpsPass  # noqa
 from .match_arg_ranks_pass import MatchArgRanksPass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 5 additions & 1 deletion

@@ -6,7 +6,6 @@
 # LICENSE file in the root directory of this source tree.

 # pyre-unsafe
-
 from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
     AnnotateDecomposedMatmulPass,
@@ -26,6 +25,7 @@
     ConvertToClampPass,
     DecomposeCosineSimilarityPass,
     DecomposeDivPass,
+    DecomposeEmbeddingPass,
     DecomposeGeluPass,
     DecomposeGroupNormPass,
     DecomposeLayerNormPass,
@@ -46,6 +46,7 @@
     FuseConstantArgsPass,
     FuseEqualPlaceholdersPass,
     FuseQuantizedActivationPass,
+    InsertCastForOpsWithInt64InputPass,
     InsertRescalePass,
     InsertTableOpsPass,
     MatchArgRanksPass,
@@ -139,6 +140,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(DecomposeSqrtPass())
         self.add_pass(ConvertIntPowToMuls())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI())
+        self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
         self.add_pass(ConvertSplitToSlicePass())
@@ -211,6 +213,8 @@ def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
         )

     def transform_for_annotation_pipeline(self, graph_module: GraphModule):
+        self.add_pass(InsertCastForOpsWithInt64InputPass())
+        self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(ScalarsToAttributePass())
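
In `transform_for_annotation_pipeline` the ordering is deliberate: `InsertCastForOpsWithInt64InputPass` runs before `DecomposeEmbeddingPass`, so the decomposition sees int32 indices, matching the expectation stated in the pass docstring below. A hypothetical standalone run of the decomposition on a toy exported graph, assuming the pass can be invoked directly on a `graph_module` outside the Arm pipelines:

```python
import torch
from executorch.backends.arm._passes import DecomposeEmbeddingPass


class M(torch.nn.Module):
    def forward(self, weights, indices):
        return torch.ops.aten.embedding.default(weights, indices)


weights = torch.randn(32, 8)
indices = torch.zeros(2, 4, dtype=torch.int32)  # already int32, as the pass expects
ep = torch.export.export(M(), (weights, indices))

result = DecomposeEmbeddingPass()(ep.graph_module)
assert result.modified
# aten.embedding is gone; view_copy -> index_select -> view_copy remains.
print(result.graph_module.graph)
```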

backends/arm/_passes/decompose_embedding_pass.py

Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+
+import logging
+from math import prod
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .arm_pass_utils import create_node, get_first_fake_tensor
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
+
+class DecomposeEmbeddingPass(ExportPass):
+    """
+    This pass decomposes embedding into index_select.
+
+    Example:
+        o = embedding(w, i)
+    Becomes:
+        i = view_copy(i)  # flatten indices
+        o = index_select(w, i)
+        o = view_copy(o)  # reshape back output
+    Note:
+        i = indices is expected to be int32 before this pass
+    """
+
+    aten_ops = (torch.ops.aten.embedding.default,)
+    edge_ops = (exir_ops.edge.aten.embedding.default,)
+
+    def get_decomposition(self, op):
+        if op in self.aten_ops:
+            return (
+                torch.ops.aten.view_copy.default,
+                torch.ops.aten.index_select.default,
+            )
+
+        if op in self.edge_ops:
+            return (
+                exir_ops.edge.aten.view_copy.default,
+                exir_ops.edge.aten.index_select.default,
+            )
+        raise RuntimeError(
+            f"[{self.__class__.__name__}] Can't get decomposition for op {op}"
+        )
+
+    def call(self, graph_module):
+        graph = graph_module.graph
+        modified_graph = False
+
+        for node in graph.nodes:
+            if node.op != "call_function":
+                continue
+            if node.target not in self.aten_ops + self.edge_ops:
+                continue
+
+            args = node.args
+
+            weights = args[0]
+            indices = args[1]
+
+            weights_shape = get_first_fake_tensor(weights).shape
+            indices_shape = get_first_fake_tensor(indices).shape
+
+            output_shape = torch.Size(list(indices_shape) + [weights_shape[1]])
+            if output_shape != get_first_fake_tensor(node).shape:
+                raise RuntimeError(
+                    f"[{self.__class__.__name__}] Unexpected output shape mismatch {output_shape} "
+                    f"!= {get_first_fake_tensor(node).shape}"
+                )
+
+            view_copy_op, index_select_op = self.get_decomposition(node.target)
+
+            with graph.inserting_before(node):
+                reshaped_indices = [prod(list(indices_shape))]
+                flattened_indices = create_node(
+                    graph=graph,
+                    op_target=view_copy_op,
+                    args=(indices, reshaped_indices),
+                )
+                node.replace_input_with(indices, flattened_indices)
+
+                index_select = create_node(
+                    graph=graph,
+                    op_target=index_select_op,
+                    args=(weights, 0, flattened_indices),
+                )
+                node.replace_all_uses_with(index_select)
+                graph.erase_node(node)
+
+            with graph.inserting_after(index_select):
+                restored_output = create_node(
+                    graph,
+                    view_copy_op,
+                )
+                restored_output.args = (
+                    index_select,
+                    output_shape,
+                )
+                original_users = [
+                    user for user in index_select.users if user != restored_output
+                ]
+                for user in original_users:
+                    user.replace_input_with(index_select, restored_output)
+
+            modified_graph = True
+
+        if modified_graph:
+            graph.eliminate_dead_code()
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, modified_graph)
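
For reference, the equivalence the pass encodes can be checked eagerly. This is only an illustration of the flatten / index_select / reshape rewrite with made-up shapes, not part of the commit:

```python
import torch

weights = torch.randn(10, 4)  # vocab_size x embedding_dim
indices = torch.tensor([[1, 3], [5, 7]], dtype=torch.int32)  # shape (2, 2)

reference = torch.ops.aten.embedding.default(weights, indices)

flat = indices.reshape(indices.numel())        # view_copy: flatten indices to (4,)
picked = torch.index_select(weights, 0, flat)  # gather rows -> (4, 4)
restored = picked.reshape(2, 2, 4)             # view_copy: indices shape + [dim]

assert torch.equal(reference, restored)
```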
