
Commit 6d969bf

Merge branch 'main' into use_xnnpack_defines_unary_binary
2 parents 309eef4 + 8cfa858

222 files changed: +6568 -1260 lines changed

.github/workflows/android-perf-private-device-experiment.yml

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -34,7 +34,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'Qwen/Qwen3-0.6B' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
      devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}
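
The `models` expression above (the same pattern recurs in the other perf workflows in this commit) chains `||` and `&&`: a manually supplied `inputs.models` always wins, scheduled runs get the full model list, and every other trigger falls back to the single Qwen model. A minimal Python sketch of that resolution logic, assuming GitHub Actions expressions short-circuit and return operand values the way JavaScript's `&&`/`||` do (Python's `and`/`or` behave the same way):

```python
SCHEDULED_MODELS = (
    "Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,"
    "meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf"
)
DEFAULT_MODEL = "Qwen/Qwen3-0.6B"


def resolve_models(inputs_models: str, event_name: str) -> str:
    # Mirrors: inputs.models || github.event_name == 'schedule' && LONG || SHORT
    return inputs_models or (
        (event_name == "schedule" and SCHEDULED_MODELS) or DEFAULT_MODEL
    )


assert resolve_models("", "schedule") == SCHEDULED_MODELS    # cron: full sweep
assert resolve_models("", "pull_request") == DEFAULT_MODEL   # other triggers
assert resolve_models("my/model", "schedule") == "my/model"  # manual input wins
```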

.github/workflows/android-perf.yml

Lines changed: 3 additions & 3 deletions

@@ -22,7 +22,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -38,7 +38,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -72,7 +72,7 @@ jobs:
         # Separate default values from the workflow dispatch. To ensure defaults are accessible
         # during scheduled runs and to provide flexibility for different defaults between
         # on-demand and periodic benchmarking.
-        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'llama' }}
+        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
         CRON_DEFAULT_DEVICES: samsung_galaxy_s22
       run: |
         set -eux

.github/workflows/apple-perf-private-device-experiment.yml

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -34,7 +34,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'Qwen/Qwen3-0.6B' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
       devices: apple_iphone_15_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/apple-perf.yml

Lines changed: 3 additions & 3 deletions

@@ -22,7 +22,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -38,7 +38,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -72,7 +72,7 @@ jobs:
         # Separate default values from the workflow dispatch. To ensure defaults are accessible
         # during scheduled runs and to provide flexibility for different defaults between
         # on-demand and periodic benchmarking.
-        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'llama' }}
+        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
         CRON_DEFAULT_DEVICES: apple_iphone_15
       run: |
         set -eux

.github/workflows/trunk.yml

Lines changed: 0 additions & 29 deletions

@@ -693,32 +693,3 @@ jobs:
       build-mode: Release
       build-tool: cmake
       docker-image: executorch-ubuntu-22.04-clang12
-
-  unittest-nxp-neutron:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    permissions:
-      id-token: write
-      contents: read
-    with:
-      runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-clang12
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        set -eux
-
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Build and install Executorch
-        PYTHON_EXECUTABLE=python \
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
-        .ci/scripts/setup-linux.sh --build-tool "cmake"
-
-        # Install test requirements
-        pip install -r backends/nxp/requirements-tests.txt
-
-        # Run pytest
-        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

backends/arm/README.md

Lines changed: 6 additions & 0 deletions

@@ -187,3 +187,9 @@ It is possible to control the compilation flow to aid in development and debug o
 Configuration of the EthosUBackend export flow is controlled by CompileSpec information (essentially used as compilation flags) to determine which of these outputs is produced. In particular this allows for use of the tosa_reference_model to run intermediate output to check for correctness and quantization accuracy without a full loop via hardware implementation.

 As this is in active development see the EthosUBackend for accurate information on [compilation flags](https://github.com/pytorch/executorch/blob/29f6dc9353e90951ed3fae3c57ae416de0520067/backends/arm/arm_backend.py#L319-L324)
+
+## Model-specific and optional passes
+The current TOSA version does not support int64. For LLMs such as Llama, aten.embedding is often the first operator, and it requires int64 indices.
+To lower it to TOSA, an int64->int32 cast needs to be injected. This pass needs to run very early in the lowering process and can be passed to the to_edge_transform_and_lower() function call as an optional parameter. See the example in backends/arm/test/models/test_llama.py.
+With the cast in place, aten.embedding is decomposed into aten.index_select, which can handle int32 indices.
+Note that this additional step is only needed for pure float models. With quantization this is handled automatically during annotation, before the export stage.
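
To make the README addition concrete, here is a rough sketch of wiring the cast pass into an export flow, modeled loosely on backends/arm/test/models/test_llama.py. It assumes `to_edge_transform_and_lower()` accepts the extra passes via its `transform_passes` parameter and that a float (non-quantized) model is being lowered; `TinyEmbedding` is a made-up toy module:

```python
import torch
from executorch.backends.arm._passes import InsertCastForOpsWithInt64InputPass
from executorch.exir import to_edge_transform_and_lower


class TinyEmbedding(torch.nn.Module):
    """Toy stand-in for an LLM front end: embedding wants int64 indices."""

    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(num_embeddings=128, embedding_dim=16)

    def forward(self, tokens):
        return self.emb(tokens)


tokens = torch.zeros(1, 8, dtype=torch.long)  # int64, unsupported by TOSA
exported = torch.export.export(TinyEmbedding(), (tokens,))

# Inject the int64 -> int32 cast early so aten.embedding can later be
# decomposed into aten.index_select with int32 indices.
edge = to_edge_transform_and_lower(
    exported,
    transform_passes=[InsertCastForOpsWithInt64InputPass()],
)
```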

backends/arm/_passes/__init__.py

Lines changed: 4 additions & 0 deletions

@@ -22,6 +22,7 @@
 from .convert_to_clamp import ConvertToClampPass  # noqa
 from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
+from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_groupnorm_pass import DecomposeGroupNormPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
@@ -46,6 +47,9 @@
 from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass  # noqa
 from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass  # noqa
 from .fuse_quantized_activation_pass import FuseQuantizedActivationPass  # noqa
+from .insert_int64_input_cast_pass import (  # noqa
+    InsertCastForOpsWithInt64InputPass,
+)
 from .insert_rescales_pass import InsertRescalePass  # noqa
 from .insert_table_ops import InsertTableOpsPass  # noqa
 from .match_arg_ranks_pass import MatchArgRanksPass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 5 additions & 1 deletion

@@ -6,7 +6,6 @@
 # LICENSE file in the root directory of this source tree.

 # pyre-unsafe
-
 from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
     AnnotateDecomposedMatmulPass,
@@ -26,6 +25,7 @@
     ConvertToClampPass,
     DecomposeCosineSimilarityPass,
     DecomposeDivPass,
+    DecomposeEmbeddingPass,
     DecomposeGeluPass,
     DecomposeGroupNormPass,
     DecomposeLayerNormPass,
@@ -46,6 +46,7 @@
     FuseConstantArgsPass,
     FuseEqualPlaceholdersPass,
     FuseQuantizedActivationPass,
+    InsertCastForOpsWithInt64InputPass,
     InsertRescalePass,
     InsertTableOpsPass,
     MatchArgRanksPass,
@@ -139,6 +140,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(DecomposeSqrtPass())
         self.add_pass(ConvertIntPowToMuls())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI())
+        self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
         self.add_pass(ConvertSplitToSlicePass())
@@ -211,6 +213,8 @@ def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
         )

     def transform_for_annotation_pipeline(self, graph_module: GraphModule):
+        self.add_pass(InsertCastForOpsWithInt64InputPass())
+        self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(ScalarsToAttributePass())
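
In `transform_for_annotation_pipeline` the ordering is deliberate: `InsertCastForOpsWithInt64InputPass` runs before `DecomposeEmbeddingPass`, so the decomposition sees int32 indices, matching the expectation stated in the pass docstring below. A hypothetical standalone run of the decomposition on a toy exported graph, assuming the pass can be invoked directly on a `graph_module` outside the Arm pipelines:

```python
import torch
from executorch.backends.arm._passes import DecomposeEmbeddingPass


class M(torch.nn.Module):
    def forward(self, weights, indices):
        return torch.ops.aten.embedding.default(weights, indices)


weights = torch.randn(32, 8)
indices = torch.zeros(2, 4, dtype=torch.int32)  # already int32, as the pass expects
ep = torch.export.export(M(), (weights, indices))

result = DecomposeEmbeddingPass()(ep.graph_module)
assert result.modified
# aten.embedding is gone; view_copy -> index_select -> view_copy remains.
print(result.graph_module.graph)
```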

backends/arm/_passes/decompose_embedding_pass.py

Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+
+import logging
+from math import prod
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .arm_pass_utils import create_node, get_first_fake_tensor
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
+
+class DecomposeEmbeddingPass(ExportPass):
+    """
+    This pass decomposes embedding into index_select.
+
+    Example:
+        o = embedding(w, i)
+    Becomes:
+        i = view_copy(i)  # flatten indices
+        o = index_select(w, i)
+        o = view_copy(o)  # reshape back output
+    Note:
+        i = indices is expected to be int32 before this pass
+    """
+
+    aten_ops = (torch.ops.aten.embedding.default,)
+    edge_ops = (exir_ops.edge.aten.embedding.default,)
+
+    def get_decomposition(self, op):
+        if op in self.aten_ops:
+            return (
+                torch.ops.aten.view_copy.default,
+                torch.ops.aten.index_select.default,
+            )
+
+        if op in self.edge_ops:
+            return (
+                exir_ops.edge.aten.view_copy.default,
+                exir_ops.edge.aten.index_select.default,
+            )
+        raise RuntimeError(
+            f"[{self.__class__.__name__}] Can't get decomposition for op {op}"
+        )
+
+    def call(self, graph_module):
+        graph = graph_module.graph
+        modified_graph = False
+
+        for node in graph.nodes:
+            if node.op != "call_function":
+                continue
+            if node.target not in self.aten_ops + self.edge_ops:
+                continue
+
+            args = node.args
+
+            weights = args[0]
+            indices = args[1]
+
+            weights_shape = get_first_fake_tensor(weights).shape
+            indices_shape = get_first_fake_tensor(indices).shape
+
+            output_shape = torch.Size(list(indices_shape) + [weights_shape[1]])
+            if output_shape != get_first_fake_tensor(node).shape:
+                raise RuntimeError(
+                    f"[{self.__class__.__name__}] Unexpected output shape mismatch {output_shape} "
+                    f"!= {get_first_fake_tensor(node).shape}"
+                )
+
+            view_copy_op, index_select_op = self.get_decomposition(node.target)
+
+            with graph.inserting_before(node):
+                reshaped_indices = [prod(list(indices_shape))]
+                flattened_indices = create_node(
+                    graph=graph,
+                    op_target=view_copy_op,
+                    args=(indices, reshaped_indices),
+                )
+                node.replace_input_with(indices, flattened_indices)
+
+                index_select = create_node(
+                    graph=graph,
+                    op_target=index_select_op,
+                    args=(weights, 0, flattened_indices),
+                )
+                node.replace_all_uses_with(index_select)
+                graph.erase_node(node)
+
+            with graph.inserting_after(index_select):
+                restored_output = create_node(
+                    graph,
+                    view_copy_op,
+                )
+                restored_output.args = (
+                    index_select,
+                    output_shape,
+                )
+                original_users = [
+                    user for user in index_select.users if user != restored_output
+                ]
+                for user in original_users:
+                    user.replace_input_with(index_select, restored_output)
+
+            modified_graph = True
+
+        if modified_graph:
+            graph.eliminate_dead_code()
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, modified_graph)
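
For reference, the equivalence the pass encodes can be checked eagerly. This is only an illustration of the flatten / index_select / reshape rewrite with made-up shapes, not part of the commit:

```python
import torch

weights = torch.randn(10, 4)  # vocab_size x embedding_dim
indices = torch.tensor([[1, 3], [5, 7]], dtype=torch.int32)  # shape (2, 2)

reference = torch.ops.aten.embedding.default(weights, indices)

flat = indices.reshape(indices.numel())        # view_copy: flatten indices to (4,)
picked = torch.index_select(weights, 0, flat)  # gather rows -> (4, 4)
restored = picked.reshape(2, 2, 4)             # view_copy: indices shape + [dim]

assert torch.equal(reference, restored)
```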
