Skip to content

Commit 1442563

Browse files
authored
Merge branch 'main' into add-abs-ops-to-executorch
2 parents 8d21d10 + 5e4d6b6 commit 1442563

File tree

166 files changed

+3563
-979
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

166 files changed

+3563
-979
lines changed

CMakeLists.txt

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
186186
OFF
187187
)
188188

189+
option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
190+
OFF
191+
)
192+
189193
option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)
190194

191195
option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
@@ -245,7 +249,7 @@ cmake_dependent_option(
245249
)
246250

247251
if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
248-
set(EXECUTORCH_BUILF_EXTENSION_DATA_LOADER ON)
252+
set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
249253
endif()
250254

251255
if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
@@ -348,6 +352,7 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
348352
endif()
349353

350354
if(EXECUTORCH_BUILD_TESTS)
355+
set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
351356
include(CTest)
352357
endif()
353358

@@ -373,7 +378,7 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch")
373378
"fix for this restriction."
374379
)
375380
endif()
376-
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type)
381+
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type/c10)
377382

378383
#
379384
# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
@@ -717,6 +722,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
717722
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
718723
endif()
719724

725+
if(EXECUTORCH_BUILD_EXTENSION_LLM)
726+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
727+
endif()
728+
720729
if(EXECUTORCH_BUILD_EXTENSION_MODULE)
721730
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
722731
endif()

backends/apple/coreml/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ target_include_directories(
134134
coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util
135135
)
136136
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..)
137-
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type)
137+
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
138138
target_compile_definitions(coremldelegate PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)
139139
target_link_libraries(coremldelegate PRIVATE executorch_core)
140140

backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Please refer to the license found in the LICENSE file in the root directory of the source tree.
44

55
import logging
6-
from typing import List, Optional
6+
from typing import Callable, List, Optional, Tuple
77

88
import coremltools as ct
99

@@ -104,3 +104,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
104104
return PartitionResult(
105105
tagged_exported_program=exported_program, partition_tags=partition_tags
106106
)
107+
108+
def ops_to_not_decompose(
109+
self, ep: ExportedProgram
110+
) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
111+
do_not_decompose = []
112+
op_support = OperatorsSupportedForCoreMLBackend()
113+
for node in ep.graph.nodes:
114+
if (
115+
node.op == "call_function"
116+
and isinstance(node.target, torch._ops.OpOverload)
117+
and op_support.is_node_supported(None, node)
118+
):
119+
do_not_decompose.append(node.target)
120+
return do_not_decompose, None

backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -922,7 +922,7 @@
922922
"$(SRCROOT)/../kvstore",
923923
"$(SRCROOT)/../inmemoryfs",
924924
"$(SRCROOT)/../include",
925-
"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
925+
"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
926926
"$(SRCROOT)/../sdk",
927927
"$(SRCROOT)/../util",
928928
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
@@ -954,7 +954,7 @@
954954
"$(SRCROOT)/../kvstore",
955955
"$(SRCROOT)/../inmemoryfs",
956956
"$(SRCROOT)/../include",
957-
"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
957+
"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
958958
"$(SRCROOT)/../sdk",
959959
"$(SRCROOT)/../util",
960960
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",

backends/apple/coreml/test/test_coreml_partitioner.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from executorch.backends.apple.coreml.compiler import CoreMLBackend
1515
from executorch.backends.apple.coreml.partition import CoreMLPartitioner
16+
from executorch.exir.backend.utils import format_delegated_graph
1617

1718

1819
class TestCoreMLPartitioner(unittest.TestCase):
@@ -79,6 +80,50 @@ def test_vit_skip_conv(self):
7980
"getitem",
8081
]
8182

83+
def test_ops_to_not_decompose(self):
84+
class Model(torch.nn.Module):
85+
def forward(self, q, k, v, mask):
86+
return torch.ops.aten.scaled_dot_product_attention.default(
87+
q, k, v, attn_mask=mask
88+
)
89+
90+
model = Model()
91+
model.eval()
92+
93+
batch_size = 1
94+
n_heads = 12
95+
seq_len = 1
96+
max_seq_length = 32
97+
embedding_dim = 16
98+
q = torch.randn(batch_size, n_heads, seq_len, embedding_dim)
99+
k = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
100+
v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
101+
mask = torch.randn(seq_len, max_seq_length)
102+
example_inputs = (q, k, v, mask)
103+
ep = torch.export.export(model, example_inputs)
104+
coreml_partitioner = CoreMLPartitioner()
105+
106+
# Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph
107+
edge_program_manager = executorch.exir.to_edge_transform_and_lower(
108+
ep, partitioner=[coreml_partitioner]
109+
)
110+
self.assertTrue(
111+
"executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
112+
in format_delegated_graph(
113+
edge_program_manager.exported_program().graph_module
114+
)
115+
)
116+
117+
# Using to_edge flow, we expect SDPA will be decomposed and not show up in delegated graph
118+
edge_program_manager2 = executorch.exir.to_edge(ep)
119+
edge_program_manager2.to_backend(coreml_partitioner)
120+
self.assertTrue(
121+
"executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
122+
not in format_delegated_graph(
123+
edge_program_manager2.exported_program().graph_module
124+
)
125+
)
126+
82127
def test_buffer(self):
83128
embedding_dim = 3
84129
max_seq_len = 2
@@ -129,4 +174,5 @@ def forward(self, q, k_val, input_pos):
129174
test_runner = TestCoreMLPartitioner()
130175
test_runner.test_add_sub_skip_mm()
131176
test_runner.test_vit_skip_conv()
177+
test_runner.test_ops_to_not_decompose()
132178
test_runner.test_buffer()

backends/arm/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023 Arm Limited and/or its affiliates.
1+
# Copyright 2023, 2025 Arm Limited and/or its affiliates.
22
#
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
@@ -14,15 +14,15 @@ endif()
1414

1515
include(${EXECUTORCH_ROOT}/build/Utils.cmake)
1616

17-
set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type)
17+
set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
1818
add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
1919

2020
# Third-party folder and Ethos-U driver include
2121
set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
2222
set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
2323
include_directories(${DRIVER_ETHOSU_INCLUDE_DIR})
2424

25-
set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp
25+
set(_arm_baremetal_sources backends/arm/runtime/EthosUBackend.cpp
2626
backends/arm/runtime/VelaBinStream.cpp
2727
)
2828
list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")

backends/arm/README.md

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ ethos-u-vela compilation stack. which follows the fully AoT flow.
1515
## Layout
1616

1717
Export:
18-
- `arm_backend.py` - Main entrypoint for the ArmPartitioner and ArmBackend. For more information see the section on
18+
- `ethosu_backend.py` - Main entrypoint for the EthosUBackend. For more information see the section on
1919
[Arm Backend Architecture](#arm-backend-architecture). For examples of use see `executorch/examples/arm`.
2020
- `tosa_mapping.py` - utilities for mapping edge dialect to TOSA
2121
- `tosa_quant_utils.py` - utilities for mapping quantization information to TOSA encoding
@@ -29,11 +29,11 @@ Passes:
2929
- `*_pass.py` - Compiler passes derived from ExportPass
3030

3131
Quantization:
32-
- `arm_quantizer.py` - Quantizer for Arm backend
32+
- `arm_quantizer.py` - Quantizers for Arm backend. Contains the EthosUQuantizer which inherits from the TOSAQuantizer
3333
- `arm_quantizer_utils.py` - Utilities for quantization
3434

3535
Runtime:
36-
- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (BackendInterface) for Ethos-U
36+
- `runtime/EthosUBackend.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (BackendInterface) for Ethos-U
3737

3838
Other:
3939
- `third-party/` - Dependencies on other code - in particular the TOSA serialization_lib for compiling to TOSA and the ethos-u-core-driver for the bare-metal backend supporting Ethos-U
@@ -177,6 +177,7 @@ create an issue on [github](https://www.github.com/pytorch/executorch/issues).
177177
# Arm Backend Architecture
178178

179179
The broad principle with the Arm backend implementation for ExecuTorch is to support multiple Arm devices and device configurations through a largely homogeneous flow with maximal sharing of class logic.
180+
The EthosUBackend is currently the one user-facing API that targets the Ethos-U55 and Ethos-U85 hardware IP. It uses the TOSABackend under the hood to share code and functionality, and also so that the TOSA flow itself can be tested separately.
180181

181182
In practice for compilation, this means that the flow goes via [Arm TOSA](https://www.mlplatform.org/tosa/tosa_spec.html) to produce a common IR and quantization behaviour compatible with our various IP, and typically, device-specific backends to further lower to a device specific binary which can happen ahead of time (within the Python development flow) or at runtime (during a JIT compilation stage).
182183

@@ -185,22 +186,22 @@ In practice for the runtime, this means we will share common runtime backend fun
185186

186187
## Arm Backend Status and Maturity
187188

188-
The Arm Backend should be considered a prototype quality at this point, likely subject to significant change and improvement, and with a limited coverage of functionality. We are actively developing this codebase.
189+
The Arm EthosU Backend should be considered a prototype quality at this point, likely subject to significant change and improvement, and with a limited coverage of functionality. We are actively developing this codebase.
189190

190191
## Current flows
191192

192-
The ArmBackend has a two stage process,
193-
- Compile to TOSA to rationalise the graph into known hardware support profiles. Currently this is to v0.80 TOSA BI with specific concern to a subset which gives support on Ethos-U55, the target of the initial prototype efforts.
193+
The EthosUBackend has a two stage process,
194+
- Compile to TOSA to rationalise the graph into known hardware support profiles. Currently this is to v0.80 TOSA BI with specific concern to a subset which gives support on Ethos-U55 and Ethos-U85, the target of the initial prototype efforts. This calls into the TOSABackend.
194195
- Lower via the ethos-u-vela compilation flow which takes TOSA v0.80 as an input and produces a low level commandstream for the hardware which is then passed via the delegate to the ethos-u-core-driver for direct execution.
195196

196-
The ArmPartitioner is currenly used to ensure the operations converted are Ethos-U compatible, but will be extended to offer spec-correct TOSA Base inference and TOSA Main Inference generation in future.
197+
The EthosUPartitioner is currently used to ensure the operations converted are Ethos-U compatible, but will be extended to offer spec-correct TOSA Base inference and TOSA Main Inference generation in future.
198+
199+
There is also a generic TOSABackend with accompanying TOSAPartitioner and TOSAQuantizer, which are used by the EthosUBackend and friends. The Arm TOSA Backend can be used on its own to verify the lowering to the TOSA representation of the model (refer to the unit tests in backends/arm/test, which use the TOSA backend in the test suites).
197200

198201
### Controlling compilation
199202

200203
It is possible to control the compilation flow to aid in development and debug of both networks and the code itself.
201204

202-
Configuration of the ArmBackend export flow is controlled by CompileSpec information (essentially used as compilation flags) to determine which of these outputs is produced. In particular this allows for use of the tosa_reference_model to run intermediate output to check for correctness and quantization accuracy without a full loop via hardware implemntation.
203-
204-
As this is in active development see the ArmBackend for accurate information on [compilation flags](https://github.com/pytorch/executorch/blob/29f6dc9353e90951ed3fae3c57ae416de0520067/backends/arm/arm_backend.py#L319-L324)
205+
Configuration of the EthosUBackend export flow is controlled by CompileSpec information (essentially used as compilation flags) to determine which of these outputs is produced. In particular this allows for use of the tosa_reference_model to run intermediate output to check for correctness and quantization accuracy without a full loop via hardware implementation.
205206

206-
You can also refer to the [example TOSA end-to-end code](/examples/arm/arm_tosa_e2e.py)
207+
As this is in active development see the EthosUBackend for accurate information on [compilation flags](https://github.com/pytorch/executorch/blob/29f6dc9353e90951ed3fae3c57ae416de0520067/backends/arm/arm_backend.py#L319-L324)

backends/arm/TARGETS

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
44
python_library(
55
name = "arm_partitioner",
66
srcs = [
7-
"arm_partitioner.py",
7+
"ethosu_backend.py",
8+
"ethosu_partitioner.py",
9+
"tosa_backend.py",
10+
"tosa_partitioner.py",
811
],
912
typing = True,
1013
deps = [

backends/arm/_passes/arm_pass_manager.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Copyright (c) Meta Platforms, Inc. and affiliates.
2-
# Copyright 2024-2025 Arm Limited and/or its affiliates.
32
# All rights reserved.
3+
# Copyright 2024-2025 Arm Limited and/or its affiliates.
44
#
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
@@ -18,6 +18,9 @@
1818
from executorch.backends.arm._passes.convert_expand_copy_to_repeat import (
1919
ConvertExpandCopyToRepeatPass,
2020
)
21+
from executorch.backends.arm._passes.convert_full_like_to_full_pass import (
22+
ConvertFullLikeToFullPass,
23+
)
2124
from executorch.backends.arm._passes.convert_split_to_slice import (
2225
ConvertSplitToSlicePass,
2326
)
@@ -49,6 +52,7 @@
4952
from executorch.backends.arm._passes.fuse_quantized_activation_pass import ( # type: ignore[import-not-found]
5053
FuseQuantizedActivationPass,
5154
)
55+
from executorch.backends.arm._passes.insert_rescales_pass import InsertRescalePass
5256
from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
5357
from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
5458
KeepDimsFalseToSqueezePass,
@@ -72,6 +76,7 @@
7276
UnsqueezeScalarPlaceholdersPass,
7377
)
7478
from executorch.backends.arm.tosa_specification import TosaSpecification
79+
7580
from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
7681
from executorch.exir import ExportedProgram
7782
from executorch.exir.pass_manager import PassManager
@@ -95,6 +100,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
95100
self.add_pass(ConvertMmToBmmPass())
96101
self.add_pass(DecomposeLinearPass())
97102
self.add_pass(ConvertMeanDimToAveragePoolPass())
103+
self.add_pass(ConvertFullLikeToFullPass())
98104

99105
self.add_pass(AnnotateDecomposedMatmulPass())
100106
self.add_pass(QuantizeOperatorArguments())
@@ -115,7 +121,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
115121
self.add_pass(ConvertSqueezesToViewPass())
116122

117123
self.add_pass(AnnotateChannelsLastDimOrder())
118-
124+
self.add_pass(InsertRescalePass())
119125
return self._transform(exported_program.graph_module)
120126

121127
def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
@@ -133,7 +139,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
133139
self.add_pass(ConvertMeanDimToAveragePoolPass())
134140
self.add_pass(DecomposeDivPass())
135141
self.add_pass(DecomposeSoftmaxesPass())
136-
142+
self.add_pass(ConvertFullLikeToFullPass())
137143
self.add_pass(AnnotateDecomposedMatmulPass())
138144
self.add_pass(QuantizeOperatorArguments())
139145
self.add_pass(FoldAndAnnotateQParamsPass()) # type: ignore[call-arg]
@@ -153,6 +159,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
153159
self.add_pass(ConvertSqueezesToViewPass())
154160

155161
self.add_pass(AnnotateChannelsLastDimOrder())
162+
self.add_pass(InsertRescalePass())
156163

157164
return self._transform(exported_program.graph_module)
158165

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
from executorch.exir.dialects._ops import ops as exir_ops
7+
from executorch.exir.pass_base import ExportPass
8+
9+
10+
class ConvertFullLikeToFullPass(ExportPass):
11+
"""As per the full_like pytorch documentation,
12+
`torch.full_like(input, fill_value)` is equivalent to
13+
`torch.full(input.size(),
14+
fill_value,
15+
dtype=input.dtype,
16+
layout=input.layout,
17+
device=input.device
18+
)`
19+
Skip layout and device since it's not relevant for our backend.
20+
"""
21+
22+
def call_operator(self, op, args, kwargs, meta):
23+
if op not in [
24+
exir_ops.edge.aten.full_like.default,
25+
]:
26+
return super().call_operator(op, args, kwargs, meta)
27+
28+
tensor = args[0].data
29+
full_args = (list(tensor.shape), args[1])
30+
full_kwargs = {"dtype": tensor.dtype}
31+
return super().call_operator(
32+
exir_ops.edge.aten.full.default, full_args, full_kwargs, meta
33+
)

0 commit comments

Comments
 (0)