
Commit a07d815

Merge branch 'main' into SuppportUnaryLogInXNNPACKDelegate
2 parents: 6240d2f + 851b373

162 files changed (+4274 additions, -1737 deletions)


.buckconfig

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@
 
 [buck2]
 restarter=true
+file_watcher=notify
 
 [oss]
 folly_cxx_tests = False

.ci/scripts/build-qnn-sdk.sh

File mode changed: 100644 -> 100755
Lines changed: 4 additions & 9 deletions
@@ -11,17 +11,12 @@ set -o xtrace
 
 build_qnn_backend() {
   echo "Start building qnn backend."
-  export ANDROID_NDK_ROOT=/opt/ndk
-  export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
+  export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk}
+  export QNN_SDK_ROOT=${QNN_SDK_ROOT:-/tmp/qnn/2.28.0.241029}
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
 
-  # Workaround to avoid issues around missing flatccrt library (depending on the
-  # number of jobs used), see issue #7300:
-  # Build twice (second time with `--no_clean`) to make sure libflatccrt.a is
-  # available.
-  # TODO: Remove this workaround once the underlying issue is fixed.
-  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release || \
-    bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release --no_clean
+  parallelism=$(( $(nproc) - 1 ))
+  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number ${parallelism} --release
 }
 
 set_up_aot() {
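
For context on the two shell idioms used above, here is a minimal Python sketch of the same behavior — environment variables with overridable defaults, and a leave-one-core-free job count. The snippet is illustrative only and not part of the commit; the paths mirror the script's defaults.

import os

# Python analogue of ${ANDROID_NDK_ROOT:-/opt/ndk}: use the environment value
# if set, otherwise fall back to the default.
android_ndk_root = os.environ.get("ANDROID_NDK_ROOT", "/opt/ndk")
qnn_sdk_root = os.environ.get("QNN_SDK_ROOT", "/tmp/qnn/2.28.0.241029")

# Analogue of parallelism=$(( $(nproc) - 1 )): leave one core free,
# clamped so at least one job always runs.
parallelism = max(1, (os.cpu_count() or 2) - 1)
print(f"--job_number {parallelism}")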

CMakeLists.txt

Lines changed: 14 additions & 10 deletions
@@ -48,21 +48,33 @@ project(executorch)
 # MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION --------------------------------------------------
 
 include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
+include(${PROJECT_SOURCE_DIR}/tools/cmake/Utils.cmake)
+include(CMakeDependentOption)
+include(ExternalProject)
 
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
 endif()
 announce_configured_options(CMAKE_CXX_STANDARD)
 
+if(NOT CMAKE_SYSTEM_PROCESSOR)
+  set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR})
+endif()
+announce_configured_options(CMAKE_SYSTEM_PROCESSOR)
+
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Debug)
 endif()
 announce_configured_options(CMAKE_BUILD_TYPE)
 
+if(NOT PYTHON_EXECUTABLE)
+  resolve_python_executable()
+endif()
+announce_configured_options(PYTHON_EXECUTABLE)
+
 announce_configured_options(CMAKE_CXX_COMPILER_ID)
 announce_configured_options(CMAKE_TOOLCHAIN_FILE)
 announce_configured_options(BUCK2)
-announce_configured_options(PYTHON_EXECUTABLE)
 
 load_build_preset()
 include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
@@ -72,10 +84,6 @@ print_configured_options()
 
 # MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION ----------------------------------------------------
 
-include(tools/cmake/Utils.cmake)
-include(CMakeDependentOption)
-include(ExternalProject)
-
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Setup RPATH.
@@ -251,11 +259,6 @@ if(EXECUTORCH_BUILD_TESTS)
   include(CTest)
 endif()
 
-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-message(STATUS "Using python executable '${PYTHON_EXECUTABLE}'")
-
 # TODO(dbort): Fix these warnings and remove this flag.
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)
@@ -579,6 +582,7 @@ if(EXECUTORCH_BUILD_PYBIND)
     ${TORCH_PYTHON_LIBRARY}
     bundled_program
     etdump
+    flatccrt
     executorch
     extension_data_loader
     util

CMakePresets.json

Lines changed: 15 additions & 0 deletions
@@ -15,6 +15,7 @@
     },
     {
       "name": "macos-arm64",
+      "displayName": "Build everything buildable on macOS arm64",
       "inherits": ["common"],
       "generator": "Xcode",
       "cacheVariables": {
@@ -28,6 +29,20 @@
         "type": "equals",
         "rhs": "Darwin"
       }
+    },
+    {
+      "name": "pybind",
+      "displayName": "Build pybindings exported in the wheel",
+      "inherits": ["common"],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/pybind.cmake",
+        "CMAKE_OSX_DEPLOYMENT_TARGET": "10.15"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Darwin", "Linux", "Windows"]
+      }
     }
   ]
 }

backends/apple/mps/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
@@ -18,10 +18,6 @@ endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-
 set(_common_compile_options -Wno-deprecated-declarations)
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 

backends/arm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@
 from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
 from .arm_pass import ArmPass  # noqa
+from .broadcast_args_pass import BroadcastArgsPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa
@@ -24,6 +25,7 @@
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
+from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 7 additions & 0 deletions
@@ -10,6 +10,7 @@
 from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
     AnnotateDecomposedMatmulPass,
+    BroadcastArgsPass,
     CastInt64BuffersToInt32Pass,
     CastToInt32Pass,
     ComputeConstantOpsAOT,
@@ -29,6 +30,7 @@
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
+    DecomposeLinearVectorNormPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
     DecomposeSelectPass,
@@ -86,6 +88,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
@@ -102,6 +105,8 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
+        if self.tosa_spec.is_U55_subset:
+            self.add_pass(BroadcastArgsPass())
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(RemoveClonePass())
@@ -133,6 +138,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(FuseBatchnorm2DPass(exported_program))
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeBatchNormPass())
         self.add_pass(DecomposeLayerNormPass())
@@ -207,6 +213,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeCosineSimilarityPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeSqrtPass())
         self.add_pass(DecomposeSiluPass())
 

backends/arm/_passes/broadcast_args_pass.py (new file)

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm._passes import ArmPass
+
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+)
+
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from executorch.exir.pass_base import PassResult
+from torch.fx import GraphModule, Node
+
+
+class BroadcastArgsPass(ArmPass):
+    """
+    Pass to manually broadcast arguments by inserting repeats.
+    This is done when more than one arg needs broadcasting.
+    """
+
+    targeted_ops = {
+        exir_ops.edge.aten.add.Tensor,
+        exir_ops.edge.aten.sub.Tensor,
+        # mul indirectly targets div as well, since div is decomposed into
+        # reciprocal + mul
+        exir_ops.edge.aten.mul.Tensor,
+    }
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function" or node.target not in self.targeted_ops:
+                continue
+
+            output_shape = get_first_fake_tensor(node).shape
+            nbr_of_broadcasts = 0
+            for arg in node.args:
+                if not isinstance(arg, Node):
+                    continue
+
+                shape = get_first_fake_tensor(arg).shape
+                if shape != output_shape:
+                    nbr_of_broadcasts += 1
+                    # Only broadcasts beyond the first one are materialized
+                    # with an explicit repeat.
+                    if nbr_of_broadcasts > 1:
+                        multiples = [
+                            int(output_shape[d] / shape[d])
+                            for d in range(len(output_shape))
+                        ]
+                        with graph_module.graph.inserting_before(node):
+                            repeat = create_node(
+                                graph_module.graph,
+                                exir_ops.edge.aten.repeat.default,
+                                args=(arg, multiples),
+                                kwargs={},
+                                from_node=node,
+                            )
+                            node.replace_input_with(arg, repeat)
+
+        graph_module.recompile()
+        graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
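
A small sketch in plain torch of the arithmetic the pass performs, using hypothetical shapes (illustration only, not part of the commit): repeating a (1, 3, 1) tensor by the computed multiples materializes the same values that implicit broadcasting to (2, 3, 4) would produce.

import torch

# Hypothetical arg/output shapes, as the pass might see on an add node.
arg = torch.randn(1, 3, 1)
output_shape = (2, 3, 4)

# Same multiples computation as BroadcastArgsPass: copies needed per dim.
multiples = [output_shape[d] // arg.shape[d] for d in range(len(output_shape))]
assert multiples == [2, 1, 4]

# An explicit repeat produces exactly what implicit broadcasting would.
repeated = arg.repeat(*multiples)
assert repeated.shape == torch.Size(output_shape)
assert torch.equal(repeated, arg.expand(output_shape))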
backends/arm/_passes/decompose_linalg_vector_norm_pass.py (new file)

Lines changed: 78 additions & 0 deletions

@@ -0,0 +1,78 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass
+
+
+class DecomposeLinearVectorNormPass(ExportPass):
+    """
+    This pass decomposes aten.linalg_vector_norm.default into more primitive ops.
+    We need to add this pass before quantization for graph annotation.
+    By default, the aten.linalg_vector_norm op is decomposed during legalization
+    to Edge IR.
+
+    The decomposition is as follows:
+
+    For p == 1:
+        out = REDUCE_SUM(ABS(x), dims, keepdim)
+
+    For p == 2:
+        out = SQRT(REDUCE_SUM(MUL(x, x), dims, keepdim))
+
+    For arbitrary p:
+        We don't support arbitrary p, because the decomposition would look like
+            out = POW(REDUCE_SUM(POW(ABS(x), p), dims, keepdim), 1/p)
+        In that case we would need to wrap p in a Tensor and know its dtype
+        up front, which we cannot tell from the FX graph.
+    """
+
+    torch_linalg_vector_norm = (torch.ops.aten.linalg_vector_norm.default,)
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in self.torch_linalg_vector_norm:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Extract inputs and optional arguments.
+        # Expected args:
+        #   args[0]: input tensor
+        #   args[1]: norm order 'p' (optional, default: 2.0)
+        #   args[2]: dimensions to reduce (must be provided)
+        #   args[3]: keepdim flag (optional, default: False)
+        input_tensor = args[0]
+        norm_order = args[1] if len(args) > 1 else 2.0
+        norm_dim = args[2] if len(args) > 2 else None
+        keepdim = args[3] if len(args) > 3 else False
+
+        if norm_order not in (1, 2):
+            raise ValueError(
+                f"Order {norm_order} is not supported for the "
+                "linalg_vector_norm operator"
+            )
+
+        if norm_dim is None:
+            raise ValueError("The norm_dim for linalg_vector_norm is None.")
+
+        dims = [norm_dim] if isinstance(norm_dim, int) else list(norm_dim)
+
+        # Decomposition based on norm order.
+        if norm_order == 1:
+            # For p == 1, decomposition is sum(abs(x), dims, keepdim).
+            op1 = super().call_operator(
+                torch.ops.aten.abs.default, (input_tensor,), {}, meta
+            )
+            op2 = super().call_operator(
+                torch.ops.aten.sum.dim_IntList, (op1, dims, keepdim), {}, meta
+            )
+            return op2
+
+        elif norm_order == 2:
+            # For p == 2, decomposition is sqrt(sum(x * x, dims, keepdim)).
+            op1 = super().call_operator(
+                torch.ops.aten.mul.Tensor, (input_tensor, input_tensor), {}, meta
+            )
+            op2 = super().call_operator(
+                torch.ops.aten.sum.dim_IntList, (op1, dims, keepdim), {}, meta
+            )
+            op3 = super().call_operator(torch.ops.aten.sqrt.default, (op2,), {}, meta)
+            return op3
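
The two supported decompositions can be sanity-checked numerically against the original operator; a minimal sketch in plain torch with arbitrary shapes (illustration only, not part of the commit):

import torch

x = torch.randn(4, 5)

# p == 1: out = REDUCE_SUM(ABS(x), dims, keepdim)
ref1 = torch.linalg.vector_norm(x, ord=1, dim=1, keepdim=True)
dec1 = torch.sum(torch.abs(x), dim=1, keepdim=True)
assert torch.allclose(ref1, dec1)

# p == 2: out = SQRT(REDUCE_SUM(MUL(x, x), dims, keepdim))
ref2 = torch.linalg.vector_norm(x, ord=2, dim=1, keepdim=True)
dec2 = torch.sqrt(torch.sum(x * x, dim=1, keepdim=True))
assert torch.allclose(ref2, dec2)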

backends/arm/arm_vela.py

Lines changed: 2 additions & 2 deletions
@@ -73,8 +73,8 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
         np_path = os.path.join(tmpdir, "output", "out_vela.npz")
     else:
         np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
-    blocks = b""
 
+    blocks = b""
     with np.load(np_path, allow_pickle=False) as data:
         # Construct our modified output_blocks with data in a form easily
         # digested on the device side
@@ -92,7 +92,7 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
         if not isinstance(data["scratch_shape"][0], np.int64):
            raise RuntimeError("Expected scratch to be int64")
         block_length = int(data["scratch_shape"][0])
-        bin_blocks["scratch_data"] = b"\x00" * block_length
+        bin_blocks["scratch_size"] = struct.pack("<I", block_length)
 
         # Capture inputs and outputs
         bin_blocks["inputs"] = vela_bin_pack_io("input", data)
