Skip to content

Commit 534973e

Browse files
author
morelos
committed
Update on "[ET-VK][ez][Ops] registering Q/DQ/CQP ops and specifying optimal storage"
# Context Certain quantization operators need scales and zeros to be set with a storage layout as buffers. Since the existing op_registry does not allow specifying the memory or storage layout of input parameters, we need to specify that the optimal storage type is buffer so that a conversion pass is added to ensure that the inputs are also buffers. # Changes This moves the quantized_decomposed operators into their own registration, while also specifying that buffer is preferred. Differential Revision: [D77746131](https://our.internmc.facebook.com/intern/diff/D77746131/) [ghstack-poisoned]
2 parents 734e1f8 + 50325b3 commit 534973e

File tree

65 files changed

+3899
-292
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+3899
-292
lines changed

.ci/scripts/setup-arm-baremetal-tools.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
# Setup arm example environment (including TOSA tools)
99
git config --global user.email "[email protected]"
1010
git config --global user.name "Github Executorch"
11-
bash examples/arm/setup.sh --i-agree-to-the-contained-eula
11+
bash examples/arm/setup.sh --i-agree-to-the-contained-eula ${@:-}

.github/workflows/build-presets.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,45 @@ jobs:
3434
${CONDA_RUN} cmake --preset ${{ matrix.preset }}
3535
${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 ))
3636
37+
zephyr:
38+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
39+
strategy:
40+
fail-fast: false
41+
matrix:
42+
preset: [zephyr]
43+
with:
44+
job-name: build
45+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
46+
runner: linux.2xlarge
47+
docker-image: executorch-ubuntu-22.04-arm-sdk
48+
submodules: recursive
49+
timeout: 90
50+
script: |
51+
set -eux
52+
# The generic Linux job chooses to use base env, not the one setup by the image
53+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
54+
conda activate "${CONDA_ENV}"
55+
56+
./install_requirements.sh > /dev/null
57+
58+
# Download toolchain
59+
toolchain_url="https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v0.17.2/toolchain_linux-x86_64_arm-zephyr-eabi.tar.xz"
60+
toolchain_dir="arm-zephyr-eabi"
61+
curl --output "${toolchain_dir}.tar.xz" -L "${toolchain_url}"
62+
63+
# Verify download
64+
echo "93128be0235cf5cf5f1ee561aa6eac5f ${toolchain_dir}.tar.xz" > arm-zephyr-eabi.md5
65+
md5sum -c --strict arm-zephyr-eabi.md5
66+
67+
# Extract and install to PATH
68+
tar xf "${toolchain_dir}.tar.xz"
69+
rm -f "${toolchain_dir}.tar.xz"
70+
toolchain_bin_path="$(cd ${toolchain_dir}/bin && pwd)"
71+
export PATH=$PATH:${toolchain_bin_path}
72+
73+
# Build Arm Zephyr Preset
74+
cmake --preset ${{ matrix.preset }}
75+
cmake --build cmake-out -j$(( $(nproc) - 1 ))
3776
linux:
3877
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
3978
strategy:

.github/workflows/trunk.yml

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,10 @@ jobs:
223223
permissions:
224224
id-token: write
225225
contents: read
226+
strategy:
227+
matrix:
228+
os: [bare_metal, zephyr-preset]
229+
fail-fast: false
226230
with:
227231
runner: linux.2xlarge
228232
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -234,35 +238,62 @@ jobs:
234238
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
235239
conda activate "${CONDA_ENV}"
236240
241+
cxx_flags="-fno-exceptions -fno-rtti -Wall -Werror -Wno-int-in-bool-context -DET_HAVE_PREAD=0"
242+
setup_script_args=""
243+
if [[ ${{ matrix.os}} == "bare_metal" ]]; then
244+
toolchain_prefix=arm-none-eabi-
245+
threshold="103268" # ~100KiB
246+
toolchain_cmake=examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
247+
elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
248+
setup_script_args="--target-toolchain zephyr"
249+
toolchain_prefix=arm-zephyr-eabi-
250+
threshold="133120" # should be ~125KB, set threshold to 130KB
251+
toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
252+
else
253+
echo "Fail unsupport OS selection ${{ matrix.os }}"
254+
exit 1
255+
fi
256+
237257
source .ci/scripts/utils.sh
238258
install_executorch "--use-pt-pinned-commit"
239-
.ci/scripts/setup-arm-baremetal-tools.sh
259+
.ci/scripts/setup-arm-baremetal-tools.sh ${setup_script_args}
240260
source examples/arm/ethos-u-scratch/setup_path.sh
241261
242-
# User baremetal toolchain
243-
arm-none-eabi-c++ --version
244-
toolchain_cmake=examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
262+
# User toolchain
263+
${toolchain_prefix}c++ --version
264+
265+
# Setup cmake target to desired toolchain
245266
toolchain_cmake=$(realpath ${toolchain_cmake})
246267
247-
# Build and test size test
248-
bash test/build_size_test.sh "-DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON"
268+
# Build and run size test
269+
if [[ ${{ matrix.os}} == "bare_metal" ]]; then
270+
bash test/build_size_test.sh "-DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON"
271+
elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
272+
CXXFLAGS=${cxx_flags} cmake --preset zephyr -DCMAKE_BUILD_TYPE=Release -DEXECUTORCH_OPTIMIZE_SIZE=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out .
273+
cmake --build cmake-out -j9 --target install --config Release
274+
CXXFLAGS=${cxx_flags} cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test
275+
cmake --build cmake-out/test -j9 --config Release
276+
else
277+
echo "Fail unsupport OS selection ${{ matrix.os }}"
278+
exit 1
279+
fi
280+
249281
elf="cmake-out/test/size_test"
250282
251283
# Dump basic info
252284
ls -al ${elf}
253-
arm-none-eabi-size ${elf}
285+
${toolchain_prefix}size ${elf}
254286
255-
# Dump symbols
287+
# Dump symbol
256288
python .github/scripts/run_nm.py -e ${elf}
257-
python .github/scripts/run_nm.py -e ${elf} -f "executorch" -p "arm-none-eabi-"
258-
python .github/scripts/run_nm.py -e ${elf} -f "executorch_text" -p "arm-none-eabi-"
289+
python .github/scripts/run_nm.py -e ${elf} -f "executorch" -p "${toolchain_prefix}"
290+
python .github/scripts/run_nm.py -e ${elf} -f "executorch_text" -p "${toolchain_prefix}"
259291
260292
# Add basic guard - TODO: refine this!
261-
arm-none-eabi-strip ${elf}
293+
${toolchain_prefix}strip ${elf}
262294
output=$(ls -la ${elf})
263295
arr=($output)
264296
size=${arr[4]}
265-
threshold="103268" # ~100KiB
266297
echo "size: $size, threshold: $threshold"
267298
if [[ "$size" -le "$threshold" ]]; then
268299
echo "Success $size <= $threshold"

CMakePresets.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,17 @@
104104
"Windows"
105105
]
106106
}
107+
},
108+
{
109+
"name": "zephyr",
110+
"displayName": "Build everything buildable on Zephyr RTOS",
111+
"inherits": [
112+
"common"
113+
],
114+
"cacheVariables": {
115+
"EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/zephyr.cmake",
116+
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake"
117+
}
107118
}
108119
]
109120
}

backends/arm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from .convert_squeezes_to_view import ConvertSqueezesToViewPass # noqa
2424
from .convert_to_clamp import ConvertToClampPass # noqa
2525
from .decompose_acosh_pass import DecomposeAcoshPass # noqa
26+
from .decompose_adaptive_avg_pool2d_pass import DecomposeAdaptiveAvgPool2dPass # noqa
2627
from .decompose_atan_pass import DecomposeAtanPass # noqa
2728
from .decompose_avg_pool2d import DecomposeAvgPool2d # noqa
2829
from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass # noqa
@@ -41,6 +42,7 @@
4142
from .decompose_ne_pass import DecomposeNotEqualPass # noqa
4243
from .decompose_round_pass import DecomposeRoundPass # noqa
4344
from .decompose_select import DecomposeSelectPass # noqa
45+
from .decompose_sign_pass import DecomposeSignPass # noqa
4446
from .decompose_silu_pass import DecomposeSiluPass # noqa
4547
from .decompose_sinh_pass import DecomposeSinhPass # noqa
4648
from .decompose_softmax_pass import DecomposeSoftmaxPass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
# LICENSE file in the root directory of this source tree.
77

88
# pyre-unsafe
9+
10+
import executorch.backends.arm.tosa.dialect # noqa: unused
911
from executorch.backends.arm._passes import (
1012
AddBiasPass,
1113
AnnotateChannelsLastDimOrder,
@@ -26,6 +28,7 @@
2628
ConvertSqueezesToViewPass,
2729
ConvertToClampPass,
2830
DecomposeAcoshPass,
31+
DecomposeAdaptiveAvgPool2dPass,
2932
DecomposeAtanPass,
3033
DecomposeAvgPool2d,
3134
DecomposeBatchNormNoStatsPass,
@@ -44,6 +47,7 @@
4447
DecomposeNotEqualPass,
4548
DecomposeRoundPass,
4649
DecomposeSelectPass,
50+
DecomposeSignPass,
4751
DecomposeSiluPass,
4852
DecomposeSinhPass,
4953
DecomposeSoftmaxPass,
@@ -124,6 +128,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
124128
if self.tosa_spec.is_U55_subset:
125129
self.add_pass(BroadcastArgsPass())
126130
self.add_pass(DecomposeLinearPass())
131+
self.add_pass(DecomposeAdaptiveAvgPool2dPass())
127132
self.add_pass(DecomposeAvgPool2d())
128133
self.add_pass(ComputeConstantOpsAOT(exported_program))
129134

@@ -158,6 +163,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
158163
self.add_pass(ConvertIntPowToMuls())
159164
self.add_pass(CastBoolToInt8Pass())
160165
self.add_pass(DecomposeSinhPass())
166+
self.add_pass(DecomposeSignPass())
161167
self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI())
162168
self.add_pass(DecomposeEmbeddingPass())
163169
self.add_pass(FuseQuantizedActivationPass())
@@ -190,6 +196,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
190196
self.add_pass(RetraceFoldedDtypesPass())
191197
self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
192198
self.add_pass(MatchArgRanksPass(exported_program))
199+
self.add_pass(DecomposeAdaptiveAvgPool2dPass())
193200
self.add_pass(DecomposeAvgPool2d())
194201
self.add_pass(ComputeConstantOpsAOT(exported_program))
195202

@@ -242,6 +249,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
242249
self.add_pass(DecomposeScaledDotProductAttention())
243250
self.add_pass(DecomposeRoundPass())
244251
self.add_pass(CastBoolToInt8Pass())
252+
self.add_pass(DecomposeSignPass())
245253
self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
246254
self.add_pass(ScalarsToAttributePass())
247255
self.add_pass(DecomposeGroupNormPass())
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
from math import ceil, floor
7+
8+
import torch
9+
10+
from executorch.backends.arm._passes import ArmPass
11+
12+
from executorch.exir.dialects._ops import ops as exir_ops
13+
14+
edge_ops = (exir_ops.edge.aten._adaptive_avg_pool2d.default,)
15+
aten_ops = (torch.ops.aten.adaptive_avg_pool2d.default,)
16+
17+
18+
def _get_decomposition(op) -> tuple:
19+
if op in edge_ops:
20+
return (
21+
exir_ops.edge.aten.avg_pool2d.default,
22+
exir_ops.edge.aten.slice_copy.Tensor,
23+
exir_ops.edge.aten.cat.default,
24+
)
25+
if op in aten_ops:
26+
return (
27+
torch.ops.aten.avg_pool2d.default,
28+
torch.ops.aten.slice_copy.Tensor,
29+
torch.ops.aten.cat.default,
30+
)
31+
raise RuntimeError(f"Unable to get decomposition for op {op}")
32+
33+
34+
class DecomposeAdaptiveAvgPool2dPass(ArmPass):
35+
"""
36+
Decomposes AdaptiveAvgPool2d into AvgPool2d operations.
37+
38+
An input tensor of shape (N, C, H, W) is transformed into an output tensor
39+
of shape (N, C, output_size_h, output_size_w).
40+
41+
The output is of size output_size_h x output_size_w for any input.
42+
"""
43+
44+
def call_operator(self, op, args, kwargs, meta, updated=False):
45+
if op not in (edge_ops + aten_ops):
46+
return super().call_operator(op, args, kwargs, meta, updated)
47+
48+
avg_pool2d_op, slice_op, cat_op = _get_decomposition(op)
49+
50+
x = args[0]
51+
_, _, input_size_h, input_size_w = x.data.shape
52+
53+
(output_size_h, output_size_w) = args[1]
54+
55+
# Vela currently only allows a stride in the interval of [1,3] for AvgPool2d.
56+
# To accommodate this, the AvgPool2d op is applied to pooling regions and the results are concatenated.
57+
58+
res = []
59+
for out_i in range(output_size_h):
60+
row = []
61+
for out_j in range(output_size_w):
62+
# Calculate pooling regions
63+
start_h = floor(out_i * input_size_h / output_size_h)
64+
end_h = ceil((out_i + 1) * input_size_h / output_size_h)
65+
start_w = floor(out_j * input_size_w / output_size_w)
66+
end_w = ceil((out_j + 1) * input_size_w / output_size_w)
67+
68+
# Slice along H
69+
x_h = super().call_operator(
70+
slice_op, (x, 2, start_h, end_h), kwargs, meta, True
71+
)
72+
# Slice along W
73+
x_hw = super().call_operator(
74+
slice_op, (x_h, 3, start_w, end_w), kwargs, meta, True
75+
)
76+
77+
# Apply avg pooling with kernel size equal to the pooling region
78+
kernel_h = end_h - start_h
79+
kernel_w = end_w - start_w
80+
pool_args = (x_hw, (kernel_h, kernel_w), (1, 1), (0, 0))
81+
pooled = super().call_operator(
82+
avg_pool2d_op, pool_args, kwargs, meta, True
83+
)
84+
row.append(pooled)
85+
86+
# Concatenate row results along width (dim=3)
87+
row_tensor = super().call_operator(cat_op, (row, 3), kwargs, meta, True)
88+
res.append(row_tensor)
89+
90+
# Concatenate all rows along height (dim=2)
91+
out = super().call_operator(cat_op, (res, 2), kwargs, meta, True)
92+
return out
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
import torch
7+
8+
from executorch.backends.arm._passes import ArmPass
9+
from executorch.exir.dialects._ops import ops as exir_ops
10+
11+
12+
# For MI case
13+
edge_sign = exir_ops.edge.aten.sign.default
14+
# For BI case
15+
aten_sign = torch.ops.aten.sign.default
16+
17+
18+
def get_ops(op):
19+
"""Returns the appropriate operator functions based on the input operator."""
20+
if op == edge_sign:
21+
return (
22+
exir_ops.edge.aten.gt.Scalar,
23+
exir_ops.edge.aten.lt.Scalar,
24+
exir_ops.edge.aten.where.self,
25+
exir_ops.edge.aten.neg.default,
26+
exir_ops.edge.aten.mul.Scalar,
27+
exir_ops.edge.aten.add.Scalar,
28+
)
29+
elif op == aten_sign:
30+
return (
31+
torch.ops.aten.gt.Scalar,
32+
torch.ops.aten.lt.Scalar,
33+
torch.ops.aten.where.self,
34+
torch.ops.aten.neg.default,
35+
torch.ops.aten.mul.Scalar,
36+
torch.ops.aten.add.Scalar,
37+
)
38+
else:
39+
raise ValueError(f"Unsupported operator: {op}")
40+
41+
42+
class DecomposeSignPass(ArmPass):
43+
"""Decomposes the sign operator into a sequence of operations that are supported by the Arm backend."""
44+
45+
def call_operator(self, op, args, kwargs, meta):
46+
if op not in (edge_sign, aten_sign):
47+
return super().call_operator(op, args, kwargs, meta)
48+
49+
gt_op, lt_op, where_op, neg_op, mul_op, add_op = get_ops(op)
50+
51+
x = args[0]
52+
53+
gt_mask = super().call_operator(gt_op, (x, 0.0), {}, meta, updated=True)
54+
lt_mask = super().call_operator(lt_op, (x, 0.0), {}, meta, updated=True)
55+
56+
zeros = super().call_operator(mul_op, (x, 0.0), {}, meta, updated=True)
57+
ones = super().call_operator(add_op, (zeros, 1.0), {}, meta, updated=True)
58+
neg_ones = super().call_operator(neg_op, (ones,), {}, meta, updated=True)
59+
60+
negative_tensor = super().call_operator(
61+
where_op, (lt_mask, neg_ones, zeros), {}, meta, updated=True
62+
)
63+
positive_tensor = super().call_operator(
64+
where_op, (gt_mask, ones, zeros), {}, meta, updated=True
65+
)
66+
67+
return super().call_operator(
68+
where_op,
69+
(lt_mask, negative_tensor, positive_tensor),
70+
{},
71+
meta,
72+
updated=True,
73+
)

0 commit comments

Comments
 (0)