Skip to content

Commit dd3e8fd

Browse files
author
morelos
committed
Update base for Update on "[ET-VK][Ops] quantize_per_channel reference impl and testing"
# Context In order to properly enable dynamic quantization, we create the quantize_per_channel operator as it's seemingly useful to have for the pipeline. # Changes This creates the wrapper for the cpu reference implementation, and also a dummy reference implementation I created just to test against it. Differential Revision: [D77746132](https://our.internmc.facebook.com/intern/diff/D77746132/) [ghstack-poisoned]
2 parents 08ed085 + 6669637 commit dd3e8fd

File tree

65 files changed

+3899
-292
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+3899
-292
lines changed

.ci/scripts/setup-arm-baremetal-tools.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
# Setup arm example environment (including TOSA tools)
99
git config --global user.email "[email protected]"
1010
git config --global user.name "Github Executorch"
11-
bash examples/arm/setup.sh --i-agree-to-the-contained-eula
11+
bash examples/arm/setup.sh --i-agree-to-the-contained-eula ${@:-}

.github/workflows/build-presets.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,45 @@ jobs:
3434
${CONDA_RUN} cmake --preset ${{ matrix.preset }}
3535
${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 ))
3636
37+
zephyr:
38+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
39+
strategy:
40+
fail-fast: false
41+
matrix:
42+
preset: [zephyr]
43+
with:
44+
job-name: build
45+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
46+
runner: linux.2xlarge
47+
docker-image: executorch-ubuntu-22.04-arm-sdk
48+
submodules: recursive
49+
timeout: 90
50+
script: |
51+
set -eux
52+
# The generic Linux job chooses to use base env, not the one setup by the image
53+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
54+
conda activate "${CONDA_ENV}"
55+
56+
./install_requirements.sh > /dev/null
57+
58+
# Download toolchain
59+
toolchain_url="https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v0.17.2/toolchain_linux-x86_64_arm-zephyr-eabi.tar.xz"
60+
toolchain_dir="arm-zephyr-eabi"
61+
curl --output "${toolchain_dir}.tar.xz" -L "${toolchain_url}"
62+
63+
# Verify download
64+
echo "93128be0235cf5cf5f1ee561aa6eac5f ${toolchain_dir}.tar.xz" > arm-zephyr-eabi.md5
65+
md5sum -c --strict arm-zephyr-eabi.md5
66+
67+
# Extract and install to PATH
68+
tar xf "${toolchain_dir}.tar.xz"
69+
rm -f "${toolchain_dir}.tar.xz"
70+
toolchain_bin_path="$(cd ${toolchain_dir}/bin && pwd)"
71+
export PATH=$PATH:${toolchain_bin_path}
72+
73+
# Build Arm Zephyr Preset
74+
cmake --preset ${{ matrix.preset }}
75+
cmake --build cmake-out -j$(( $(nproc) - 1 ))
3776
linux:
3877
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
3978
strategy:

.github/workflows/trunk.yml

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,10 @@ jobs:
223223
permissions:
224224
id-token: write
225225
contents: read
226+
strategy:
227+
matrix:
228+
os: [bare_metal, zephyr-preset]
229+
fail-fast: false
226230
with:
227231
runner: linux.2xlarge
228232
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -234,35 +238,62 @@ jobs:
234238
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
235239
conda activate "${CONDA_ENV}"
236240
241+
cxx_flags="-fno-exceptions -fno-rtti -Wall -Werror -Wno-int-in-bool-context -DET_HAVE_PREAD=0"
242+
setup_script_args=""
243+
if [[ ${{ matrix.os}} == "bare_metal" ]]; then
244+
toolchain_prefix=arm-none-eabi-
245+
threshold="103268" # ~100KiB
246+
toolchain_cmake=examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
247+
elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
248+
setup_script_args="--target-toolchain zephyr"
249+
toolchain_prefix=arm-zephyr-eabi-
250+
threshold="133120" # should be ~125KB, set threshold to 130KB
251+
toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
252+
else
253+
echo "Fail unsupport OS selection ${{ matrix.os }}"
254+
exit 1
255+
fi
256+
237257
source .ci/scripts/utils.sh
238258
install_executorch "--use-pt-pinned-commit"
239-
.ci/scripts/setup-arm-baremetal-tools.sh
259+
.ci/scripts/setup-arm-baremetal-tools.sh ${setup_script_args}
240260
source examples/arm/ethos-u-scratch/setup_path.sh
241261
242-
# User baremetal toolchain
243-
arm-none-eabi-c++ --version
244-
toolchain_cmake=examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
262+
# User toolchain
263+
${toolchain_prefix}c++ --version
264+
265+
# Setup cmake target to desired toolchain
245266
toolchain_cmake=$(realpath ${toolchain_cmake})
246267
247-
# Build and test size test
248-
bash test/build_size_test.sh "-DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON"
268+
# Build and run size test
269+
if [[ ${{ matrix.os}} == "bare_metal" ]]; then
270+
bash test/build_size_test.sh "-DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON"
271+
elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then
272+
CXXFLAGS=${cxx_flags} cmake --preset zephyr -DCMAKE_BUILD_TYPE=Release -DEXECUTORCH_OPTIMIZE_SIZE=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out .
273+
cmake --build cmake-out -j9 --target install --config Release
274+
CXXFLAGS=${cxx_flags} cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test
275+
cmake --build cmake-out/test -j9 --config Release
276+
else
277+
echo "Fail unsupport OS selection ${{ matrix.os }}"
278+
exit 1
279+
fi
280+
249281
elf="cmake-out/test/size_test"
250282
251283
# Dump basic info
252284
ls -al ${elf}
253-
arm-none-eabi-size ${elf}
285+
${toolchain_prefix}size ${elf}
254286
255-
# Dump symbols
287+
# Dump symbol
256288
python .github/scripts/run_nm.py -e ${elf}
257-
python .github/scripts/run_nm.py -e ${elf} -f "executorch" -p "arm-none-eabi-"
258-
python .github/scripts/run_nm.py -e ${elf} -f "executorch_text" -p "arm-none-eabi-"
289+
python .github/scripts/run_nm.py -e ${elf} -f "executorch" -p "${toolchain_prefix}"
290+
python .github/scripts/run_nm.py -e ${elf} -f "executorch_text" -p "${toolchain_prefix}"
259291
260292
# Add basic guard - TODO: refine this!
261-
arm-none-eabi-strip ${elf}
293+
${toolchain_prefix}strip ${elf}
262294
output=$(ls -la ${elf})
263295
arr=($output)
264296
size=${arr[4]}
265-
threshold="103268" # ~100KiB
266297
echo "size: $size, threshold: $threshold"
267298
if [[ "$size" -le "$threshold" ]]; then
268299
echo "Success $size <= $threshold"

CMakePresets.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,17 @@
104104
"Windows"
105105
]
106106
}
107+
},
108+
{
109+
"name": "zephyr",
110+
"displayName": "Build everything buildable on Zephyr RTOS",
111+
"inherits": [
112+
"common"
113+
],
114+
"cacheVariables": {
115+
"EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/zephyr.cmake",
116+
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake"
117+
}
107118
}
108119
]
109120
}

backends/arm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from .convert_squeezes_to_view import ConvertSqueezesToViewPass # noqa
2424
from .convert_to_clamp import ConvertToClampPass # noqa
2525
from .decompose_acosh_pass import DecomposeAcoshPass # noqa
26+
from .decompose_adaptive_avg_pool2d_pass import DecomposeAdaptiveAvgPool2dPass # noqa
2627
from .decompose_atan_pass import DecomposeAtanPass # noqa
2728
from .decompose_avg_pool2d import DecomposeAvgPool2d # noqa
2829
from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass # noqa
@@ -41,6 +42,7 @@
4142
from .decompose_ne_pass import DecomposeNotEqualPass # noqa
4243
from .decompose_round_pass import DecomposeRoundPass # noqa
4344
from .decompose_select import DecomposeSelectPass # noqa
45+
from .decompose_sign_pass import DecomposeSignPass # noqa
4446
from .decompose_silu_pass import DecomposeSiluPass # noqa
4547
from .decompose_sinh_pass import DecomposeSinhPass # noqa
4648
from .decompose_softmax_pass import DecomposeSoftmaxPass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
# LICENSE file in the root directory of this source tree.
77

88
# pyre-unsafe
9+
10+
import executorch.backends.arm.tosa.dialect # noqa: unused
911
from executorch.backends.arm._passes import (
1012
AddBiasPass,
1113
AnnotateChannelsLastDimOrder,
@@ -26,6 +28,7 @@
2628
ConvertSqueezesToViewPass,
2729
ConvertToClampPass,
2830
DecomposeAcoshPass,
31+
DecomposeAdaptiveAvgPool2dPass,
2932
DecomposeAtanPass,
3033
DecomposeAvgPool2d,
3134
DecomposeBatchNormNoStatsPass,
@@ -44,6 +47,7 @@
4447
DecomposeNotEqualPass,
4548
DecomposeRoundPass,
4649
DecomposeSelectPass,
50+
DecomposeSignPass,
4751
DecomposeSiluPass,
4852
DecomposeSinhPass,
4953
DecomposeSoftmaxPass,
@@ -124,6 +128,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
124128
if self.tosa_spec.is_U55_subset:
125129
self.add_pass(BroadcastArgsPass())
126130
self.add_pass(DecomposeLinearPass())
131+
self.add_pass(DecomposeAdaptiveAvgPool2dPass())
127132
self.add_pass(DecomposeAvgPool2d())
128133
self.add_pass(ComputeConstantOpsAOT(exported_program))
129134

@@ -158,6 +163,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
158163
self.add_pass(ConvertIntPowToMuls())
159164
self.add_pass(CastBoolToInt8Pass())
160165
self.add_pass(DecomposeSinhPass())
166+
self.add_pass(DecomposeSignPass())
161167
self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI())
162168
self.add_pass(DecomposeEmbeddingPass())
163169
self.add_pass(FuseQuantizedActivationPass())
@@ -190,6 +196,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
190196
self.add_pass(RetraceFoldedDtypesPass())
191197
self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
192198
self.add_pass(MatchArgRanksPass(exported_program))
199+
self.add_pass(DecomposeAdaptiveAvgPool2dPass())
193200
self.add_pass(DecomposeAvgPool2d())
194201
self.add_pass(ComputeConstantOpsAOT(exported_program))
195202

@@ -242,6 +249,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
242249
self.add_pass(DecomposeScaledDotProductAttention())
243250
self.add_pass(DecomposeRoundPass())
244251
self.add_pass(CastBoolToInt8Pass())
252+
self.add_pass(DecomposeSignPass())
245253
self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
246254
self.add_pass(ScalarsToAttributePass())
247255
self.add_pass(DecomposeGroupNormPass())
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
from math import ceil, floor
7+
8+
import torch
9+
10+
from executorch.backends.arm._passes import ArmPass
11+
12+
from executorch.exir.dialects._ops import ops as exir_ops
13+
14+
edge_ops = (exir_ops.edge.aten._adaptive_avg_pool2d.default,)
15+
aten_ops = (torch.ops.aten.adaptive_avg_pool2d.default,)
16+
17+
18+
def _get_decomposition(op) -> tuple:
    """Return the (avg_pool2d, slice_copy, cat) ops matching *op*'s dialect.

    Edge-dialect input ops map to edge-dialect decomposition ops; aten-dialect
    input ops map to aten ops. Raises RuntimeError for any other operator.
    """
    if op in edge_ops:
        ns = exir_ops.edge.aten
    elif op in aten_ops:
        ns = torch.ops.aten
    else:
        raise RuntimeError(f"Unable to get decomposition for op {op}")
    return (
        ns.avg_pool2d.default,
        ns.slice_copy.Tensor,
        ns.cat.default,
    )
32+
33+
34+
class DecomposeAdaptiveAvgPool2dPass(ArmPass):
    """
    Decomposes AdaptiveAvgPool2d into AvgPool2d operations.

    An input tensor of shape (N, C, H, W) is transformed into an output tensor
    of shape (N, C, output_size_h, output_size_w).

    The output is of size output_size_h x output_size_w for any input.
    """

    def call_operator(self, op, args, kwargs, meta, updated=False):
        # Pass through anything that is not an adaptive average pool.
        if op not in (edge_ops + aten_ops):
            return super().call_operator(op, args, kwargs, meta, updated)

        avg_pool_op, slice_copy_op, concat_op = _get_decomposition(op)

        x = args[0]
        _, _, in_h, in_w = x.data.shape
        out_h, out_w = args[1]

        # Vela currently only allows a stride in the interval of [1,3] for AvgPool2d.
        # Instead of striding, average each output cell's input region with its
        # own AvgPool2d and stitch the results back together with cat.
        rows = []
        for oh in range(out_h):
            cells = []
            for ow in range(out_w):
                # Input region contributing to output element (oh, ow),
                # following the standard adaptive-pooling region formula.
                h_begin = floor(oh * in_h / out_h)
                h_end = ceil((oh + 1) * in_h / out_h)
                w_begin = floor(ow * in_w / out_w)
                w_end = ceil((ow + 1) * in_w / out_w)

                # Crop the region: first along H (dim 2), then along W (dim 3).
                region = super().call_operator(
                    slice_copy_op, (x, 2, h_begin, h_end), kwargs, meta, True
                )
                region = super().call_operator(
                    slice_copy_op, (region, 3, w_begin, w_end), kwargs, meta, True
                )

                # Average the whole region: kernel covers it fully, stride 1, no padding.
                cells.append(
                    super().call_operator(
                        avg_pool_op,
                        (region, (h_end - h_begin, w_end - w_begin), (1, 1), (0, 0)),
                        kwargs,
                        meta,
                        True,
                    )
                )

            # Stitch one output row together along width (dim 3).
            rows.append(
                super().call_operator(concat_op, (cells, 3), kwargs, meta, True)
            )

        # Stack all rows along height (dim 2) to form the final output.
        return super().call_operator(concat_op, (rows, 2), kwargs, meta, True)
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
import torch
7+
8+
from executorch.backends.arm._passes import ArmPass
9+
from executorch.exir.dialects._ops import ops as exir_ops
10+
11+
12+
# For MI case
13+
edge_sign = exir_ops.edge.aten.sign.default
14+
# For BI case
15+
aten_sign = torch.ops.aten.sign.default
16+
17+
18+
def get_ops(op):
    """Returns the appropriate operator functions based on the input operator.

    Resolves (gt, lt, where, neg, mul, add) in the same dialect as *op*:
    edge ops for the edge-dialect sign, aten ops for the aten-dialect sign.
    Raises ValueError for any other operator.
    """
    if op == edge_sign:
        ns = exir_ops.edge.aten
    elif op == aten_sign:
        ns = torch.ops.aten
    else:
        raise ValueError(f"Unsupported operator: {op}")
    return (
        ns.gt.Scalar,
        ns.lt.Scalar,
        ns.where.self,
        ns.neg.default,
        ns.mul.Scalar,
        ns.add.Scalar,
    )
40+
41+
42+
class DecomposeSignPass(ArmPass):
    """Decomposes the sign operator into a sequence of operations that are supported by the Arm backend.

    sign(x) is rewritten as nested selects over comparison masks:
    -1 where x < 0, +1 where x > 0, and 0 elsewhere.
    """

    # NOTE: `updated=False` added for consistency with the sibling
    # DecomposeAdaptiveAvgPool2dPass.call_operator signature; it is
    # forwarded on the pass-through branch and defaults preserve the
    # previous behavior for existing callers.
    def call_operator(self, op, args, kwargs, meta, updated=False):
        if op not in (edge_sign, aten_sign):
            return super().call_operator(op, args, kwargs, meta, updated)

        gt_op, lt_op, where_op, neg_op, mul_op, add_op = get_ops(op)

        x = args[0]

        # Boolean masks for the strictly-positive and strictly-negative elements.
        gt_mask = super().call_operator(gt_op, (x, 0.0), {}, meta, updated=True)
        lt_mask = super().call_operator(lt_op, (x, 0.0), {}, meta, updated=True)

        # Build constant tensors shaped like x: zeros = x * 0, ones = zeros + 1.
        zeros = super().call_operator(mul_op, (x, 0.0), {}, meta, updated=True)
        ones = super().call_operator(add_op, (zeros, 1.0), {}, meta, updated=True)
        neg_ones = super().call_operator(neg_op, (ones,), {}, meta, updated=True)

        # -1 where x < 0, else 0.
        negative_tensor = super().call_operator(
            where_op, (lt_mask, neg_ones, zeros), {}, meta, updated=True
        )
        # +1 where x > 0, else 0.
        positive_tensor = super().call_operator(
            where_op, (gt_mask, ones, zeros), {}, meta, updated=True
        )

        # Combine: negatives take -1, everything else takes the positive/zero branch.
        return super().call_operator(
            where_op,
            (lt_mask, negative_tensor, positive_tensor),
            {},
            meta,
            updated=True,
        )

0 commit comments

Comments
 (0)