Skip to content

Commit 43b50ff

Browse files
authored
Add initial backends/cadence/vision module scaffold with optimized softmax kernel (no iDMA), fix new op dependencies, update namespace (#12480)
Differential Revision: D82685201 Pull Request resolved: #14398
1 parent a523306 commit 43b50ff

40 files changed

+5030
-0
lines changed

backends/cadence/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ elseif(EXECUTORCH_FUSION_G3_OPT)
8888
${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib
8989
${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
9090
)
91+
elseif(EXECUTORCH_VISION_OPT)
92+
set(TARGET_DIR vision)
93+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
9194
else()
9295
set(TARGET_DIR reference)
9396
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This yaml file contains operators that are also defined by the ATen library.
# For lean mode:
# - Codegen'd target `executorch_generated_lib` will be reading all the information
# from this file, including operator schema and kernel metadata.
# - Selective build target `codegen:executorch_defined_ops` now is selecting all the
# operators in this file, by dumping all the op names into `selected_operators.yaml`.
#
# See the README.md file in executorch/kernels/portable for a description of the syntax used
# by this file.

# aten ops
- op: _to_copy.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::to_copy_out

- op: _softmax.out
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::_softmax_out

- op: add.out
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::add_out

- op: bmm.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::bmm_out

- op: cat.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::cat_out

- op: clone.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::clone_out

- op: div.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::div_out

- op: div.out_mode
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::div_out_mode

- op: embedding.out
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::embedding_out

- op: empty.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::empty_out

- op: expand_copy.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::expand_copy_out

- op: full.out
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::full_out

- op: gelu.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::gelu_out

- op: hardtanh.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::hardtanh_out

- op: max_pool2d_with_indices.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::max_pool2d_with_indices_out

- op: mean.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::mean_dim_out

- op: mul.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::mul_out

- op: mul.Scalar_out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::mul_scalar_out

- op: permute_copy.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::permute_copy_out

- op: rsqrt.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::rsqrt_out

- op: sigmoid.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::sigmoid_out

- op: slice_copy.Tensor_out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::slice_copy_Tensor_out

- op: split_with_sizes_copy.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::split_with_sizes_copy_out

- op: sub.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::sub_out

- op: view_copy.out
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::view_copy_out

- op: where.self_out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::where_out

- op: transpose_copy.int_out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::transpose_copy_int_out

- op: eq.Scalar_out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::eq_scalar_out

- op: logical_not.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::logical_not_out

- op: any.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::any_out

- op: native_group_norm.out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::native_group_norm_out

- op: sum.IntList_out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::sum_dim_out

- op: select_copy.int_out
  kernels:
    - arg_meta: null
      kernel_name: torch::executor::select_copy_int_out

# custom ops
- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantize_per_tensor_out

- func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::dequantize_per_tensor_out

- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_conv_out

- func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_layer_norm_out

- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_layer_norm_per_tensor_out

- func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_linear_out

- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_relu_out

- func: cadence::quantized_relu.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_relu_per_tensor_out

- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_matmul_out

- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_linear_per_tensor_out

- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::im2row_out

- func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::im2row_per_tensor_out

- func: cadence::quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_conv_per_tensor_out

- func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_fully_connected_out

- func: cadence::quantized_fully_connected.per_tensor_out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::quantized_fully_connected_per_tensor_out

- func: cadence::requantize.out(Tensor input, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::requantize_out

- func: cadence::requantize.per_tensor_out(Tensor input, float in_scale, int in_zero_point, float out_scale, int out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: impl::vision::native::requantize_per_tensor_out
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Build ExecuTorch for the Cadence vision backend with the Xtensa toolchain,
# then run a small smoke-test model through the executor runner.
set -euo pipefail

# Start from a clean toolchain environment and pin the vision DSP core.
unset CMAKE_PREFIX_PATH
unset XTENSA_CORE
export XTENSA_CORE=XRC_Vision_130_AO
git submodule sync
git submodule update --init --recursive
./install_requirements.sh
./install_executorch.sh

# Always configure from scratch.
rm -rf cmake-out

# When true, configure/build ExecuTorch and the Cadence backend as two
# separate CMake invocations; otherwise do a single combined build.
STEPWISE_BUILD=false

if [ "$STEPWISE_BUILD" = true ]; then
  echo "Building ExecuTorch"
  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
    -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \
    -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
    -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
    -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
    -DEXECUTORCH_BUILD_CPUINFO=OFF \
    -DEXECUTORCH_ENABLE_LOGGING=ON \
    -DEXECUTORCH_USE_DL=OFF \
    -DEXECUTORCH_BUILD_CADENCE=OFF \
    -Bcmake-out .

  echo "Building any Cadence-specific binaries on top"
  # NOTE(review): $BUCK must be set in the environment; under `set -u` an
  # unset BUCK aborts the script here — confirm callers export it.
  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \
    -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
    -DCMAKE_INSTALL_PREFIX=cmake-out \
    -DCMAKE_BUILD_TYPE=Release \
    -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
    -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
    -DEXECUTORCH_BUILD_CADENCE=ON \
    -DEXECUTORCH_ENABLE_LOGGING=ON \
    -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \
    -DEXECUTORCH_USE_DL=OFF \
    -DEXECUTORCH_BUILD_PORTABLE_OPS=ON \
    -DEXECUTORCH_BUILD_KERNELS_LLM=OFF \
    -DPYTHON_EXECUTABLE=python3 \
    -DEXECUTORCH_VISION_OPT=ON \
    -DHAVE_FNMATCH_H=OFF \
    -Bcmake-out/backends/cadence \
    backends/cadence
  cmake --build cmake-out/backends/cadence -j8
else
  echo "Building Cadence toolchain with ExecuTorch packages"
  # Point CMake at the ExecuTorch and gflags packages inside the build tree.
  prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \
    -DCMAKE_PREFIX_PATH="${prefix_path}" \
    -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
    -DCMAKE_INSTALL_PREFIX=cmake-out \
    -DCMAKE_BUILD_TYPE=Release \
    -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
    -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
    -DEXECUTORCH_BUILD_CPUINFO=OFF \
    -DEXECUTORCH_BUILD_CADENCE=ON \
    -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
    -DEXECUTORCH_ENABLE_LOGGING=ON \
    -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \
    -DEXECUTORCH_USE_DL=OFF \
    -DEXECUTORCH_BUILD_PORTABLE_OPS=ON \
    -DEXECUTORCH_BUILD_KERNELS_LLM=OFF \
    -DPYTHON_EXECUTABLE=python3 \
    -DEXECUTORCH_VISION_OPT=ON \
    -DHAVE_FNMATCH_H=OFF \
    -Bcmake-out
  cmake --build cmake-out --target install --config Release -j8
fi

# Export a trivial model and run it on the Xtensa simulator as a smoke test.
echo "Run simple model to verify cmake build"
python3 -m examples.portable.scripts.export --model_name="add"
xt-run --turbo cmake-out/executor_runner --model_path=add.pte
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# lint_cmake: -linelength

# Kernel library for the Cadence vision backend: the backend's kernels.cpp
# plus the vision nnlib sources it calls (transpose, softmax, and the
# exp/nan/inf lookup tables the softmax implementation reads).
add_library(
  cadence_kernels
  kernels.cpp
  ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/tensor_transposef.c
  ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
  ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/expf_tbl.c
  ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
  ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/inff_tbl.c
)

# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories
    ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
)

# Use an explicit ${CMAKE_CURRENT_SOURCE_DIR} instead of the bare "." so the
# PUBLIC usage requirement is unambiguous for consumers of this target.
target_include_directories(
  cadence_kernels
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
         ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include
         ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include_private
         ${_common_include_directories}
)

# NOTE(review): the commit describes this scaffold as "no iDMA", yet the
# target links against idma — confirm this dependency is still required.
target_link_libraries(cadence_kernels PRIVATE idma)

0 commit comments

Comments (0)