
Commit 1662a9b

sidart authored and Github Executorch committed
Summary: Initial CMSIS-NN integration for Quantized Add Op

Test Plan:
a) Set up the Arm FVP and run 'examples/arm/run.sh' (check that there are no regressions in the e2e test scenarios).
b) Add another run.sh iteration with qadd and only the --quantize flag, and verify that the quantized add op is called.
c) cd backends/cortex_m/test/; python test_quantize_add_fusion_pass.py
----------------------------------------------------------------------
Ran 8 tests in 11.128s

OK

Reviewers:
Subscribers:
Tasks:
Tags:
1 parent 45519f6 commit 1662a9b

12 files changed: +1188 -177 lines changed


backends/cortex_m/CMakeLists.txt

Lines changed: 42 additions & 1 deletion
@@ -24,11 +24,38 @@ endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
+include(ExternalProject)
+
+# Download and build CMSIS-NN from GitHub
+set(CMSIS_NN_VERSION
+    "v4.1.0"
+    CACHE STRING "CMSIS-NN version to download"
+)
+set(CMSIS_NN_ROOT ${CMAKE_CURRENT_BINARY_DIR}/cmsis-nn)
+set(CMSIS_NN_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmsis-nn-build)
+set(CMSIS_NN_LIB_PATH ${CMSIS_NN_BINARY_DIR}/libcmsis-nn.a)
+
+set(TARGET_CPU "cortex-m55" CACHE STRING "Target CPU for CMSIS-NN build")
+ExternalProject_Add(
+  cmsis_nn_external
+  GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
+  GIT_TAG ${CMSIS_NN_VERSION}
+  SOURCE_DIR ${CMSIS_NN_ROOT}
+  BINARY_DIR ${CMSIS_NN_BINARY_DIR}
+  CMAKE_ARGS
+    -DCMAKE_TOOLCHAIN_FILE=${EXECUTORCH_ROOT}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
+    -DTARGET_CPU=${TARGET_CPU}
+    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+  BUILD_COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --parallel
+  INSTALL_COMMAND ""
+  BUILD_BYPRODUCTS ${CMSIS_NN_LIB_PATH}
+)
 
 # Cortex-M ops kernel sources
 set(_cortex_m_kernels__srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
 )
 
 # Generate C++ bindings to register kernels into Executorch (for runtime). Here
@@ -44,9 +71,23 @@ message("Generated files ${gen_command_sources}")
 
 # Build a library for _cortex_m_kernels_srcs
 add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
-target_link_libraries(cortex_m_kernels PRIVATE executorch)
 target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})
 
+# Add dependency on CMSIS-NN external project
+add_dependencies(cortex_m_kernels cmsis_nn_external)
+
+# Set include directories - Include is directly in CMSIS-NN root
+target_include_directories(
+  cortex_m_kernels
+  PRIVATE ${EXECUTORCH_ROOT}/..
+          ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+          $<BUILD_INTERFACE:${CMSIS_NN_ROOT}/Include>
+          $<BUILD_INTERFACE:${CMSIS_NN_ROOT}>
+)
+
+# Link against the CMSIS-NN static library directly
+target_link_libraries(cortex_m_kernels PUBLIC ${CMSIS_NN_LIB_PATH} executorch)
+
 # cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
 gen_operators_lib(
   LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
backends/cortex_m/ops/cortex_m_ops_common.h

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+
+// Include CMSIS-NN headers with C linkage
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
+#define VALIDATE_QUANTIZED_INPUTS(ctx, input1, input2, output)           \
+  do {                                                                   \
+    ET_CHECK_MSG((input1).scalar_type() == ScalarType::Char,             \
+                 "Input1 must be int8");                                 \
+    ET_CHECK_MSG((input2).scalar_type() == ScalarType::Char,             \
+                 "Input2 must be int8");                                 \
+    ET_CHECK_MSG((output).scalar_type() == ScalarType::Char,             \
+                 "Output must be int8");                                 \
+    ET_KERNEL_CHECK(                                                     \
+        ctx,                                                             \
+        torch::executor::resize_to_broadcast_target_size(                \
+            input1, input2, output) == ::executorch::runtime::Error::Ok, \
+        InvalidArgument,                                                 \
+        output);                                                         \
+    ET_CHECK_MSG((input1).sizes() == (input2).sizes(),                   \
+                 "Input tensors must be the same shape");                \
+    ET_CHECK_MSG((input1).scalar_type() == (input2).scalar_type(),       \
+                 "Input tensors must be the same dtype");                \
+  } while (0)
backends/cortex_m/ops/op_quantized_add.cpp

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "cortex_m_ops_common.h"
+
+namespace cortex_m {
+namespace native {
+
+using Tensor = torch::executor::Tensor;
+using ScalarType = executorch::aten::ScalarType;
+using Scalar = torch::executor::Scalar;
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+
+Tensor& quantized_add_out(
+    KernelRuntimeContext& context,
+    const Tensor& input1_int8,
+    const Scalar& input1_zero_point,
+    const Scalar& input1_multiplier,
+    const Scalar& input1_shift,
+    const Tensor& input2_int8,
+    const Scalar& input2_zero_point,
+    const Scalar& input2_multiplier,
+    const Scalar& input2_shift,
+    const Scalar& output_zero_point,
+    const Scalar& output_multiplier,
+    const Scalar& output_shift,
+    Tensor& out) {
+  VALIDATE_QUANTIZED_INPUTS(context, input1_int8, input2_int8, out);
+
+  ET_LOG(
+      Info,
+      "quantized_add_out: input1_int8.sizes() = %zu",
+      input1_int8.sizes().size());
+
+  // Use Scalar conversions that ExecuTorch reliably provides:
+  // to<int64_t>() and to<double>() are commonly instantiated.
+  int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
+  int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
+  int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());
+
+  int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
+  int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
+  int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());
+
+  int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
+  int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
+  int output_shift_val = static_cast<int>(output_shift.to<int64_t>());
+
+  // Left shift to maximize precision (tune as needed)
+  const int32_t left_shift = 20;
+  const int32_t activation_min = std::numeric_limits<int8_t>::min();
+  const int32_t activation_max = std::numeric_limits<int8_t>::max();
+
+  // Resize output tensor to match input shape
+  auto err = torch::executor::resize_tensor(out, input1_int8.sizes());
+  if (err != executorch::runtime::Error::Ok) {
+    ET_LOG(
+        Error,
+        "quantized_add_out: resize_tensor failed with error code [%d]",
+        static_cast<int>(err));
+    std::memset(out.mutable_data_ptr<int8_t>(), 0, out.nbytes());
+    return out;
+  }
+
+  ET_LOG(
+      Info,
+      "Using AoT-computed parameters: input1[mult=%d, shift=%d], input2[mult=%d, shift=%d], output[mult=%d, shift=%d]",
+      input1_mult,
+      input1_shift_val,
+      input2_mult,
+      input2_shift_val,
+      output_mult,
+      output_shift_val);
+
+  // Call CMSIS-NN kernel with precomputed parameters
+  arm_cmsis_nn_status status = arm_elementwise_add_s8(
+      input1_int8.const_data_ptr<int8_t>(),
+      input2_int8.const_data_ptr<int8_t>(),
+      static_cast<int32_t>(zp1),
+      input1_mult,
+      input1_shift_val,
+      static_cast<int32_t>(zp2),
+      input2_mult,
+      input2_shift_val,
+      left_shift,
+      out.mutable_data_ptr<int8_t>(),
+      static_cast<int32_t>(out_zp),
+      output_mult,
+      output_shift_val,
+      static_cast<int32_t>(out.numel()),
+      activation_min,
+      activation_max);
+
+  if (status != ARM_CMSIS_NN_SUCCESS) {
+    ET_LOG(
+        Error,
+        "quantized_add_out: arm_elementwise_add_s8 failed with status [%d]",
+        status);
+    std::memset(out.mutable_data_ptr<int8_t>(), 0, out.nbytes());
+  } else {
+    ET_LOG(
+        Info,
+        "quantized_add_out: Successfully completed with AoT-computed parameters!");
+  }
+
+  return out;
+}
+
+// Stub implementation: non-out (functional) variant for compatibility.
+// EXIR/ExecuTorch runs an out-variant pass that converts .default operations
+// to .out variants before memory planning. The fusion pass calls quantized_add's
+// default variant, but ExecuTorch's kernel dispatch mechanism ends up calling
+// the out variant; this stub only exists so the compiler doesn't complain.
+Tensor quantized_add(
+    KernelRuntimeContext& context,
+    const Tensor& input1_int8,
+    const Scalar& input1_zero_point,
+    const Scalar& input1_multiplier,
+    const Scalar& input1_shift,
+    const Tensor& input2_int8,
+    const Scalar& input2_zero_point,
+    const Scalar& input2_multiplier,
+    const Scalar& input2_shift,
+    const Scalar& output_zero_point,
+    const Scalar& output_multiplier,
+    const Scalar& output_shift) {
+  ET_LOG(
+      Info,
+      "quantized_add: input1_int8.sizes() = %zu",
+      input1_int8.sizes().size());
+  return const_cast<Tensor&>(input1_int8); // to make the compiler happy
+}
+
+} // namespace native
+} // namespace cortex_m
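
The kernel above consumes requantization parameters (zero point, multiplier, shift) that are computed ahead of time. For orientation only, here is a minimal Python sketch, not part of this commit, of how a float scale can be decomposed into such a fixed-point multiplier and shift. It follows the scale ~= multiplier / 2**(31 - shift) convention used by the placeholder implementations in operators.py below; the helper name quantize_multiplier is illustrative.

import math

def quantize_multiplier(scale: float) -> tuple[int, int]:
    # Decompose a positive float scale into (multiplier, shift) such that
    # scale ~= multiplier / 2**(31 - shift), with multiplier an int32 in
    # [2**30, 2**31). The shift may be negative for small scales.
    if scale <= 0.0:
        raise ValueError("scale must be positive")
    mantissa, shift = math.frexp(scale)        # scale == mantissa * 2**shift, 0.5 <= mantissa < 1
    multiplier = round(mantissa * (1 << 31))   # scale ~= multiplier * 2**(shift - 31)
    if multiplier == (1 << 31):                # rounding can push the mantissa up to 1.0
        multiplier //= 2
        shift += 1
    return multiplier, shift

# Example: an effective scale of 0.05 round-trips through the convention.
mult, shift = quantize_multiplier(0.05)
assert abs(mult / (1 << (31 - shift)) - 0.05) < 1e-9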

backends/cortex_m/ops/operators.py

Lines changed: 135 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
"quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)"
2626
)
2727

28-
2928
@register_fake("cortex_m::quantize_per_tensor")
3029
def quantize_per_tensor_meta(
3130
input: torch.Tensor,
@@ -55,7 +54,6 @@ def quantize_per_tensor_impl(
5554
input, scale, zero_point, quant_min, quant_max, dtype
5655
)
5756

58-
5957
###
6058
# dequantize_per_tensor
6159
###
@@ -96,3 +94,138 @@ def dequantize_per_tensor_impl(
9694
return exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default(
9795
input, scale, zero_point, quant_min, quant_max, dtype
9896
)
97+
98+
99+
# Define the operator schema with multipliers and shifts (11 args)
100+
lib.define(
101+
"quantized_add("
102+
"Tensor self, Scalar self_zero_point, Scalar self_multiplier, Scalar self_shift, "
103+
"Tensor other, Scalar other_zero_point, Scalar other_multiplier, Scalar other_shift, "
104+
"Scalar output_zero_point, Scalar output_multiplier, Scalar output_shift) -> Tensor"
105+
)
106+
107+
108+
@register_fake("cortex_m::quantized_add")
109+
def quantized_add_meta(
110+
self: torch.Tensor,
111+
self_zero_point: int,
112+
self_multiplier: int,
113+
self_shift: int,
114+
other: torch.Tensor,
115+
other_zero_point: int,
116+
other_multiplier: int,
117+
other_shift: int,
118+
output_zero_point: int,
119+
output_multiplier: int,
120+
output_shift: int,
121+
) -> torch.Tensor:
122+
return torch.empty_like(self, dtype=torch.int8)
123+
124+
125+
@impl(lib, "quantized_add", "CompositeExplicitAutograd")
126+
def quantized_add_impl(
127+
self: torch.Tensor,
128+
self_zero_point: int,
129+
self_multiplier: int,
130+
self_shift: int,
131+
other: torch.Tensor,
132+
other_zero_point: int,
133+
other_multiplier: int,
134+
other_shift: int,
135+
output_zero_point: int,
136+
output_multiplier: int,
137+
output_shift: int,
138+
) -> torch.Tensor:
139+
# For now, convert back to float, add, and quantize (as placeholder)
140+
# Dequantize inputs using multiplier/shift
141+
self_fp = (self.float() - self_zero_point) * (
142+
self_multiplier / (1 << (31 - self_shift))
143+
)
144+
other_fp = (other.float() - other_zero_point) * (
145+
other_multiplier / (1 << (31 - other_shift))
146+
)
147+
148+
# Add
149+
result_fp = self_fp + other_fp
150+
151+
# Quantize output
152+
result_quantized = (
153+
result_fp / (output_multiplier / (1 << (31 - output_shift)))
154+
) + output_zero_point
155+
156+
return result_quantized.clamp(-128, 127).to(torch.int8)
157+
158+
159+
# Define the operator schema with multipliers and shifts (11 args + out tensor)
160+
lib.define(
161+
"quantized_add.out("
162+
"Tensor self, Scalar self_zero_point, Scalar self_multiplier, Scalar self_shift, "
163+
"Tensor other, Scalar other_zero_point, Scalar other_multiplier, Scalar other_shift, "
164+
"Scalar output_zero_point, Scalar output_multiplier, Scalar output_shift, "
165+
"*, Tensor(a!) out) -> Tensor(a!)"
166+
)
167+
168+
169+
# Fake meta function for shape and dtype inference during compilation
170+
@register_fake("cortex_m::quantized_add.out")
171+
def quantized_add_out_meta(
172+
self: torch.Tensor,
173+
self_zero_point: int,
174+
self_multiplier: int,
175+
self_shift: int,
176+
other: torch.Tensor,
177+
other_zero_point: int,
178+
other_multiplier: int,
179+
other_shift: int,
180+
output_zero_point: int,
181+
output_multiplier: int,
182+
output_shift: int,
183+
out: torch.Tensor,
184+
) -> torch.Tensor:
185+
# Validate shape compatibility if needed
186+
assert out.shape == self.shape, "Output shape must match input shape"
187+
# Output dtype is int8
188+
return out
189+
190+
191+
# Actual implementation delegating to backend or custom kernel
192+
@impl(lib, "quantized_add.out", "CompositeExplicitAutograd")
193+
def quantized_add_out_impl(
194+
self: torch.Tensor,
195+
self_zero_point: int,
196+
self_multiplier: int,
197+
self_shift: int,
198+
other: torch.Tensor,
199+
other_zero_point: int,
200+
other_multiplier: int,
201+
other_shift: int,
202+
output_zero_point: int,
203+
output_multiplier: int,
204+
output_shift: int,
205+
*,
206+
out: torch.Tensor,
207+
) -> torch.Tensor:
208+
# Example placeholder implementation:
209+
# Dequantize inputs using multiplier and shift
210+
self_fp = (self.float() - self_zero_point) * (
211+
self_multiplier / (1 << (31 - self_shift))
212+
)
213+
other_fp = (other.float() - other_zero_point) * (
214+
other_multiplier / (1 << (31 - other_shift))
215+
)
216+
217+
# Add in floating point
218+
result_fp = self_fp + other_fp
219+
220+
# Quantize output using multiplier and shift
221+
result_quantized = (
222+
result_fp / (output_multiplier / (1 << (31 - output_shift)))
223+
) + output_zero_point
224+
225+
# Clamp and convert to int8
226+
result_quantized = result_quantized.clamp(-128, 127).to(torch.int8)
227+
228+
# Write into the provided output tensor
229+
out.copy_(result_quantized)
230+
231+
return out
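
As a usage sketch (again not part of the commit): once this operators.py module has been imported so the cortex_m library is registered, the eager placeholder can be exercised directly. The scales and zero points below are arbitrary example values, and quantize_multiplier repeats the illustrative scale decomposition shown earlier.

import math
import torch

def quantize_multiplier(scale: float) -> tuple[int, int]:
    # scale ~= multiplier / 2**(31 - shift), as assumed by the placeholder impls above.
    mantissa, shift = math.frexp(scale)
    multiplier = round(mantissa * (1 << 31))
    if multiplier == (1 << 31):
        multiplier //= 2
        shift += 1
    return multiplier, shift

# Example quantization parameters (arbitrary): both inputs and the output use
# scale 0.05 and zero point 0.
scale, zp = 0.05, 0
mult, shift = quantize_multiplier(scale)

a_fp = torch.tensor([0.10, -0.25, 0.40])
b_fp = torch.tensor([0.05, 0.15, -0.20])
a_q = torch.clamp(torch.round(a_fp / scale) + zp, -128, 127).to(torch.int8)
b_q = torch.clamp(torch.round(b_fp / scale) + zp, -128, 127).to(torch.int8)

# Requires operators.py to have been imported so torch.ops.cortex_m is populated.
out_q = torch.ops.cortex_m.quantized_add(
    a_q, zp, mult, shift,
    b_q, zp, mult, shift,
    zp, mult, shift,
)
print((out_q.float() - zp) * scale)  # approximately a_fp + b_fp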
