
Commit b166a2e

sidart authored and Github Executorch committed
Summary: Initial CMSIS-NN integration for the quantized add op.

Test Plan:
a) Set up the Arm FVP and run examples/arm/run.sh (check for no regressions in the e2e test scenarios).
b) Add another run.sh iteration for qadd with only the --quantize flag and confirm that the quantized add op is called.
c) cd backends/cortex_m/test/; python test_quantize_add_fusion_pass.py

----------------------------------------------------------------------
Ran 8 tests in 11.128s

OK

Reviewers: Subscribers: Tasks: Tags:
1 parent 45519f6 commit b166a2e

File tree

11 files changed: +1173 −16 lines


backends/cortex_m/CMakeLists.txt

Lines changed: 40 additions & 1 deletion
@@ -24,11 +24,36 @@ endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
+include(ExternalProject)
+
+# Download and build CMSIS-NN from GitHub
+set(CMSIS_NN_VERSION "v4.1.0" CACHE STRING "CMSIS-NN version to download")
+set(CMSIS_NN_ROOT ${CMAKE_CURRENT_BINARY_DIR}/cmsis-nn)
+set(CMSIS_NN_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmsis-nn-build)
+set(CMSIS_NN_LIB_PATH ${CMSIS_NN_BINARY_DIR}/libcmsis-nn.a)
+
+ExternalProject_Add(
+  cmsis_nn_external
+  GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
+  GIT_TAG ${CMSIS_NN_VERSION}
+  SOURCE_DIR ${CMSIS_NN_ROOT}
+  BINARY_DIR ${CMSIS_NN_BINARY_DIR}
+  CMAKE_ARGS
+    -DCMAKE_TOOLCHAIN_FILE=${EXECUTORCH_ROOT}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
+    -DTARGET_CPU=cortex-m55
+    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+  BUILD_COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --parallel
+  INSTALL_COMMAND ""
+  BUILD_BYPRODUCTS ${CMSIS_NN_LIB_PATH}
+)
 
 # Cortex-M ops kernel sources
 set(_cortex_m_kernels__srcs
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_add_tensor.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_add.cpp
 )
 
 # Generate C++ bindings to register kernels into Executorch (for runtime). Here
@@ -44,9 +69,23 @@ message("Generated files ${gen_command_sources}")
 
 # Build a library for _cortex_m_kernels_srcs
 add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
-target_link_libraries(cortex_m_kernels PRIVATE executorch)
 target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})
 
+# Add dependency on CMSIS-NN external project
+add_dependencies(cortex_m_kernels cmsis_nn_external)
+
+# Set include directories - Include/ is directly under the CMSIS-NN root
+target_include_directories(cortex_m_kernels
+  PRIVATE
+    ${EXECUTORCH_ROOT}/..
+    ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+    $<BUILD_INTERFACE:${CMSIS_NN_ROOT}/Include>
+    $<BUILD_INTERFACE:${CMSIS_NN_ROOT}>
+)
+
+# Link against the CMSIS-NN static library directly
+target_link_libraries(cortex_m_kernels PUBLIC ${CMSIS_NN_LIB_PATH} executorch)
+
 # cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
 gen_operators_lib(
   LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch

backends/cortex_m/ops/op_add.cpp

Lines changed: 44 additions & 0 deletions
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/runtime/kernel/kernel_includes.h>

extern "C" {
#include "Include/arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

using Tensor = torch::executor::Tensor;
using Scalar = torch::executor::Scalar;
using ScalarType = executorch::aten::ScalarType;
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

Tensor& add_out(
    KernelRuntimeContext& ctx,
    const Tensor& input1,
    const Tensor& input2,
    const Scalar& alpha,
    Tensor& out) {
  ET_LOG(Info, "add_out kernel called");

  // Check input data types
  ScalarType input1_dtype = input1.scalar_type();
  ScalarType input2_dtype = input2.scalar_type();
  ScalarType out_dtype = out.scalar_type();

  ET_LOG(
      Info,
      "Input1 dtype: %d, Input2 dtype: %d, Output dtype: %d",
      static_cast<int>(input1_dtype),
      static_cast<int>(input2_dtype),
      static_cast<int>(out_dtype));

  // Stub for now
  return out;
}

} // namespace native
} // namespace cortex_m
backends/cortex_m/ops/op_add_tensor.cpp

Lines changed: 45 additions & 0 deletions
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/runtime/kernel/kernel_includes.h>

#include <cinttypes> // PRId8 used in the ET_CHECK_MSG format strings

extern "C" {
#include "Include/arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

using Tensor = torch::executor::Tensor;
using ScalarType = executorch::aten::ScalarType;
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

Tensor& add_Tensor(
    KernelRuntimeContext& ctx,
    const Tensor& self,
    const Tensor& other,
    Tensor& out) {
  ET_LOG(Info, "add_Tensor kernel called");

  // Ensure inputs are char (int8) type
  ET_CHECK_MSG(
      self.scalar_type() == ScalarType::Char,
      "self.scalar_type() %" PRId8 " is not char type",
      static_cast<int8_t>(self.scalar_type()));

  ET_CHECK_MSG(
      other.scalar_type() == ScalarType::Char,
      "other.scalar_type() %" PRId8 " is not char type",
      static_cast<int8_t>(other.scalar_type()));

  // Stub for now
  return out;
}

} // namespace native
} // namespace cortex_m
backends/cortex_m/ops/op_quantized_add.cpp

Lines changed: 132 additions & 0 deletions
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/runtime/kernel/kernel_includes.h>

#include <cstring> // std::memset
#include <limits> // std::numeric_limits
#include <vector>

extern "C" {
#include "Include/arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

using Tensor = torch::executor::Tensor;
using ScalarType = executorch::aten::ScalarType;
using Scalar = torch::executor::Scalar;
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

Tensor& quantized_add_out(
    KernelRuntimeContext& context,
    const Tensor& input1_int8,
    const Scalar& input1_zero_point,
    const Scalar& input1_multiplier,
    const Scalar& input1_shift,
    const Tensor& input2_int8,
    const Scalar& input2_zero_point,
    const Scalar& input2_multiplier,
    const Scalar& input2_shift,
    const Scalar& output_zero_point,
    const Scalar& output_multiplier,
    const Scalar& output_shift,
    Tensor& out) {
  ET_CHECK_MSG(
      input1_int8.scalar_type() == ScalarType::Char, "Input1 must be int8");
  ET_CHECK_MSG(
      input2_int8.scalar_type() == ScalarType::Char, "Input2 must be int8");
  ET_CHECK_MSG(out.scalar_type() == ScalarType::Char, "Output must be int8");

  // Check tensor shapes are compatible
  ET_CHECK_SAME_SHAPE_AND_DTYPE2(input1_int8, input2_int8);

  ET_LOG(
      Info,
      "quantized_add_out: input1_int8.sizes() = %zu",
      input1_int8.sizes().size());

  // Use Scalar accessor templates that ExecuTorch commonly instantiates
  // (to<int64_t>()) and narrow to the widths the CMSIS-NN kernel expects.
  int64_t zp1 = input1_zero_point.to<int64_t>();
  int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
  int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());

  int64_t zp2 = input2_zero_point.to<int64_t>();
  int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
  int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());

  int64_t out_zp = output_zero_point.to<int64_t>();
  int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
  int output_shift_val = static_cast<int>(output_shift.to<int64_t>());

  // Left shift to maximize precision (tune as needed)
  const int32_t left_shift = 20;
  const int32_t activation_min = std::numeric_limits<int8_t>::min();
  const int32_t activation_max = std::numeric_limits<int8_t>::max();

  // Resize output tensor to match input shape
  auto err = torch::executor::resize_tensor(out, input1_int8.sizes());
  if (err != executorch::runtime::Error::Ok) {
    ET_LOG(
        Error,
        "quantized_add_out: resize_tensor failed with error code [%d]",
        static_cast<int>(err));
    std::memset(out.mutable_data_ptr<int8_t>(), 0, out.nbytes());
    return out;
  }

  ET_LOG(
      Info,
      "Using AoT-computed parameters: input1[mult=%d, shift=%d], "
      "input2[mult=%d, shift=%d], output[mult=%d, shift=%d]",
      input1_mult,
      input1_shift_val,
      input2_mult,
      input2_shift_val,
      output_mult,
      output_shift_val);

  // Call CMSIS-NN kernel with precomputed parameters
  arm_cmsis_nn_status status = arm_elementwise_add_s8(
      input1_int8.const_data_ptr<int8_t>(),
      input2_int8.const_data_ptr<int8_t>(),
      static_cast<int32_t>(zp1),
      input1_mult,
      input1_shift_val,
      static_cast<int32_t>(zp2),
      input2_mult,
      input2_shift_val,
      left_shift,
      out.mutable_data_ptr<int8_t>(),
      static_cast<int32_t>(out_zp),
      output_mult,
      output_shift_val,
      activation_min,
      activation_max,
      static_cast<int32_t>(out.numel())); // block_size (element count) is last

  if (status != ARM_CMSIS_NN_SUCCESS) {
    ET_LOG(
        Error,
        "quantized_add_out: arm_elementwise_add_s8 failed with status [%d]",
        static_cast<int>(status));
    std::memset(out.mutable_data_ptr<int8_t>(), 0, out.nbytes());
  } else {
    ET_LOG(
        Info,
        "quantized_add_out: Successfully completed with AoT-computed parameters");
  }

  return out;
}

// Stub implementation: non-out (functional) variant for compatibility.
// EXIR/ExecuTorch runs an out-variant pass that converts .default operations
// to .out variants before memory planning. The fusion pass calls
// quantized_add's default variant, but ExecuTorch's kernel dispatch mechanism
// ends up calling the out variant. This stub only keeps the compiler happy.
Tensor quantized_add(
    KernelRuntimeContext& context,
    const Tensor& input1_int8,
    const Scalar& input1_zero_point,
    const Scalar& input1_multiplier,
    const Scalar& input1_shift,
    const Tensor& input2_int8,
    const Scalar& input2_zero_point,
    const Scalar& input2_multiplier,
    const Scalar& input2_shift,
    const Scalar& output_zero_point,
    const Scalar& output_multiplier,
    const Scalar& output_shift) {
  ET_LOG(
      Info,
      "quantized_add: input1_int8.sizes() = %zu",
      input1_int8.sizes().size());
  return const_cast<Tensor&>(input1_int8); // to make compiler happy
}

} // namespace native
} // namespace cortex_m
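
For context on the "AoT-computed parameters" this kernel consumes: each (multiplier, shift) pair is a fixed-point encoding of a real-valued rescale factor derived from the float quantization scales. The ahead-of-time step is not shown in this excerpt, so the snippet below is only a minimal Python sketch of the usual TFLite-style recipe for int8 element-wise add; the function names are hypothetical, and only the left_shift=20 convention is taken from the kernel above.

```python
import math

def quantize_multiplier(real_multiplier: float):
    """Encode a positive real rescale factor as a Q31 multiplier plus a
    power-of-two shift, so real_multiplier ~= multiplier * 2**(shift - 31)."""
    if real_multiplier == 0.0:
        return 0, 0
    mantissa, shift = math.frexp(real_multiplier)  # mantissa in [0.5, 1)
    multiplier = round(mantissa * (1 << 31))
    if multiplier == (1 << 31):  # rounding pushed the mantissa up to 1.0
        multiplier //= 2
        shift += 1
    return multiplier, shift

def elementwise_add_params(input1_scale, input2_scale, output_scale, left_shift=20):
    """Derive the per-operand and output (multiplier, shift) pairs for an
    int8 element-wise add, normalized to twice the larger input scale."""
    twice_max = 2.0 * max(input1_scale, input2_scale)
    in1 = quantize_multiplier(input1_scale / twice_max)
    in2 = quantize_multiplier(input2_scale / twice_max)
    out = quantize_multiplier(twice_max / ((1 << left_shift) * output_scale))
    return in1, in2, out

# Example: two inputs with scale 0.05 and an output scale of 0.1.
print(elementwise_add_params(0.05, 0.05, 0.1))
```

These pairs, together with the zero points, are what an ahead-of-time pass would presumably hand to cortex_m::quantized_add as scalar arguments.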
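
The non-out stub at the bottom of the file exists because the fusion pass emits the functional (.default) variant and relies on ExecuTorch's out-variant conversion before memory planning. The actual pass exercised by test_quantize_add_fusion_pass.py is not part of this excerpt; the sketch below is a rough, hypothetical torch.fx illustration of the idea, in which the torch.ops.cortex_m.quantized_add handle and the argument wiring are assumptions.

```python
import torch
from torch.fx import GraphModule

def fuse_quantized_add(gm: GraphModule) -> GraphModule:
    """Hypothetical sketch: rewrite aten.add.Tensor nodes to the functional
    cortex_m.quantized_add.default; ExecuTorch's out-variant pass is then
    expected to turn it into quantized_add.out before memory planning."""
    for node in list(gm.graph.nodes):
        if node.op == "call_function" and node.target is torch.ops.aten.add.Tensor:
            # A real pass would match the surrounding quantize/dequantize nodes
            # and fold their scales/zero points into multiplier/shift arguments.
            with gm.graph.inserting_after(node):
                fused = gm.graph.call_function(
                    torch.ops.cortex_m.quantized_add.default,  # assumed op handle
                    args=node.args,  # placeholder: real args carry quant params
                )
            node.replace_all_uses_with(fused)
            gm.graph.erase_node(node)
    gm.graph.lint()
    gm.recompile()
    return gm
```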
