
Commit 1faca01

sidart authored and Github Executorch committed
Summary: Initial CMSIS-NN integration for Quantized Add Op

Test Plan:
a) Set up the Arm FVP and run 'examples/arm/run.sh' (check that there are no regressions in the e2e test scenarios).
b) Add another iteration of qadd to run.sh with only the --quantize flag and verify that the quantized add op is called.
c) cd backends/cortex_m/test/; python test_quantize_add_fusion_pass.py

----------------------------------------------------------------------
Ran 8 tests in 11.128s

OK

Reviewers: Subscribers: Tasks: Tags:
1 parent 335de46 commit 1faca01

15 files changed (+1390, -119 lines)

backends/cortex_m/CMakeLists.txt

Lines changed: 41 additions & 10 deletions
@@ -5,11 +5,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# Kernel library for Cortex-M operators. Please keep this file formatted by
-# running:
-# ~~~
-# cmake-format -i CMakeLists.txt
-# ~~~
 cmake_minimum_required(VERSION 3.19)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -24,29 +19,65 @@ endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
+include(FetchContent)
+
+# CMSIS-NN version to download
+set(CMSIS_NN_VERSION "v4.1.0" CACHE STRING "CMSIS-NN version to download")
+
+# Declare CMSIS-NN as a FetchContent project
+FetchContent_Declare(
+  cmsis_nn
+  GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
+  GIT_TAG ${CMSIS_NN_VERSION}
+)
+
+# Download and make CMSIS-NN available
+FetchContent_MakeAvailable(cmsis_nn)
+
+# Print paths for debugging
+message(STATUS "CMSIS-NN source dir: ${cmsis_nn_SOURCE_DIR}")
+message(STATUS "CMSIS-NN binary dir: ${cmsis_nn_BINARY_DIR}")
 
 # Cortex-M ops kernel sources
 set(_cortex_m_kernels__srcs
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
 )
 
-# Generate C++ bindings to register kernels into Executorch (for runtime). Here
-# select all ops in operators.yaml
+# Generate C++ bindings to register kernels into Executorch (for runtime)
 set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
 gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")
 
-# Generate bindings for the kernels
 generate_bindings_for_kernels(
   LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
 )
 message("Generated files ${gen_command_sources}")
 
-# Build a library for _cortex_m_kernels_srcs
+# Build a library for cortex_m_kernels
 add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
-target_link_libraries(cortex_m_kernels PRIVATE executorch)
 target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})
 
+# Include directories for cortex_m_kernels
+target_include_directories(
+  cortex_m_kernels
+  PRIVATE ${EXECUTORCH_ROOT}/..
+          ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+          ${cmsis_nn_SOURCE_DIR}/Include
+          ${cmsis_nn_SOURCE_DIR}
+)
+
+# Link directly to the CMSIS-NN static library file
+# This is the most reliable approach - link to the actual .a file
+target_link_libraries(cortex_m_kernels PUBLIC
+  ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a
+  executorch
+)
+
+# Add dependency to ensure CMSIS-NN builds before we try to link
+# Use the actual CMSIS-NN target name (usually 'cmsis-nn')
+add_dependencies(cortex_m_kernels cmsis-nn)
+
 # cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
 gen_operators_lib(
   LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
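With this wiring, any source compiled into cortex_m_kernels can include the CMSIS-NN headers from ${cmsis_nn_SOURCE_DIR}/Include and call into libcmsis-nn.a. The standalone sketch below is not part of this commit; it is a minimal smoke test, assuming CMSIS-NN's arm_elementwise_add_s8 entry point and its usual Q31-multiplier / power-of-two-shift requantization convention, that can be used to confirm the FetchContent include path and the static-library link line resolve before building the full runner.

```cpp
// Hypothetical smoke test (not part of this commit). With input offsets of 0,
// a Q31 multiplier of 2^30 (a scale of 0.5) and shifts that undo the
// left_shift, the call should reduce to a plain saturating int8 add.
#include <cstdint>
#include <cstdio>

extern "C" {
#include "arm_nnfunctions.h" // from ${cmsis_nn_SOURCE_DIR}/Include
}

int main() {
  const int8_t in1[4] = {10, 20, 30, 40};
  const int8_t in2[4] = {1, 2, 3, 4};
  int8_t out[4] = {0};

  const arm_cmsis_nn_status status = arm_elementwise_add_s8(
      in1, in2,
      /*input_1_offset=*/0, /*input_1_mult=*/1 << 30, /*input_1_shift=*/0,
      /*input_2_offset=*/0, /*input_2_mult=*/1 << 30, /*input_2_shift=*/0,
      /*left_shift=*/20,
      out,
      /*out_offset=*/0, /*out_mult=*/1 << 30, /*out_shift=*/-18,
      /*out_activation_min=*/-128, /*out_activation_max=*/127,
      /*block_size=*/4);

  // Expected (under the usual CMSIS-NN requantization convention):
  // status == ARM_CMSIS_NN_SUCCESS and out == {11, 22, 33, 44}.
  std::printf("status=%d out[0]=%d\n", static_cast<int>(status), out[0]);
  return status == ARM_CMSIS_NN_SUCCESS ? 0 : 1;
}
```

Compiling a snippet like this against the cortex_m_kernels include directories is enough to catch a broken include path or a missing libcmsis-nn.a early; real multiplier, shift, and offset values come from the AOT fusion pass, not from hand-picked constants.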

backends/cortex_m/ops/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ python_library(
     ],
     deps = [
         "fbcode//caffe2:torch",
+        "//executorch/backends/cortex_m/passes:passes_utils",
     ],
 )

Lines changed: 184 additions & 0 deletions
@@ -0,0 +1,184 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

// Include CMSIS-NN headers with C linkage
extern "C" {
#include "arm_nnfunctions.h"
}

using Tensor = torch::executor::Tensor;
using ScalarType = executorch::aten::ScalarType;
using Scalar = torch::executor::Scalar;
using Error = executorch::runtime::Error;

// Basic tensor type / layout validation and dimension order checking
inline void validate_quantized_tensor_types_and_dim_order(
    const Tensor& input1,
    const Tensor& input2,
    Tensor& output) {
  ET_CHECK_MSG(input1.scalar_type() == ScalarType::Char, "Input1 must be int8");
  ET_CHECK_MSG(input2.scalar_type() == ScalarType::Char, "Input2 must be int8");
  ET_CHECK_MSG(output.scalar_type() == ScalarType::Char, "Output must be int8");
  ET_CHECK_MSG(
      executorch::runtime::tensors_have_same_dim_order(input1, input2, output),
      "Tensors must have same dimension order");
}

/**
 * Validate quantization parameters for inputs and output.
 *
 * Checks that zero points fit in int8 range, multipliers fit in int32 range,
 * and shifts are within the valid bit-shift range [-31, 31].
 *
 * Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements
 * and CMSIS-NN kernel expectations.
 *
 * Aborts via ET_CHECK_MSG if any check fails.
 */
inline void validate_quantization_params(
    const Scalar& zero_point1,
    const Scalar& multiplier1,
    const Scalar& shift1,
    const Scalar& zero_point2,
    const Scalar& multiplier2,
    const Scalar& shift2,
    const Scalar& output_zero_point,
    const Scalar& output_multiplier,
    const Scalar& output_shift,
    Tensor& output) {
  // Extract int64_t values from Scalars
  int64_t zp1_val = zero_point1.to<int64_t>();
  int64_t mult1_val = multiplier1.to<int64_t>();
  int64_t shift1_val = shift1.to<int64_t>();

  int64_t zp2_val = zero_point2.to<int64_t>();
  int64_t mult2_val = multiplier2.to<int64_t>();
  int64_t shift2_val = shift2.to<int64_t>();

  int64_t out_zp_val = output_zero_point.to<int64_t>();
  int64_t out_mult_val = output_multiplier.to<int64_t>();
  int64_t out_shift_val = output_shift.to<int64_t>();

  ET_CHECK_MSG(
      zp1_val >= std::numeric_limits<int8_t>::min() &&
          zp1_val <= std::numeric_limits<int8_t>::max(),
      "Zero point 1 must be in int8 range [Value: %d]",
      zp1_val);

  ET_CHECK_MSG(
      zp2_val >= std::numeric_limits<int8_t>::min() &&
          zp2_val <= std::numeric_limits<int8_t>::max(),
      "Zero point 2 must be in int8 range [Value: %d]",
      zp2_val);

  ET_CHECK_MSG(
      out_zp_val >= std::numeric_limits<int8_t>::min() &&
          out_zp_val <= std::numeric_limits<int8_t>::max(),
      "Output zero point must be in int8 range [Value: %d]",
      out_zp_val);

  // Check multipliers fit in int32 range (AOT quantize_multiplier_aot clamps to
  // int32)
  ET_CHECK_MSG(
      mult1_val >= std::numeric_limits<int32_t>::min() &&
          mult1_val <= std::numeric_limits<int32_t>::max(),
      "Multiplier 1 must be in int32 range [Value: %d]",
      mult1_val);

  ET_CHECK_MSG(
      mult2_val >= std::numeric_limits<int32_t>::min() &&
          mult2_val <= std::numeric_limits<int32_t>::max(),
      "Multiplier 2 must be in int32 range [Value: %d]",
      mult2_val);

  ET_CHECK_MSG(
      out_mult_val >= std::numeric_limits<int32_t>::min() &&
          out_mult_val <= std::numeric_limits<int32_t>::max(),
      "Output multiplier must be in int32 range [Value: %d]",
      out_mult_val);

  ET_CHECK_MSG(
      shift1_val >= -31 && shift1_val <= 31,
      "Shift 1 must be in range [-31, 31] [Value: %d]",
      shift1_val);

  ET_CHECK_MSG(
      shift2_val >= -31 && shift2_val <= 31,
      "Shift 2 must be in range [-31, 31] [Value: %d]",
      shift2_val);

  ET_CHECK_MSG(
      out_shift_val >= -31 && out_shift_val <= 31,
      "Output shift must be in range [-31, 31] [Value: %d]",
      out_shift_val);
}

inline Error resize_to_broadcast_target_size_quantized(
    const Tensor& input1,
    const Tensor& input2,
    Tensor& output) {
  static constexpr int kTensorDimensionLimit = 5;

  int inp1_shape[kTensorDimensionLimit];
  int inp2_shape[kTensorDimensionLimit];
  int out_shape[kTensorDimensionLimit];

  int max_dim = std::max({input1.dim(), input2.dim(), output.dim()});
  max_dim = std::min(max_dim, kTensorDimensionLimit);

  // Initialize shapes with 1s for padding
  for (int i = 0; i < max_dim; i++) {
    inp1_shape[i] = 1;
    inp2_shape[i] = 1;
    out_shape[i] = 1;
  }

  int offset_inp1 = max_dim - input1.dim();
  int offset_inp2 = max_dim - input2.dim();
  int offset_out = max_dim - output.dim();

  for (int i = 0; i < input1.dim(); i++) {
    inp1_shape[i + offset_inp1] = input1.size(i);
  }
  for (int i = 0; i < input2.dim(); i++) {
    inp2_shape[i + offset_inp2] = input2.size(i);
  }
  for (int i = 0; i < output.dim(); i++) {
    out_shape[i + offset_out] = output.size(i);
  }

  // Compute broadcasted shape (use existing get_broadcast_target_size or
  // equivalent)
  Tensor::SizesType expected_output_size[kTensorDimensionLimit];
  size_t expected_output_dim = 0;

  auto err = torch::executor::get_broadcast_target_size(
      input1,
      input2,
      expected_output_size,
      kTensorDimensionLimit,
      &expected_output_dim);
  if (err != Error::Ok) {
    return err;
  }

  // Resize output tensor to broadcasted shape
  return executorch::runtime::resize_tensor(
      output, {expected_output_size, expected_output_dim});
}
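For context, the sketch below shows how these helpers are intended to fit together in a quantized add kernel such as ops/op_quantized_add.cpp: validate dtypes and dim order, validate the AOT-computed quantization parameters, resize the output for broadcasting, then hand the flat int8 buffers and the multipliers/shifts to CMSIS-NN. It is an illustrative sketch only, not the kernel from this commit; the header include name, the out-variant signature, and the left_shift value are assumptions, and a real kernel must follow whatever shift convention the AOT fusion pass emits.

```cpp
// Illustrative sketch only (not the op_quantized_add.cpp from this commit).
#include <executorch/runtime/kernel/kernel_includes.h>
// #include "cortex_m_ops_common.h" // hypothetical name for the header above

Tensor& quantized_add_out_sketch(
    torch::executor::KernelRuntimeContext& ctx,
    const Tensor& input1,
    const Scalar& zero_point1,
    const Scalar& multiplier1,
    const Scalar& shift1,
    const Tensor& input2,
    const Scalar& zero_point2,
    const Scalar& multiplier2,
    const Scalar& shift2,
    const Scalar& output_zero_point,
    const Scalar& output_multiplier,
    const Scalar& output_shift,
    Tensor& out) {
  (void)ctx;
  validate_quantized_tensor_types_and_dim_order(input1, input2, out);
  validate_quantization_params(
      zero_point1, multiplier1, shift1,
      zero_point2, multiplier2, shift2,
      output_zero_point, output_multiplier, output_shift, out);

  // Grow the output to the broadcast target shape before writing into it.
  ET_CHECK_MSG(
      resize_to_broadcast_target_size_quantized(input1, input2, out) ==
          Error::Ok,
      "Failed to resize output tensor");

  // Non-broadcast fast path: CMSIS-NN takes flat buffers, negated input zero
  // points as offsets, and adds the output zero point back after requantizing.
  arm_elementwise_add_s8(
      input1.const_data_ptr<int8_t>(),
      input2.const_data_ptr<int8_t>(),
      -static_cast<int32_t>(zero_point1.to<int64_t>()),
      static_cast<int32_t>(multiplier1.to<int64_t>()),
      static_cast<int32_t>(shift1.to<int64_t>()),
      -static_cast<int32_t>(zero_point2.to<int64_t>()),
      static_cast<int32_t>(multiplier2.to<int64_t>()),
      static_cast<int32_t>(shift2.to<int64_t>()),
      /*left_shift=*/20, // assumption; must match the AOT pass convention
      out.mutable_data_ptr<int8_t>(),
      static_cast<int32_t>(output_zero_point.to<int64_t>()),
      static_cast<int32_t>(output_multiplier.to<int64_t>()),
      static_cast<int32_t>(output_shift.to<int64_t>()),
      /*out_activation_min=*/-128,
      /*out_activation_max=*/127,
      static_cast<int32_t>(out.numel()));
  return out;
}
```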
