Commit 18907e6

psiddh and sidart authored
Summary: Initial CMSIS-NN Add Op (#13296)

### Summary
Initial CMSIS-NN quantized add op for the Cortex-M backend: fetch CMSIS-NN v4.1.0 via CMake FetchContent, add a cortex_m::quantized_add kernel backed by arm_elementwise_add_s8, and add shared validation helpers for CMSIS-NN kernels.

### Test plan
1. examples/arm/run.sh - No regressions ==> Ok
2. examples/arm/run.sh now runs 'qadd2' in quantize-only mode ==> Ok
3. python -m unittest test_replace_quant_nodes.py ==> Ok
4. python -m unittest test_quantize_op_fusion_pass.py ==> Ok

cc @digantdesai @freddan80 @per @zingo @oscarandersson8218

Co-authored-by: sidart <[email protected]>
1 parent 99e6349 commit 18907e6

15 files changed (+1348, -116 lines)

backends/cortex_m/CMakeLists.txt

Lines changed: 41 additions & 10 deletions
@@ -5,11 +5,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# Kernel library for Cortex-M operators. Please keep this file formatted by
-# running:
-# ~~~
-# cmake-format -i CMakeLists.txt
-# ~~~
 cmake_minimum_required(VERSION 3.19)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -24,29 +19,65 @@ endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
+include(FetchContent)
+
+# CMSIS-NN version to download
+set(CMSIS_NN_VERSION
+    "v4.1.0"
+    CACHE STRING "CMSIS-NN version to download"
+)
+
+# Declare CMSIS-NN as a FetchContent project
+FetchContent_Declare(
+  cmsis_nn
+  GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
+  GIT_TAG ${CMSIS_NN_VERSION}
+)
+
+# Download and make CMSIS-NN available
+FetchContent_MakeAvailable(cmsis_nn)
+
+# Print paths for debugging
+message(STATUS "CMSIS-NN source dir: ${cmsis_nn_SOURCE_DIR}")
+message(STATUS "CMSIS-NN binary dir: ${cmsis_nn_BINARY_DIR}")
 
 # Cortex-M ops kernel sources
 set(_cortex_m_kernels__srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
 )
 
-# Generate C++ bindings to register kernels into Executorch (for runtime). Here
-# select all ops in operators.yaml
+# Generate C++ bindings to register kernels into Executorch (for runtime)
 set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
 gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")
 
-# Generate bindings for the kernels
 generate_bindings_for_kernels(
   LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
 )
 message("Generated files ${gen_command_sources}")
 
-# Build a library for _cortex_m_kernels_srcs
+# Build a library for cortex_m_kernels
 add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
-target_link_libraries(cortex_m_kernels PRIVATE executorch)
 target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})
 
+# Include directories for cortex_m_kernels
+target_include_directories(
+  cortex_m_kernels
+  PRIVATE ${EXECUTORCH_ROOT}/..
+          ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+          ${cmsis_nn_SOURCE_DIR}/Include
+)
+
+# Link directly to the CMSIS-NN static library file
+target_link_libraries(
+  cortex_m_kernels PUBLIC ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a executorch
+)
+
+# Add dependency to ensure CMSIS-NN builds before we try to link. Use the
+# actual CMSIS-NN target name (usually 'cmsis-nn').
+add_dependencies(cortex_m_kernels cmsis-nn)
+
 # cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
 gen_operators_lib(
   LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
backends/cortex_m/ops/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ python_library(
     ],
     deps = [
         "fbcode//caffe2:torch",
+        "//executorch/backends/cortex_m/passes:passes_utils",
     ],
 )

backends/cortex_m/ops/cortex_m_ops_common.h

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/kernels/portable/cpu/scalar_utils.h>
+#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/platform/assert.h>
+
+#include <cstdint>
+#include <limits>
+
+// Include CMSIS-NN headers with C linkage
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
+using Tensor = torch::executor::Tensor;
+using ScalarType = executorch::aten::ScalarType;
+using Scalar = torch::executor::Scalar;
+using Error = executorch::runtime::Error;
+
+// Basic tensor type / layout validation and dimension-order checking
+inline void validate_cmsis_nn_tensor_requirements(
+    const Tensor& input1,
+    const Tensor& input2,
+    Tensor& output,
+    ScalarType expected_dtype = ScalarType::Char,
+    bool require_channels_last = false) {
+  // Basic dtype validation
+  ET_CHECK_MSG(
+      input1.scalar_type() == expected_dtype,
+      "Input1 dtype must be %hhd",
+      static_cast<int8_t>(expected_dtype));
+  ET_CHECK_MSG(
+      input2.scalar_type() == expected_dtype,
+      "Input2 dtype must be %hhd",
+      static_cast<int8_t>(expected_dtype));
+  ET_CHECK_MSG(
+      output.scalar_type() == expected_dtype,
+      "Output dtype must be %hhd",
+      static_cast<int8_t>(expected_dtype));
+
+  // Dim order consistency
+  ET_CHECK_MSG(
+      executorch::runtime::tensors_have_same_dim_order(input1, input2, output),
+      "Tensors must have same dimension order");
+
+  // TBD: Validate memory alignment (CMSIS-NN requirement)
+}
+
+inline void validate_single_quant_params(
+    const Scalar& zero_point,
+    const Scalar& multiplier,
+    const Scalar& shift,
+    const char* param_name) {
+  const int64_t zp_val = zero_point.to<int64_t>();
+  const int64_t mult_val = multiplier.to<int64_t>();
+  const int64_t shift_val = shift.to<int64_t>();
+
+  ET_CHECK_MSG(
+      zp_val >= std::numeric_limits<int8_t>::min() &&
+          zp_val <= std::numeric_limits<int8_t>::max(),
+      "%s zero point must be in int8 range [Value: %lld]",
+      param_name,
+      static_cast<long long>(zp_val));
+
+  ET_CHECK_MSG(
+      mult_val >= std::numeric_limits<int32_t>::min() &&
+          mult_val <= std::numeric_limits<int32_t>::max(),
+      "%s multiplier must be in int32 range [Value: %lld]",
+      param_name,
+      static_cast<long long>(mult_val));
+
+  ET_CHECK_MSG(
+      shift_val >= -31 && shift_val <= 31,
+      "%s shift must be in range [-31, 31] [Value: %lld]",
+      param_name,
+      static_cast<long long>(shift_val));
+}
+
+/**
+ * Validate quantization parameters for both inputs and the output.
+ *
+ * Checks that zero points fit in int8 range, multipliers fit in int32 range,
+ * and shifts are within the valid bit-shift range [-31, 31].
+ *
+ * Ensures parameters comply with Ahead-Of-Time (AOT) quantization
+ * requirements and CMSIS-NN kernel expectations.
+ *
+ * Raises errors via ET_CHECK_MSG if any check fails.
+ */
+inline void validate_quantization_params(
+    const Scalar& zero_point1,
+    const Scalar& multiplier1,
+    const Scalar& shift1,
+    const Scalar& zero_point2,
+    const Scalar& multiplier2,
+    const Scalar& shift2,
+    const Scalar& output_zero_point,
+    const Scalar& output_multiplier,
+    const Scalar& output_shift,
+    Tensor& output) {
+  validate_single_quant_params(
+      zero_point1, multiplier1, shift1, "Single quant Input1");
+  validate_single_quant_params(
+      zero_point2, multiplier2, shift2, "Single quant Input2");
+  validate_single_quant_params(
+      output_zero_point,
+      output_multiplier,
+      output_shift,
+      "Single quant Output");
+}
+
+inline Error resize_to_broadcast_target_size(
+    const Tensor& input1,
+    const Tensor& input2,
+    Tensor& output) {
+  static constexpr int kTensorDimensionLimit = 5;
+  Tensor::SizesType expected_output_size[kTensorDimensionLimit];
+  size_t expected_output_dim = 0;
+  auto err = torch::executor::get_broadcast_target_size(
+      input1,
+      input2,
+      expected_output_size,
+      kTensorDimensionLimit,
+      &expected_output_dim);
+
+  if (err != Error::Ok) {
+    return err;
+  }
+
+  return executorch::runtime::resize_tensor(
+      output, {expected_output_size, expected_output_dim});
+}
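
For context, the (multiplier, shift) pairs validated above encode fixed-point rescale factors in the Q0.31 convention shared by CMSIS-NN and gemmlowp: a pair represents the real factor multiplier * 2^(shift - 31). Below is a minimal standalone sketch of that step, with the saturation handling of the production kernels omitted for brevity; the helper name apply_multiplier is illustrative, not part of this commit.

#include <cstdint>

// Compute x * multiplier * 2^(shift - 31), rounding to nearest.
// multiplier is a Q0.31 fixed-point value, typically in [2^30, 2^31).
inline int32_t apply_multiplier(int32_t x, int32_t multiplier, int shift) {
  // Widen to 64-bit so the intermediate product cannot overflow.
  const int64_t prod =
      static_cast<int64_t>(x) * static_cast<int64_t>(multiplier);
  const int total_shift = 31 - shift; // in [0, 62] for shift in [-31, 31]
  if (total_shift <= 0) {
    return static_cast<int32_t>(prod); // shift == 31: nothing to scale down
  }
  const int64_t round = int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>((prod + round) >> total_shift);
}

The [-31, 31] bound that validate_single_quant_params enforces is exactly what keeps total_shift well-defined in this arithmetic.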
backends/cortex_m/ops/op_quantized_add.cpp

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "cortex_m_ops_common.h"
+
+#include <cassert>
+
+namespace cortex_m {
+namespace native {
+
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+
+Tensor& quantized_add_out(
+    KernelRuntimeContext& context,
+    const Tensor& input1_int8,
+    const Scalar& input1_zero_point,
+    const Scalar& input1_multiplier,
+    const Scalar& input1_shift,
+    const Tensor& input2_int8,
+    const Scalar& input2_zero_point,
+    const Scalar& input2_multiplier,
+    const Scalar& input2_shift,
+    const Scalar& output_zero_point,
+    const Scalar& output_multiplier,
+    const Scalar& output_shift,
+    Tensor& out) {
+  // Validate tensor types and dim order
+  validate_cmsis_nn_tensor_requirements(input1_int8, input2_int8, out);
+
+  // Validate quantization parameters
+  validate_quantization_params(
+      input1_zero_point,
+      input1_multiplier,
+      input1_shift,
+      input2_zero_point,
+      input2_multiplier,
+      input2_shift,
+      output_zero_point,
+      output_multiplier,
+      output_shift,
+      out);
+
+  // Broadcast if needed
+  auto result = resize_to_broadcast_target_size(input1_int8, input2_int8, out);
+  ET_CHECK_MSG(
+      (result == Error::Ok),
+      "Failed to resize output tensor. Status: [%d]",
+      static_cast<int>(result));
+
+  ET_LOG(
+      Info,
+      "quantized_add_out: input1_int8 rank = %zu",
+      input1_int8.sizes().size());
+
+  // Use to<int64_t>() conversions, which ExecuTorch commonly instantiates,
+  // then narrow to the types the CMSIS-NN kernel expects.
+  int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
+  int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
+  int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());
+
+  int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
+  int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
+  int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());
+
+  int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
+  int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
+  int output_shift_val = static_cast<int>(output_shift.to<int64_t>());
+
+  // Left shift to maximize precision (tune as needed)
+  const int32_t left_shift = 20;
+  const int32_t activation_min = std::numeric_limits<int8_t>::min();
+  const int32_t activation_max = std::numeric_limits<int8_t>::max();
+
+  ET_LOG(
+      Info,
+      "Using AoT-computed parameters: input1[mult=%d, shift=%d], input2[mult=%d, shift=%d], output[mult=%d, shift=%d]",
+      input1_mult,
+      input1_shift_val,
+      input2_mult,
+      input2_shift_val,
+      output_mult,
+      output_shift_val);
+
+  // Call the CMSIS-NN kernel with the precomputed parameters
+  arm_cmsis_nn_status status = arm_elementwise_add_s8(
+      input1_int8.const_data_ptr<int8_t>(),
+      input2_int8.const_data_ptr<int8_t>(),
+      zp1,
+      input1_mult,
+      input1_shift_val,
+      zp2,
+      input2_mult,
+      input2_shift_val,
+      left_shift,
+      out.mutable_data_ptr<int8_t>(),
+      out_zp,
+      output_mult,
+      output_shift_val,
+      activation_min,
+      activation_max,
+      static_cast<int32_t>(out.numel())); // block_size is the last parameter
+
+  if (status != ARM_CMSIS_NN_SUCCESS) {
+    ET_LOG(
+        Error,
+        "quantized_add_out: arm_elementwise_add_s8 failed with status [%d]",
+        status);
+
+    context.fail(Error::Internal); // Fail the execution context
+    return out;
+  }
+  ET_LOG(
+      Info,
+      "quantized_add_out: Successfully completed with AoT-computed parameters!");
+
+  return out;
+}
+
+// Stub implementation: non-out (functional) variant for compatibility.
+// EXIR/ExecuTorch runs an out-variant pass that converts .default operations
+// to .out variants before memory planning. The pass calls quantized_add's
+// default variant, but ExecuTorch's kernel dispatch mechanism ends up calling
+// the out variant. This stub only exists so the compiler doesn't complain.
+Tensor quantized_add(
+    KernelRuntimeContext& context,
+    const Tensor& input1_int8,
+    const Scalar& input1_zero_point,
+    const Scalar& input1_multiplier,
+    const Scalar& input1_shift,
+    const Tensor& input2_int8,
+    const Scalar& input2_zero_point,
+    const Scalar& input2_multiplier,
+    const Scalar& input2_shift,
+    const Scalar& output_zero_point,
+    const Scalar& output_multiplier,
+    const Scalar& output_shift) {
+  ET_LOG(
+      Info,
+      "quantized_add: input1_int8 rank = %zu",
+      input1_int8.sizes().size());
+
+  // Crash on debug builds if invoked
+  assert(false);
+  // Return something so release builds still compile.
+  return const_cast<Tensor&>(input1_int8);
+}
+
+} // namespace native
+} // namespace cortex_m
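
The "AoT-computed parameters" the kernel logs are derived ahead of time from the tensors' floating-point scales. A hedged sketch of the standard derivation follows; it uses the common TFLite/CMSIS-NN recipe, and the helper name quantize_multiplier is illustrative, not necessarily what this PR's passes use.

#include <cmath>
#include <cstdint>

// Encode a positive real factor as multiplier * 2^(shift - 31), with
// multiplier a Q0.31 fixed-point value in [2^30, 2^31).
void quantize_multiplier(double real_multiplier, int32_t* multiplier, int* shift) {
  if (real_multiplier == 0.0) {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  // frexp: real_multiplier = q * 2^shift with q in [0.5, 1)
  const double q = std::frexp(real_multiplier, shift);
  int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {
    // Rounding pushed q up to 1.0; renormalize.
    q_fixed /= 2;
    ++(*shift);
  }
  *multiplier = static_cast<int32_t>(q_fixed);
}

In that convention, an elementwise add uses the real factors s1 / (2 * max(s1, s2)) and s2 / (2 * max(s1, s2)) for the inputs and 2 * max(s1, s2) / (2^left_shift * s_out) for the output, which is where a fixed left_shift such as the 20 used above enters the picture.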
