Skip to content

Commit d873063

Browse files
psiddh and Github Executorch
authored
Summary: Add Stateful FC Cortex-m linearOps (#14252)
Integrate with CMSIS-NN with per-channel quantization support Test Plan: With local changes :Run e2e test on FVP simulator ./examples/arm/run_mcu_models_fvp.sh --target=cortex-m55 --models=qlinear Reviewers: Subscribers: Tasks: Tags: ### Summary [PLEASE REMOVE] See [CONTRIBUTING.md's Pull Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests) for ExecuTorch PR guidelines. [PLEASE REMOVE] If this PR closes an issue, please add a `Fixes #<issue-id>` line. [PLEASE REMOVE] If this PR introduces a fix or feature that should be the upcoming release notes, please add a "Release notes: <area>" label. For a list of available release notes labels, check out [CONTRIBUTING.md's Pull Requests](https://github.com/pytorch/executorch/blob/main/CONTRIBUTING.md#pull-requests). ### Test plan [PLEASE REMOVE] How did you test this PR? Please write down any manual commands you used and note down tests that you have written if applicable. Co-authored-by: Github Executorch <[email protected]>
1 parent 5fd66ee commit d873063

10 files changed

+1400
-51
lines changed

backends/cortex_m/CMakeLists.txt

Lines changed: 52 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD)
1212
set(CMAKE_CXX_STANDARD 17)
1313
endif()
1414

15-
# Source root directory for executorch.
15+
# Source root directory for executorch
1616
if(NOT EXECUTORCH_ROOT)
1717
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
1818
endif()
@@ -21,70 +21,90 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
include(FetchContent)

# CMSIS-NN configuration. By default the library is fetched from the official
# ARM repository; CMSIS_NN_LOCAL_PATH lets a developer build against an
# existing local checkout (useful for debugging/testing local CMSIS-NN
# changes). That is not the common case.
set(CMSIS_NN_VERSION
    "v7.0.0"
    CACHE STRING "CMSIS-NN version to download"
)
set(CMSIS_NN_LOCAL_PATH
    ""
    CACHE PATH "Path to existing local CMSIS-NN installation"
)

if(CMSIS_NN_LOCAL_PATH AND EXISTS "${CMSIS_NN_LOCAL_PATH}")
  message(STATUS "Using CMSIS-NN from specified path: ${CMSIS_NN_LOCAL_PATH}")
  add_subdirectory(${CMSIS_NN_LOCAL_PATH} cmsis_nn_build)
else()
  message(STATUS "Using CMSIS-NN via FetchContent")

  FetchContent_Declare(
    cmsis_nn
    GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
    GIT_TAG ${CMSIS_NN_VERSION}
    GIT_SHALLOW TRUE
  )

  # FetchContent_MakeAvailable() populates the content and calls
  # add_subdirectory() for us. The manual FetchContent_GetProperties() /
  # FetchContent_Populate() sequence it replaces is deprecated (CMake 3.30+).
  FetchContent_MakeAvailable(cmsis_nn)
endif()

# Add MVEI define to the cmsis-nn target and fail early if the target is
# missing (bad CMSIS_NN_LOCAL_PATH or a failed download).
if(TARGET cmsis-nn)
  target_compile_definitions(cmsis-nn PUBLIC ARM_MATH_MVEI=1)
  get_target_property(CMSIS_NN_INCLUDES cmsis-nn INTERFACE_INCLUDE_DIRECTORIES)
  message(STATUS "CMSIS-NN include dirs: ${CMSIS_NN_INCLUDES}")
else()
  message(
    FATAL_ERROR
      "CMSIS-NN target not found. Check your CMSIS_NN_LOCAL_PATH or network connection."
  )
endif()

# Cortex-M ops kernel sources
set(_cortex_m_kernels__srcs
    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp
)

# Generate C++ bindings to register kernels into Executorch
set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")
generate_bindings_for_kernels(
  LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
)

# Build library for cortex_m_kernels
add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})

# Use PRIVATE for implementation dependencies to avoid INTERFACE pollution.
# Linking the cmsis-nn target (instead of a hardcoded .a path) also carries
# its include directories and build ordering automatically.
target_link_libraries(cortex_m_kernels PRIVATE cmsis-nn executorch)

# Include directories for cortex_m_kernels
target_include_directories(
  cortex_m_kernels PRIVATE ${EXECUTORCH_ROOT}/..
                           ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
)

# cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
gen_operators_lib(
  LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
)

install(
  TARGETS cortex_m_kernels cortex_m_ops_lib cmsis-nn
  EXPORT ExecuTorchTargets
  DESTINATION lib
  PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/ops/
)
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
#pragma once

#include "cortex_m_ops_common.h"
extern "C" {
#include "arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

// During the AOT phase, quantized_linear_fusion_pass allocates one scratch
// buffer and passes it in as a 'Tensor' (total = 8-byte header + x bytes):
//
//   ┌─────────────────┬─────────────────────────────────────┐
//   │ KernelSum Header│ CMSIS Workspace                     │
//   │ (8 bytes)       │ (x bytes)                           │
//   └─────────────────┴─────────────────────────────────────┘
//   ^                 ^
//   scratch_ptr_      cmsis_workspace_ptr_ (handed to CMSIS APIs)
//
// arm_vector_sum_s8() fills the workspace with one int32_t kernel sum per
// output feature, folding in the bias when one is provided:
//   [sum0+bias0][sum1+bias1]...[sum_{n-1}+bias_{n-1}]
// where n = out_features and x = n * 4 bytes.
//
// This helper wraps that layout: it records where the weight / zero-point /
// bias tensors live relative to the buffer base (as offsets, so the header
// stays pointer-free) and lazily computes the kernel sums exactly once.
class CMSISScratchBufferContext final {
 public:
  CMSISScratchBufferContext(
      Tensor& scratch_buffer,
      const Tensor& weights,
      const Tensor& weight_zero_point,
      const torch::executor::optional<Tensor>& bias)
      : scratch_ptr_(scratch_buffer.mutable_data_ptr<int8_t>()),
        // NOTE(review): assumes scratch_buffer is 1-D with size(0) equal to
        // its byte count — confirm against the AOT pass that allocates it.
        total_size_(scratch_buffer.size(0)),
        base_ptr_(reinterpret_cast<uint8_t*>(scratch_ptr_)),
        in_features_(weights.size(1)),
        out_features_(weights.size(0)),
        is_per_channel_(weight_zero_point.numel() > 1),
        weight_data_offset_(calculate_offset(weights.const_data_ptr<int8_t>())),
        weight_zp_data_offset_(
            calculate_offset(weight_zero_point.const_data_ptr<int32_t>())),
        bias_data_offset_(
            bias.has_value()
                ? calculate_offset(bias.value().const_data_ptr<int32_t>())
                : 0),
        header_(reinterpret_cast<KernelSumHeader*>(scratch_ptr_)),
        cmsis_workspace_ptr_(scratch_ptr_ + KERNEL_SUM_HEADER_SIZE) {
    cmsis_nn_dims filter_dims = {in_features_, 1, 1, out_features_};
    validate_size(filter_dims);
  }

  // Build the cmsis_nn_context pointing at the workspace region (after the
  // 8-byte header). CMSIS-NN requires the buffer to be 4-byte aligned.
  cmsis_nn_context get_cmsis_ctx() const {
    ET_CHECK_MSG(
        reinterpret_cast<uintptr_t>(cmsis_workspace_ptr_) % 4 == 0,
        "CMSIS workspace not 4-byte aligned");

    cmsis_nn_context ctx;
    ctx.buf = cmsis_workspace_ptr_;
    ctx.size = get_cmsis_workspace_size();
    return ctx;
  }

  // True once compute_kernel_sums_if_needed() has populated the workspace.
  bool is_kernel_sum_updated() const {
    return header_->updated;
  }

  // Populate the workspace with per-output-feature kernel sums (bias folded
  // in when available) on the first call; subsequent calls are no-ops.
  void compute_kernel_sums_if_needed() {
    if (header_->updated) {
      return;
    }
    arm_vector_sum_s8(
        reinterpret_cast<int32_t*>(cmsis_workspace_ptr_),
        in_features_,
        out_features_,
        get_weight_data(),
        get_weight_zp_data()[0],
        0,
        get_bias_data());
    header_->updated = true;
    ET_LOG(
        Info,
        "Computed kernel sums. [required_bytes : %d]",
        header_->required_size);
  }

  const int8_t* get_weight_data() const {
    return reinterpret_cast<const int8_t*>(base_ptr_ + weight_data_offset_);
  }

  const int32_t* get_weight_zp_data() const {
    return reinterpret_cast<const int32_t*>(base_ptr_ + weight_zp_data_offset_);
  }

  // Returns nullptr when the op was constructed without a bias tensor.
  const int32_t* get_bias_data() const {
    if (bias_data_offset_ == 0) {
      return nullptr;
    }
    return reinterpret_cast<const int32_t*>(base_ptr_ + bias_data_offset_);
  }

  bool is_per_channel_quant() const {
    return is_per_channel_;
  }
  int32_t get_in_features() const {
    return in_features_;
  }
  int32_t get_out_features() const {
    return out_features_;
  }

 private:
  static constexpr size_t KERNEL_SUM_HEADER_SIZE = 8;

  // Header for kernel sum computation state only.
  struct KernelSumHeader {
    bool updated = false;
    int32_t required_size = 0;
  };
  static_assert(
      sizeof(KernelSumHeader) == KERNEL_SUM_HEADER_SIZE,
      "KernelSumHeader must be exactly 8 bytes");

  int8_t* scratch_ptr_;
  size_t total_size_;
  uint8_t* base_ptr_;

  // Context members
  const int32_t in_features_;
  const int32_t out_features_;
  const bool is_per_channel_;
  const uint32_t weight_data_offset_;
  const uint32_t weight_zp_data_offset_;
  const uint32_t bias_data_offset_;

  KernelSumHeader* header_;
  int8_t* cmsis_workspace_ptr_;

  // Byte offset of `ptr` from base_ptr_; 0 encodes "absent" (nullptr).
  // NOTE(review): presumes all referenced tensors live at or after base_ptr_
  // in the same allocation — verify against the AOT memory planner.
  uint32_t calculate_offset(const void* ptr) const {
    if (ptr == nullptr) {
      return 0;
    }

    const uint8_t* ptr_bytes = reinterpret_cast<const uint8_t*>(ptr);
    ET_CHECK_MSG(ptr_bytes >= base_ptr_, "Pointer is before base address");

    const std::ptrdiff_t offset = ptr_bytes - base_ptr_;
    ET_CHECK_MSG(
        offset >= 0 && offset <= UINT32_MAX, "Offset out of valid range");
    return static_cast<uint32_t>(offset);
  }

  // Bytes available to CMSIS after the kernel-sum header.
  size_t get_cmsis_workspace_size() const {
    return total_size_ - KERNEL_SUM_HEADER_SIZE;
  }

  // Ask CMSIS-NN how big the FC workspace must be, record it in the header,
  // and abort if the provided buffer is too small.
  void validate_size(const cmsis_nn_dims& filter_dims) const {
    header_->required_size =
        arm_fully_connected_s8_get_buffer_size(&filter_dims);

    ET_CHECK_MSG(
        get_cmsis_workspace_size() >=
            static_cast<size_t>(header_->required_size),
        "Scratch buffer size %zu insufficient for required size %d",
        get_cmsis_workspace_size(),
        header_->required_size);
  }
};

} // namespace native
} // namespace cortex_m

backends/cortex_m/ops/cortex_m_ops_common.h

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ using ScalarType = executorch::aten::ScalarType;
2222
using Scalar = torch::executor::Scalar;
2323
using Error = executorch::runtime::Error;
2424

25+
// From arm_nn_math_types.h
26+
#define ARM_NN_Q31_MAX ((int32_t)(0x7FFFFFFFL))
27+
#define ARM_NN_Q31_MIN ((int32_t)(0x80000000L))
28+
2529
// Basic tensor type / layout validation and dimension order checking
2630
inline void validate_cmsis_nn_tensor_requirements(
2731
const Tensor& input1,
@@ -32,16 +36,19 @@ inline void validate_cmsis_nn_tensor_requirements(
3236
// Basic dtype validation
3337
ET_CHECK_MSG(
3438
input1.scalar_type() == expected_dtype,
35-
"Input1 dtype must be %hhd",
36-
expected_dtype);
39+
"Input1 dtype must be %hhd, got %hhd",
40+
expected_dtype,
41+
input1.scalar_type());
3742
ET_CHECK_MSG(
3843
input2.scalar_type() == expected_dtype,
39-
"Input2 dtype must be %hhd",
40-
expected_dtype);
44+
"Input2 dtype must be %hhd, got %hhd",
45+
expected_dtype,
46+
input2.scalar_type());
4147
ET_CHECK_MSG(
4248
output.scalar_type() == expected_dtype,
43-
"Output dtype must be %hhd",
44-
expected_dtype);
49+
"Output dtype must be %hhd, got %hhd",
50+
expected_dtype,
51+
output.scalar_type());
4552

4653
// Dim order consistency
4754
ET_CHECK_MSG(
@@ -114,6 +121,33 @@ inline void validate_quantization_params(
114121
"Single quant Output");
115122
}
116123

124+
// Refer to CMSIS-NN 'arm_nn_requantize' implementation for details:
125+
// https://github.com/ARM-software/CMSIS-NN/blob/main/Include/arm_nnsupportfunctions.h#L1625
126+
// multiplier: Range {ARM_NN_Q31_MIN + 1, Q32_MAX}
127+
// shift : Range {-31, 30}
128+
inline bool validate_per_channel_quant_params(
129+
const int32_t* multipliers,
130+
const int32_t* shifts,
131+
int num_channels) {
132+
for (int i = 0; i < num_channels; ++i) {
133+
// Multiplier: {ARM_NN_Q31_MIN + 1, ARM_NN_Q31_MAX}
134+
if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) {
135+
ET_LOG(
136+
Error,
137+
"weight_multiplier[%d] out of CMSIS-NN range: %d",
138+
i,
139+
multipliers[i]);
140+
return false;
141+
}
142+
// Shift: {-31, 30} for arm_nn_requantize
143+
if (shifts[i] < -31 || shifts[i] > 30) {
144+
ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]);
145+
return false;
146+
}
147+
}
148+
return true;
149+
}
150+
117151
inline Error resize_to_broadcast_target_size(
118152
const Tensor& input1,
119153
const Tensor& input2,

0 commit comments

Comments
 (0)