#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/core/portable_type/tensor.h> // for torch::executor::Tensor
#include <executorch/runtime/core/portable_type/scalar.h> // for torch::executor::Scalar

#include <vector>
#include <algorithm>
#include <cmath>
#include <cstdint>

extern "C" {
#include "Include/arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

using Tensor = torch::executor::Tensor;
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

// Determine a quantization scale from fp32 data (dynamic, per-call).
// Maps the observed [min, max] range onto the 256 levels of int8 ([-128, 127]).
float determine_input_scale(const float* data, int size) {
  float min_val = *std::min_element(data, data + size);
  float max_val = *std::max_element(data, data + size);
  float scale = (max_val - min_val) / 255.0f;
  // Guard against constant input (max == min), which would otherwise produce a
  // zero scale and a division by zero in quantize_tensor(). Any non-zero scale
  // works here, since all values then quantize to the same level.
  return scale > 0.0f ? scale : 1.0f;
}
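// Illustrative example (assumed values, not from a real model): input data
// spanning [-1.0f, 1.0f] gives scale = 2.0f / 255.0f ≈ 0.00784f, i.e. one
// int8 step corresponds to roughly 0.0078 in fp32.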
// Quantize fp32 to int8
void quantize_tensor(const float* input, int8_t* output, int size,
                     float scale, int32_t zero_point) {
  for (int i = 0; i < size; i++) {
    int32_t quantized =
        static_cast<int32_t>(std::round(input[i] / scale)) + zero_point;
    // Clamp to the int8_t range [-128, 127] before narrowing.
    output[i] = static_cast<int8_t>(
        std::clamp(quantized, static_cast<int32_t>(-128), static_cast<int32_t>(127)));
  }
}
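// Illustrative example (assumed values): with scale ≈ 0.00784f and
// zero_point = 0, an input of 0.5f quantizes to round(0.5 / 0.00784) = 64,
// while 1.0f quantizes to round(127.5) = 128, which the clamp caps at 127.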
// Dequantize int8 to fp32
void dequantize_tensor(const int8_t* input, float* output, int size,
                       float scale, int32_t zero_point) {
  for (int i = 0; i < size; i++) {
    output[i] = (input[i] - zero_point) * scale;
  }
}
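// Illustrative round trip (assumed values): q = 64 with scale ≈ 0.00784f and
// zero_point = 0 dequantizes to (64 - 0) * 0.00784 ≈ 0.502f, close to the
// original 0.5f; the difference is ordinary quantization error.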

// Converts a floating-point scale to a CMSIS-NN fixed-point multiplier and shift.
// scale: the floating-point scale factor from ExecuTorch quantization
// multiplier: output fixed-point multiplier (Q31 format)
// shift: output left shift amount (positive means left shift)
// diff_min: output minimum difference threshold (usually -128 for int8)
void convert_scale_to_cmsis_params(float scale, int32_t* multiplier, int32_t* shift, int32_t* diff_min) {
  if (scale == 0.0f) {
    *multiplier = 0;
    *shift = 0;
    *diff_min = -128;
    return;
  }
  // Decompose scale into mantissa and exponent: scale = mantissa * 2^exponent
  int exponent;
  float mantissa = std::frexp(scale, &exponent); // mantissa in [0.5, 1)
  // Convert mantissa to Q31 fixed-point format
  int64_t q_fixed = static_cast<int64_t>(std::round(mantissa * (1ll << 31)));
  // Rounding can push the mantissa to exactly 2^31, which does not fit in a
  // signed 32-bit multiplier; halve it and bump the exponent instead.
  if (q_fixed == (1ll << 31)) {
    q_fixed /= 2;
    exponent++;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
  // With a Q31 multiplier, scale = multiplier * 2^(exponent - 31), so the
  // exponent itself is the left shift (negative values mean a right shift).
  *shift = exponent;
  // Typical diff_min for int8 softmax
  *diff_min = -128;
}
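// Worked example (assumed scale, not taken from the code above): scale = 0.05f
// decomposes via frexp into mantissa 0.8 and exponent -4 (0.8 * 2^-4 = 0.05),
// so multiplier = round(0.8 * 2^31) = 1717986918 and shift = -4, i.e. a right
// shift of 4 after the Q31 multiply.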

torch::executor::Tensor& aten_softmax(
    KernelRuntimeContext& context,
    const Tensor& self,
    int64_t dim,
    bool half_to_float,
    Tensor& out) {

  ET_LOG(Info, "CMSIS-NN quantized softmax kernel called");

  // Step 1: Extract fp32 data
  const float* input_data_fp32 = self.data_ptr<float>();
  float* output_data_fp32 = out.data_ptr<float>();

  // Step 2: Get tensor dimensions. This kernel currently assumes a 2D input
  // and applies softmax along the last dimension; `dim` and `half_to_float`
  // are not consulted.
  int rows = self.sizes()[0];
  int cols = self.sizes()[1];

  // Step 3: Quantize input (fp32 -> int8)
  // Determine an appropriate scale dynamically from this input
  float input_scale = determine_input_scale(input_data_fp32, rows * cols);

  // A zero point of 0 is a reasonable default for symmetric int8 quantization,
  // especially when the input data is centered around zero; strongly
  // asymmetric inputs would need a computed zero point.
  int32_t input_zero_point = 0;

  std::vector<int8_t> input_quantized(rows * cols);
  quantize_tensor(input_data_fp32, input_quantized.data(),
                  rows * cols, input_scale, input_zero_point);

  // Step 4: Convert to CMSIS-NN parameters
  int32_t input_mult, input_shift, diff_min;
  convert_scale_to_cmsis_params(input_scale, &input_mult, &input_shift, &diff_min);

  // Step 5: Call CMSIS-NN kernel
  std::vector<int8_t> output_quantized(rows * cols);
  arm_softmax_s8(input_quantized.data(), rows, cols,
                 input_mult, input_shift, diff_min,
                 output_quantized.data());

  // Step 6: Dequantize output (int8 -> fp32). arm_softmax_s8 writes its result
  // using the fixed softmax output quantization of the TFLite int8 kernels
  // (scale 1/256, zero point -128), so dequantize with those parameters rather
  // than the input's.
  dequantize_tensor(output_quantized.data(), output_data_fp32,
                    rows * cols, 1.0f / 256.0f, -128);

  return out;
}

} // namespace native
} // namespace cortex_m
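// Registration note (an assumption, not part of this code): in ExecuTorch a
// kernel like this is typically bound to its operator through the kernel
// registration flow, e.g. a kernels YAML entry along the lines of
//
//   - op: _softmax.out
//     kernels:
//       - arg_meta: null
//         kernel_name: cortex_m::native::aten_softmax
//
// The exact op selector and namespace depend on how this backend's kernel
// library is configured.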