Commit a523a5b

Author: sidart (committed)
Summary: Initial CMSIS-NN custom kernels port (Take #2)
Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent: b440e82

File tree: 10 files changed, +341 / -20 lines


CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -530,6 +530,7 @@ endif()
 
 if(EXECUTORCH_BUILD_CORTEX_M)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m/cmsis-nn/ops)
 endif()
 
 if(EXECUTORCH_BUILD_DEVTOOLS)
New Python file: cortex_m custom operator registration

Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from executorch.exir.dialects._ops import (
    ops as exir_ops,
)  # To provide the implementation of the operators
from torch.library import impl, Library, register_fake

# New operator library with a custom namespace to allow fusion etc.
lib = Library("cortex_m", "DEF")

###
# add.Tensor
###

lib.define("aten_add_tensor(Tensor self, Tensor other, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)")

@impl(lib, "aten_add_tensor", "CompositeExplicitAutograd")
def aten_add_tensor_impl(input1, input2, dtype, out):
    return exir_ops.edge.cortex_m.aten_add_tensor.default(input1, input2, dtype, dtype)


###
# add.out
###

lib.define(
    "add.out(Tensor input1, Tensor input2, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)"
)

@impl(lib, "add.out", "CompositeExplicitAutograd")
def add_out_impl(
    input1: torch.Tensor,
    input2: torch.Tensor,
    dtype: torch.dtype,
    out: torch.Tensor,
) -> torch.Tensor:
    """
    The implementation of cmsis-nn add.out.
    """
    return exir_ops.edge.cortex_m.add.default(
        input1, input2, dtype, dtype
    )
backends/cortex_m/cmsis-nn/cmsis.yaml (new file)

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

- op: aten::add.out
  kernels:
    - arg_meta: null
      kernel_name: cortex_m::aten_add_tensor

- op: aten::_softmax.out
  kernels:
    - arg_meta: null
      kernel_name: cortex_m::aten_softmax
backends/cortex_m/cmsis-nn/ops/CMakeLists.txt (new file)

Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)

# Source root directory for executorch.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../)
endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)

set(EXECUTORCH_ENABLE_LOGGING ON CACHE BOOL "Enable ExecuTorch logging")
set(EXECUTORCH_LOG_LEVEL "DEBUG" CACHE STRING "ExecuTorch log level")

# Path to CMSIS-NN root - adjust as needed
set(CMSIS_NN_ROOT /home/sidart/working/CMSIS-NN)

# Cortex-M CMSIS ops sources
set(_cortex_m_kernels_cmsis__srcs
  "${EXECUTORCH_ROOT}/backends/cortex_m/cmsis-nn/ops/op_aten_add_tensor.cpp"
  "${EXECUTORCH_ROOT}/backends/cortex_m/cmsis-nn/ops/op_aten_softmax.cpp"
)

# Common include directories
set(_common_include_directories
  ${EXECUTORCH_ROOT}/..
  ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
  ${CMSIS_NN_ROOT}/Include
  ${CMSIS_NN_ROOT} # For any CMake or config includes
)

# Import CMSIS-NN static library as a target
add_library(cmsis_nn STATIC IMPORTED)
set_target_properties(cmsis_nn PROPERTIES
  IMPORTED_LOCATION "${CMSIS_NN_ROOT}/build/libcmsis-nn.a"
  INTERFACE_INCLUDE_DIRECTORIES "${CMSIS_NN_ROOT}/Include"
)

# Build cortex_m_cmsis_kernels static library
add_library(cortex_m_cmsis_kernels ${_cortex_m_kernels_cmsis__srcs})

# Include directories for cortex_m_cmsis_kernels
target_include_directories(cortex_m_cmsis_kernels
  PRIVATE
    ${_common_include_directories}
)

# Link libraries: executorch and CMSIS-NN imported target
target_link_libraries(cortex_m_cmsis_kernels
  PRIVATE
    cmsis_nn
    executorch
)

# Generate C++ bindings for kernels and operators
gen_selected_ops(
  LIB_NAME "cortex_m_cmsis_nn_ops_lib" OPS_SCHEMA_YAML
  "${CMAKE_CURRENT_LIST_DIR}/../cmsis.yaml" "" ""
)
generate_bindings_for_kernels(
  LIB_NAME "cortex_m_cmsis_nn_ops_lib" FUNCTIONS_YAML
  ${CMAKE_CURRENT_SOURCE_DIR}/../cmsis.yaml
)

gen_operators_lib(
  LIB_NAME "cortex_m_cmsis_nn_ops_lib" KERNEL_LIBS cortex_m_cmsis_kernels DEPS executorch
)
set(CMAKE_EXE_LINKER_FLAGS "-Wl,--gc-sections")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections -fdata-sections")

# Install targets and headers
install(
  TARGETS cortex_m_cmsis_kernels cortex_m_cmsis_nn_ops_lib
  DESTINATION lib
  PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/cmsis-nn/ops/
)
backends/cortex_m/cmsis-nn/ops/op_aten_add_tensor.cpp (new file)

Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/core/portable_type/tensor.h> // for torch::executor::Tensor
#include <executorch/runtime/core/portable_type/scalar.h> // for torch::executor::Scalar
#include <iostream>

namespace cortex_m {
namespace native {

using Tensor = executorch::aten::Tensor;
using ScalarType = executorch::aten::ScalarType;
using Scalar = executorch::aten::Scalar;
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

torch::executor::Tensor& aten_add_tensor(
    torch::executor::KernelRuntimeContext& ctx,
    const torch::executor::Tensor& input1,
    const torch::executor::Tensor& input2,
    const torch::executor::Scalar& alpha,
    torch::executor::Tensor& out) {
  // Your CMSIS-NN optimized implementation here
  // Return 'out' tensor as per Executorch kernel signature
  std::cout << "add_out kernel called" << std::endl;
  ET_LOG(Info, "xxxxxxxxxx add_out kernel called");

  assert(false);
  assert(true);
  return out;
}

torch::executor::Tensor& add_out(
    torch::executor::KernelRuntimeContext& ctx,
    const torch::executor::Tensor& input1,
    const torch::executor::Tensor& input2,
    const torch::executor::Scalar& alpha,
    torch::executor::Tensor& out) {
  std::cout << "add_out kernel called" << std::endl;
  ET_LOG(Info, "xxxxxxxxxx add_out kernel called");

  // Ensure input is char type
  ET_CHECK_MSG(
      input1.scalar_type() == ScalarType::Char,
      "input1.scalar_type() %" PRId8 " is not char type",
      static_cast<int8_t>(input1.scalar_type()));

  ET_CHECK_MSG(
      input2.scalar_type() == ScalarType::Char,
      "input2.scalar_type() %" PRId8 " is not char type",
      static_cast<int8_t>(input2.scalar_type()));

  // Check output dtype is float
  ET_CHECK_MSG(
      out.scalar_type() == ScalarType::Float,
      "out.scalar_type() %" PRId8 " is not float",
      static_cast<int8_t>(out.scalar_type()));

  // Check dtype is int8 (Char)
  /*ET_CHECK_MSG(
      dtype == ScalarType::Char,
      "dtype %" PRId8 " is not int8 (Char)",
      static_cast<int8_t>(dtype));*/

  assert(false);

  return out;
}

} // namespace native
} // namespace cortex_m
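Both entry points above are still stubs that log and assert(false). For orientation only, here is a minimal hypothetical sketch of how a body might eventually call into CMSIS-NN's arm_elementwise_add_s8. The helper name, the data-pointer accessors, and all quantization parameters (offsets, multipliers, shifts, activation range) are placeholder assumptions, not part of this commit, and the exact arm_elementwise_add_s8 signature should be checked against arm_nnfunctions.h in the CMSIS-NN build actually linked.

// Hypothetical sketch (not in this commit): CMSIS-NN-backed int8 add.
// Requantization parameters below are dummy placeholders, analogous to the
// softmax kernel's "refine later" values; real ones must come from the
// model's quantization scales/zero-points.
#include <executorch/runtime/kernel/kernel_includes.h>
extern "C" {
#include "Include/arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

torch::executor::Tensor& add_out_sketch(
    torch::executor::KernelRuntimeContext& ctx,
    const torch::executor::Tensor& input1,
    const torch::executor::Tensor& input2,
    const torch::executor::Scalar& alpha,
    torch::executor::Tensor& out) {
  const int8_t* in1 = input1.const_data_ptr<int8_t>();
  const int8_t* in2 = input2.const_data_ptr<int8_t>();
  int8_t* out_data = out.mutable_data_ptr<int8_t>();

  // Dummy requantization parameters (placeholders only).
  const int32_t in_offset = 0, in_mult = 1073741824, in_shift = 0;
  const int32_t out_offset = 0, out_mult = 1073741824, out_shift = 0;
  const int32_t left_shift = 0;

  arm_elementwise_add_s8(
      in1, in2,
      in_offset, in_mult, in_shift,   // input 1 requant params
      in_offset, in_mult, in_shift,   // input 2 requant params
      left_shift,
      out_data,
      out_offset, out_mult, out_shift,
      /*out_activation_min=*/-128,
      /*out_activation_max=*/127,
      static_cast<int32_t>(out.numel()));
  return out;
}

} // namespace native
} // namespace cortex_m

Whatever form the final kernel takes, those offsets and multipliers will need to flow in from the ahead-of-time quantization step rather than being hard-coded.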
backends/cortex_m/cmsis-nn/ops/op_aten_softmax.cpp (new file)

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/core/portable_type/tensor.h> // for torch::executor::Tensor
#include <executorch/runtime/core/portable_type/scalar.h> // for torch::executor::Scalar

extern "C" {
#include "Include/arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

using Tensor = torch::executor::Tensor;
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

//__attribute__((section(".text_ddr")))
void softmax_wrapper(
    const int8_t* input_data,
    int rows,
    int cols,
    int32_t input_mult,
    int32_t input_shift,
    int32_t diff_min,
    int8_t* output_data) {
  arm_softmax_s8(
      input_data,
      rows,
      cols,
      input_mult,
      input_shift,
      diff_min,
      output_data);
}

torch::executor::Tensor& aten_softmax(
    KernelRuntimeContext& context,
    const Tensor& self,
    int64_t dim,
    bool half_to_float,
    Tensor& out) {

  ET_LOG(Info, "CMSIS-NN softmax kernel called");
  const int8_t* input_data = self.data_ptr<int8_t>();
  int8_t* output_data = out.data_ptr<int8_t>();

  int rows = self.sizes()[0];
  int cols = self.sizes()[1];
  ET_LOG(Info, "Input shape: %d x %d", rows, cols);
  // Quantization params - dummy values for now, refine later
  int32_t input_mult = 1 << 4; // or something from qparams
  int32_t input_shift = 0;
  int32_t diff_min = -128;

  softmax_wrapper(
      input_data,
      rows,
      cols,
      input_mult,
      input_shift,
      diff_min,
      output_data);

  return out;
}

} // namespace native
} // namespace cortex_m
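The softmax kernel above assumes a plain 2-D input (sizes()[0] x sizes()[1]) and hard-codes dummy quantization parameters. As a small hypothetical sketch, not part of this commit, the helper below shows one way to derive the num_rows/row_size arguments for arm_softmax_s8 from an arbitrary-rank tensor, assuming the softmax axis is the innermost dimension; the multiplier, shift, and diff_min values would still have to be computed from the input scale.

// Hypothetical helper (not in this commit): collapse an N-D tensor into the
// (num_rows, row_size) layout expected by arm_softmax_s8, assuming the
// softmax axis is the last dimension.
#include <executorch/runtime/kernel/kernel_includes.h>

namespace cortex_m {
namespace native {

inline void softmax_dims_sketch(
    const executorch::aten::Tensor& self,
    int32_t& num_rows,
    int32_t& row_size) {
  const auto rank = self.dim();
  // Size of the softmax axis (assumed innermost).
  row_size = rank > 0 ? static_cast<int32_t>(self.sizes()[rank - 1]) : 1;
  // Everything else collapses into rows of that length.
  num_rows = row_size > 0 ? static_cast<int32_t>(self.numel() / row_size) : 0;
}

} // namespace native
} // namespace cortex_m

Until something along these lines lands, the committed kernel only behaves sensibly for 2-D inputs with softmax over the last axis.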

examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake

Lines changed: 2 additions & 2 deletions
@@ -73,11 +73,11 @@ elseif(
   OR CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)"
   OR CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)"
 )
-  set(FLOAT hard)
+  set(FLOAT soft)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)"
        OR CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)"
 )
-  set(FLOAT hard)
+  set(FLOAT soft)
   set(FPU_CONFIG "fpv4-sp-d16")
   add_compile_options(-mfpu=${FPU_CONFIG})
   add_link_options(-mfpu=${FPU_CONFIG})

examples/arm/executor_runner/CMakeLists.txt

Lines changed: 29 additions & 2 deletions
@@ -13,7 +13,7 @@ option(ET_ATOL "Set atol to use for BundleIO testing" OFF)
 option(ET_RTOL "Set rtol to use for BundleIO testing" OFF)
 option(ET_DUMP_INPUT "Dump input in log" OFF)
 option(ET_DUMP_OUTPUT "Dump output in log" ON)
-option(FETCH_ETHOS_U_CONTENT "Fetch ethos_u dependencies instead of relying on pre-downloads" ON)
+option(FETCH_ETHOS_U_CONTENT "Fetch ethos_u dependencies instead of relying on pre-downloads" OFF)
 
 if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING})
   message(

@@ -539,6 +539,26 @@ set_property(
   PROPERTY IMPORTED_LOCATION
     "${ET_BUILD_DIR_PATH}/backends/cortex_m/libcortex_m_kernels.a"
 )
+add_library(cortex_m_cmsis_nn_ops_lib STATIC IMPORTED)
+set_property(
+  TARGET cortex_m_cmsis_nn_ops_lib
+  PROPERTY IMPORTED_LOCATION
+    "${ET_BUILD_DIR_PATH}/backends/cortex_m/cmsis-nn/ops/libcortex_m_cmsis_nn_ops_lib.a"
+)
+add_library(cortex_m_cmsis_kernels STATIC IMPORTED)
+set_property(
+  TARGET cortex_m_cmsis_kernels
+  PROPERTY IMPORTED_LOCATION
+    "${ET_BUILD_DIR_PATH}/backends/cortex_m/cmsis-nn/ops/libcortex_m_cmsis_kernels.a"
+)
+
+add_library(cmsis_nn STATIC IMPORTED)
+set_property(
+  TARGET cmsis_nn
+  PROPERTY IMPORTED_LOCATION
+    "/home/sidart/working/CMSIS-NN/build/libcmsis-nn.a"
+)
+
 add_library(extension_runner_util STATIC IMPORTED)
 set_property(
   TARGET extension_runner_util

@@ -580,11 +600,14 @@ list(APPEND arm_executor_runner_link
   "-Wl,--whole-archive"
   executorch_delegate_ethos_u
   cortex_m_ops_lib
+  cortex_m_cmsis_nn_ops_lib
   quantized_ops_lib
   portable_ops_lib
   quantized_kernels
-  cortex_m_kernels
   portable_kernels
+  cortex_m_kernels
+  cortex_m_cmsis_kernels
+  cmsis_nn
   "-Wl,--no-whole-archive"
   -Xlinker -Map=arm_executor_runner.map
 )

@@ -674,6 +697,10 @@ if(ET_DUMP_OUTPUT)
   target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUT)
 endif()
 
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections -fdata-sections -fno-exceptions -fno-unwind-tables")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffunction-sections -fdata-sections -fno-exceptions -fno-unwind-tables")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections")
+
 # Fixup compilation of retarget.c
 if(SEMIHOSTING)
   # Remove this when MLBEDSW-8910 is closed.
