
Commit 362f3f7

sidart authored and Github Executorch committed
Summary: Initial CMSIS-NN integration for Quantized Add Op
Test Plan:
a) Set up the Arm FVP and run 'examples/arm/run.sh' (check that there are no regressions in the e2e test scenarios).
b) Add another run.sh iteration with qadd and only the --quantize flag, and verify that the quantized add op is called.
c) cd backends/cortex_m/test/; python test_quantize_add_fusion_pass.py
----------------------------------------------------------------------
Ran 8 tests in 11.128s

OK

Reviewers:
Subscribers:
Tasks:
Tags:
1 parent fc87462 commit 362f3f7

16 files changed: +1145 −107 lines changed

backends/cortex_m/CMakeLists.txt

Lines changed: 45 additions & 1 deletion
@@ -24,11 +24,41 @@ endif()
 
  include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
  include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
+ include(ExternalProject)
+
+ # Download and build CMSIS-NN from GitHub
+ set(CMSIS_NN_VERSION
+     "v4.1.0"
+     CACHE STRING "CMSIS-NN version to download"
+ )
+ set(CMSIS_NN_ROOT ${CMAKE_CURRENT_BINARY_DIR}/cmsis-nn)
+ set(CMSIS_NN_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmsis-nn-build)
+ set(CMSIS_NN_LIB_PATH ${CMSIS_NN_BINARY_DIR}/libcmsis-nn.a)
+
+ set(TARGET_CPU
+     "cortex-m55"
+     CACHE STRING "Target CPU for CMSIS-NN build"
+ )
+ ExternalProject_Add(
+   cmsis_nn_external
+   GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
+   GIT_TAG ${CMSIS_NN_VERSION}
+   SOURCE_DIR ${CMSIS_NN_ROOT}
+   BINARY_DIR ${CMSIS_NN_BINARY_DIR}
+   CMAKE_ARGS
+     -DCMAKE_TOOLCHAIN_FILE=${EXECUTORCH_ROOT}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
+     -DTARGET_CPU=${TARGET_CPU}
+     -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+   BUILD_COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --parallel
+   INSTALL_COMMAND ""
+   BUILD_BYPRODUCTS ${CMSIS_NN_LIB_PATH}
+ )
 
  # Cortex-M ops kernel sources
  set(_cortex_m_kernels__srcs
      ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
  )
 
  # Generate C++ bindings to register kernels into Executorch (for runtime). Here
@@ -44,9 +74,23 @@ message("Generated files ${gen_command_sources}")
 
  # Build a library for _cortex_m_kernels_srcs
  add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
- target_link_libraries(cortex_m_kernels PRIVATE executorch)
  target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})
 
+ # Add dependency on CMSIS-NN external project
+ add_dependencies(cortex_m_kernels cmsis_nn_external)
+
+ # Set include directories - Include is directly in CMSIS-NN root
+ target_include_directories(
+   cortex_m_kernels
+   PRIVATE ${EXECUTORCH_ROOT}/..
+           ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+           $<BUILD_INTERFACE:${CMSIS_NN_ROOT}/Include>
+           $<BUILD_INTERFACE:${CMSIS_NN_ROOT}>
+ )
+
+ # Link against the CMSIS-NN static library directly
+ target_link_libraries(cortex_m_kernels PUBLIC ${CMSIS_NN_LIB_PATH} executorch)
+
  # cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
  gen_operators_lib(
    LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch

backends/cortex_m/ops/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ python_library(
      ],
      deps = [
          "fbcode//caffe2:torch",
+         "//executorch/backends/cortex_m/passes:passes_utils",
      ],
  )

backends/cortex_m/ops/cortex_m_ops_common.h

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+ /*
+  * Copyright (c) Meta Platforms, Inc. and affiliates.
+  * All rights reserved.
+  *
+  * This source code is licensed under the BSD-style license found in the
+  * LICENSE file in the root directory of this source tree.
+  */
+
+ #pragma once
+
+ #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+ #include <executorch/runtime/kernel/kernel_includes.h>
+
+ // Include CMSIS-NN headers with C linkage
+ extern "C" {
+ #include "arm_nnfunctions.h"
+ }
+
+ using Tensor = torch::executor::Tensor;
+ using ScalarType = executorch::aten::ScalarType;
+ using Scalar = torch::executor::Scalar;
+ using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+ using Error = executorch::runtime::Error;
+
+ inline void validate_quantized_inputs(
+     KernelRuntimeContext& context,
+     const Tensor& input1,
+     const Tensor& input2,
+     Tensor& output) {
+   ET_CHECK_MSG(input1.scalar_type() == ScalarType::Char, "Input1 must be int8");
+   ET_CHECK_MSG(input2.scalar_type() == ScalarType::Char, "Input2 must be int8");
+   ET_CHECK_MSG(output.scalar_type() == ScalarType::Char, "Output must be int8");
+   ET_CHECK_MSG(
+       input1.sizes() == input2.sizes(), "Input tensors must be the same shape");
+   ET_CHECK_MSG(
+       input1.scalar_type() == input2.scalar_type(),
+       "Input tensors must be the same dtype");
+   ET_CHECK_MSG(
+       (torch::executor::resize_to_broadcast_target_size(
+            input1, input2, output) == Error::Ok),
+       "Broadcast error: resize_to_broadcast_target_size failed");
+   return;
+ }
backends/cortex_m/ops/op_quantized_add.cpp

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+ /*
+  * Copyright (c) Meta Platforms, Inc. and affiliates.
+  * All rights reserved.
+  *
+  * This source code is licensed under the BSD-style license found in the
+  * LICENSE file in the root directory of this source tree.
+  */
+
+ #include "cortex_m_ops_common.h"
+
+ namespace cortex_m {
+ namespace native {
+
+ Tensor& quantized_add_out(
+     KernelRuntimeContext& context,
+     const Tensor& input1_int8,
+     const Scalar& input1_zero_point,
+     const Scalar& input1_multiplier,
+     const Scalar& input1_shift,
+     const Tensor& input2_int8,
+     const Scalar& input2_zero_point,
+     const Scalar& input2_multiplier,
+     const Scalar& input2_shift,
+     const Scalar& output_zero_point,
+     const Scalar& output_multiplier,
+     const Scalar& output_shift,
+     Tensor& out) {
+   validate_quantized_inputs(context, input1_int8, input2_int8, out);
+
+   ET_LOG(
+       Info,
+       "quantized_add_out: input1_int8.sizes() = %zu",
+       input1_int8.sizes().size());
+
+   // Use to<int64_t>(), which ExecuTorch commonly instantiates, and narrow
+   // to the integer widths that CMSIS-NN expects.
+   int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
+   int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
+   int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());
+
+   int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
+   int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
+   int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());
+
+   int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
+   int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
+   int output_shift_val = static_cast<int>(output_shift.to<int64_t>());
+
+   // Left shift to maximize precision (tune as needed)
+   const int32_t left_shift = 20;
+   const int32_t activation_min = std::numeric_limits<int8_t>::min();
+   const int32_t activation_max = std::numeric_limits<int8_t>::max();
+
+   // Resize output tensor to match input shape
+   auto err = torch::executor::resize_tensor(out, input1_int8.sizes());
+   if (err != executorch::runtime::Error::Ok) {
+     ET_LOG(
+         Error,
+         "quantized_add_out: resize_tensor failed with error code [%d]",
+         static_cast<int>(err));
+     std::memset(out.mutable_data_ptr<int8_t>(), 0, out.nbytes());
+     return out;
+   }
+
+   ET_LOG(
+       Info,
+       "Using AoT-computed parameters: input1[mult=%d, shift=%d], input2[mult=%d, shift=%d], output[mult=%d, shift=%d]",
+       input1_mult,
+       input1_shift_val,
+       input2_mult,
+       input2_shift_val,
+       output_mult,
+       output_shift_val);
+
+   // Call CMSIS-NN kernel with precomputed parameters
+   arm_cmsis_nn_status status = arm_elementwise_add_s8(
+       input1_int8.const_data_ptr<int8_t>(),
+       input2_int8.const_data_ptr<int8_t>(),
+       static_cast<int32_t>(zp1),
+       input1_mult,
+       input1_shift_val,
+       static_cast<int32_t>(zp2),
+       input2_mult,
+       input2_shift_val,
+       left_shift,
+       out.mutable_data_ptr<int8_t>(),
+       static_cast<int32_t>(out_zp),
+       output_mult,
+       output_shift_val,
+       static_cast<int32_t>(out.numel()),
+       activation_min,
+       activation_max);
+
+   if (status != ARM_CMSIS_NN_SUCCESS) {
+     ET_LOG(
+         Error,
+         "quantized_add_out: arm_elementwise_add_s8 failed with status [%d]",
+         status);
+     std::memset(out.mutable_data_ptr<int8_t>(), 0, out.nbytes());
+   } else {
+     ET_LOG(
+         Info,
+         "quantized_add_out: Successfully completed with AoT-computed parameters!");
+   }
+
+   return out;
+ }
+
+ // Stub implementation: non-out (functional) variant for compatibility.
+ // EXIR/ExecuTorch runs an out-variant pass that converts .default operations
+ // to .out variants before memory planning. The fusion pass calls quantized_add's
+ // default variant, but ExecuTorch's kernel dispatch mechanism ends up calling
+ // the out variant; this stub only exists so the compiler doesn't complain.
+ Tensor quantized_add(
+     KernelRuntimeContext& context,
+     const Tensor& input1_int8,
+     const Scalar& input1_zero_point,
+     const Scalar& input1_multiplier,
+     const Scalar& input1_shift,
+     const Tensor& input2_int8,
+     const Scalar& input2_zero_point,
+     const Scalar& input2_multiplier,
+     const Scalar& input2_shift,
+     const Scalar& output_zero_point,
+     const Scalar& output_multiplier,
+     const Scalar& output_shift) {
+   ET_LOG(
+       Info,
+       "quantized_add: input1_int8.sizes() = %zu",
+       input1_int8.sizes().size());
+
+   // Crash on debug builds if invoked
+   assert(false);
+   // This is to make sure the compiler doesn't complain.
+   return const_cast<Tensor&>(input1_int8);
+ }
+
+ } // namespace native
+ } // namespace cortex_m
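
For readers unfamiliar with the multiplier/shift arguments above: the AoT side decomposes each real-valued quantization scale into an int32 fixed-point multiplier plus a power-of-two shift, and the kernel applies them with integer-only arithmetic. Below is a rough Python sketch of that convention (scale ~= multiplier * 2**shift / 2**31). The helper names quantize_multiplier and apply_multiplier are illustrative only, not part of this commit, and the rounding is simplified relative to CMSIS-NN's exact behavior.

import math

def quantize_multiplier(real_scale: float) -> tuple[int, int]:
    """Decompose a positive real scale into (multiplier, shift) such that
    real_scale ~= multiplier * 2**shift / 2**31, with multiplier in [2**30, 2**31)."""
    if real_scale == 0.0:
        return 0, 0
    significand, shift = math.frexp(real_scale)  # real_scale = significand * 2**shift, significand in [0.5, 1)
    multiplier = round(significand * (1 << 31))
    if multiplier == (1 << 31):  # rounding pushed the significand up to 1.0
        multiplier //= 2
        shift += 1
    return multiplier, shift

def apply_multiplier(x: int, multiplier: int, shift: int) -> int:
    """Rescale an integer by the represented real scale, rounding to nearest.
    Simplified sketch: assumes shift < 31 and rounds negative values toward -inf."""
    product = x * multiplier           # would be a 64-bit intermediate in C
    total_shift = 31 - shift
    rounding = 1 << (total_shift - 1)
    return (product + rounding) >> total_shift

# Example: encode a scale of 0.0037 and rescale an int32 accumulator with it.
mult, sh = quantize_multiplier(0.0037)
print(mult, sh, apply_multiplier(12345, mult, sh), round(12345 * 0.0037))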

backends/cortex_m/ops/operators.py

Lines changed: 110 additions & 1 deletion
@@ -7,9 +7,14 @@
  import torch
  from executorch.exir.dialects._ops import (
      ops as exir_ops,
- )  # To provide the implementation of the operators
+ )
+ # To provide the implementation of the operators
  from torch.library import impl, Library, register_fake
 
+ from executorch.backends.cortex_m.passes.passes_utils import (
+     dequantize_tensor, quantize_tensor,
+ )
+
  # New operator library with a custom namespace to allow fusion etc.
  lib = Library("cortex_m", "DEF")

@@ -96,3 +101,107 @@ def dequantize_per_tensor_impl(
      return exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default(
          input, scale, zero_point, quant_min, quant_max, dtype
      )
+
+
+ # Define the operator schema with multipliers and shifts (11 args)
+ lib.define(
+     "quantized_add("
+     "Tensor self, Scalar self_zero_point, Scalar self_multiplier, Scalar self_shift, "
+     "Tensor other, Scalar other_zero_point, Scalar other_multiplier, Scalar other_shift, "
+     "Scalar output_zero_point, Scalar output_multiplier, Scalar output_shift) -> Tensor"
+ )
+
+
+ @register_fake("cortex_m::quantized_add")
+ def quantized_add_meta(
+     self: torch.Tensor,
+     self_zero_point: int,
+     self_multiplier: int,
+     self_shift: int,
+     other: torch.Tensor,
+     other_zero_point: int,
+     other_multiplier: int,
+     other_shift: int,
+     output_zero_point: int,
+     output_multiplier: int,
+     output_shift: int,
+ ) -> torch.Tensor:
+     return torch.empty_like(self, dtype=torch.int8)
+
+
+ @impl(lib, "quantized_add", "CompositeExplicitAutograd")
+ def quantized_add_impl(
+     self: torch.Tensor,
+     self_zero_point: int,
+     self_multiplier: int,
+     self_shift: int,
+     other: torch.Tensor,
+     other_zero_point: int,
+     other_multiplier: int,
+     other_shift: int,
+     output_zero_point: int,
+     output_multiplier: int,
+     output_shift: int,
+ ) -> torch.Tensor:
+     self_fp = dequantize_tensor(self, self_zero_point, self_multiplier, self_shift)
+     other_fp = dequantize_tensor(other, other_zero_point, other_multiplier, other_shift)
+     result_fp = self_fp + other_fp
+     result_quantized = quantize_tensor(result_fp, output_zero_point, output_multiplier, output_shift)
+     return result_quantized
+
+
+ # Define the operator schema with multipliers and shifts (11 args + out tensor)
+ lib.define(
+     "quantized_add.out("
+     "Tensor self, Scalar self_zero_point, Scalar self_multiplier, Scalar self_shift, "
+     "Tensor other, Scalar other_zero_point, Scalar other_multiplier, Scalar other_shift, "
+     "Scalar output_zero_point, Scalar output_multiplier, Scalar output_shift, "
+     "*, Tensor(a!) out) -> Tensor(a!)"
+ )
+
+
+ # Fake meta function for shape and dtype inference during compilation
+ @register_fake("cortex_m::quantized_add.out")
+ def quantized_add_out_meta(
+     self: torch.Tensor,
+     self_zero_point: int,
+     self_multiplier: int,
+     self_shift: int,
+     other: torch.Tensor,
+     other_zero_point: int,
+     other_multiplier: int,
+     other_shift: int,
+     output_zero_point: int,
+     output_multiplier: int,
+     output_shift: int,
+     out: torch.Tensor,
+ ) -> torch.Tensor:
+     # Validate shape compatibility if needed; output dtype is int8
+     assert out.shape == self.shape, "Output shape must match input shape"
+     return out
+
+
+ # Actual implementation delegating to backend or custom kernel
+ @impl(lib, "quantized_add.out", "CompositeExplicitAutograd")
+ def quantized_add_out_impl(
+     self: torch.Tensor,
+     self_zero_point: int,
+     self_multiplier: int,
+     self_shift: int,
+     other: torch.Tensor,
+     other_zero_point: int,
+     other_multiplier: int,
+     other_shift: int,
+     output_zero_point: int,
+     output_multiplier: int,
+     output_shift: int,
+     *,
+     out: torch.Tensor,
+ ) -> torch.Tensor:
+     self_fp = dequantize_tensor(self, self_zero_point, self_multiplier, self_shift)
+     other_fp = dequantize_tensor(other, other_zero_point, other_multiplier, other_shift)
+     result_fp = self_fp + other_fp
+     result_quantized = quantize_tensor(result_fp, output_zero_point, output_multiplier, output_shift)
+
+     # Write into the provided output tensor
+     out.copy_(result_quantized)
+
+     return out
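
The eager implementations above rely on dequantize_tensor and quantize_tensor from backends/cortex_m/passes/passes_utils, which are not part of this diff. As a rough sketch of what such helpers could look like under the same assumed multiplier/shift convention (scale ~= multiplier * 2**shift / 2**31) — an assumption, since the actual passes_utils code is not shown here:

import torch

def _scale_from(multiplier: int, shift: int) -> float:
    # Assumed convention: scale ~= multiplier * 2**shift / 2**31
    return multiplier * (2.0 ** shift) / (1 << 31)

def dequantize_tensor(t: torch.Tensor, zero_point: int, multiplier: int, shift: int) -> torch.Tensor:
    # int8 -> float: subtract the zero point, then apply the reconstructed scale
    return (t.to(torch.float32) - zero_point) * _scale_from(multiplier, shift)

def quantize_tensor(t: torch.Tensor, zero_point: int, multiplier: int, shift: int) -> torch.Tensor:
    # float -> int8: divide by the scale, add the zero point, round and clamp
    q = torch.round(t / _scale_from(multiplier, shift)) + zero_point
    return q.clamp(-128, 127).to(torch.int8)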

backends/cortex_m/ops/operators.yaml

Lines changed: 12 additions & 0 deletions
@@ -15,3 +15,15 @@
    kernels:
      - arg_meta: null
        kernel_name: cortex_m::dequantize_per_tensor_out
+
+ - func: cortex_m::quantized_add(Tensor self, Scalar self_zero_point, Scalar self_multiplier, Scalar self_shift, Tensor other, Scalar other_zero_point, Scalar other_multiplier, Scalar other_shift, Scalar output_zero_point, Scalar output_multiplier, Scalar output_shift) -> Tensor
+   variants: function
+   kernels:
+     - arg_meta: null
+       kernel_name: cortex_m::quantized_add
+
+ - func: cortex_m::quantized_add.out(Tensor self, Scalar self_zero_point, Scalar self_multiplier, Scalar self_shift, Tensor other, Scalar other_zero_point, Scalar other_multiplier, Scalar other_shift, Scalar output_zero_point, Scalar output_multiplier, Scalar output_shift, *, Tensor(a!) out) -> Tensor(a!)
+   variants: function
+   kernels:
+     - arg_meta: null
+       kernel_name: cortex_m::quantized_add_out
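
Once backends/cortex_m/ops/operators.py has been imported, the new op is also callable eagerly through torch.ops for quick sanity checks, since it is registered in the "cortex_m" library with a CompositeExplicitAutograd implementation. A minimal sketch is below; the zero points, multipliers, and shifts are made-up illustrative values (corresponding to a scale of 1.0 under the assumed convention), whereas in practice the AoT fusion pass computes them, and the call assumes the passes_utils helpers are importable in the environment.

import torch
import executorch.backends.cortex_m.ops.operators  # registers the cortex_m op library

x = torch.randint(-128, 127, (1, 8), dtype=torch.int8)
y = torch.randint(-128, 127, (1, 8), dtype=torch.int8)

# Illustrative quantization parameters only; multiplier=1<<30 with shift=1
# encodes a scale of 1.0 under the assumed multiplier/shift convention.
out = torch.ops.cortex_m.quantized_add(
    x, 0, 1 << 30, 1,   # self: zero_point, multiplier, shift
    y, 0, 1 << 30, 1,   # other: zero_point, multiplier, shift
    0, 1 << 30, 1,      # output: zero_point, multiplier, shift
)
print(out.dtype, out.shape)  # torch.int8, same shape as the inputs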
