
Commit eb3fa53

Merge remote-tracking branch 'origin/torchtune-update' into split-utils
2 parents: 2002fce + b96ef98


45 files changed: +713, -248 lines

.lintrunner.toml

Lines changed: 7 additions & 0 deletions
@@ -220,6 +220,13 @@ exclude_patterns = [
     'extension/**',
     'kernels/optimized/**',
     # Justified <functional> include.
+    'kernels/portable/cpu/op_bitwise*.cpp',
+    'kernels/portable/cpu/op_eq.cpp',
+    'kernels/portable/cpu/op_ge.cpp',
+    'kernels/portable/cpu/op_gt.cpp',
+    'kernels/portable/cpu/op_le.cpp',
+    'kernels/portable/cpu/op_lt.cpp',
+    'kernels/portable/cpu/op_ne.cpp',
     'runtime/kernel/thread_parallel_interface.h',
     'scripts/**',
     'third-party/**',

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -242,6 +242,8 @@ option(EXECUTORCH_USE_DL "Use libdl library" ON)

 option(EXECUTORCH_BUILD_CADENCE "Build the Cadence DSP backend" OFF)

+option(EXECUTORCH_BUILD_CORTEX_M "Build the Cortex-M backend" OFF)
+
 #
 # pthreadpool: build pthreadpool library. Disable on unsupported platforms
 #
@@ -717,6 +719,10 @@ if(EXECUTORCH_BUILD_XNNPACK)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
 endif()

+if(EXECUTORCH_BUILD_CORTEX_M)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
+endif()
+
 if(EXECUTORCH_BUILD_DEVTOOLS)
   if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
     set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER

backends/arm/scripts/build_executorch.sh

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@ cmake \
     -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+    -DEXECUTORCH_BUILD_CORTEX_M=ON \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     ${build_devtools_flags} \
     ${build_with_etdump_flags} \

backends/arm/scripts/pre-push

Lines changed: 7 additions & 5 deletions
@@ -8,11 +8,13 @@
 # non-interactive mode. "$#" gives the number of positional arguments.
 [ "$#" -eq 0 ] && is_script_interactive=1 || is_script_interactive=0

-RESET='\e[0m'
-RED='\e[31m'
-GREEN='\e[32m'
-YELLOW='\e[33m'
-BLUE='\e[34m'
+if [ $is_script_interactive -eq 1 ]; then
+    RESET='\e[0m'
+    RED='\e[31m'
+    GREEN='\e[32m'
+    YELLOW='\e[33m'
+    BLUE='\e[34m'
+fi

 INFO="${BLUE}[INFO]${RESET}"
 WARNING="${YELLOW}[WARNING]${RESET}"

backends/arm/test/test_arm_baremetal.sh

Lines changed: 7 additions & 0 deletions
@@ -154,6 +154,13 @@ test_run_ethosu_fvp() { # End to End model tests using run.sh
     echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85"
     examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=add
     examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=mul
+
+    # Cortex-M op tests
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=qadd --bundleio
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=qops --bundleio
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u55-128 --model_name=qops --bundleio --no_delegate --portable_kernels="aten::sub.out,aten::add.out,aten::mul.out"
+    examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=qops --bundleio
+
     echo "${TEST_SUITE_NAME}: PASS"
 }

backends/cadence/aot/fuse_ops.py

Lines changed: 11 additions & 2 deletions
@@ -885,6 +885,9 @@ class FuseTransposeOrPermuteOpPairsPass(FuseOpPairsAcrossBranchesPass):
     """
     Fuse transpose or permute op pairs to a single view op.
     (transpose or permutation) -> (quant or dequant) -> (transpose or permutation)
+    This happens when op2(op1) == identity, modulo unitary dimensions.
+    'unitary dimensions' example: a tensor of shape [1, 5, 30] is equivalent (in memory) to [5, 1, 30]
+    so transpose(1, 2) then transpose(0, 2) is a pseudo identity and should be fused.
     """

     # A list of ops that can be bypassed when looking for a
@@ -908,7 +911,7 @@ def can_fuse_for_chain(
         if not super().can_fuse_for_chain(producer, consumer, consumer_op_packets):
             return False

-        # checking that permut2(permut1(identify)) == identity
+        # checking that permut2(permut1(identity)) == identity, modulo unitary dimensions
         input_shape = cast(torch.fx.Node, producer.args[0]).meta["val"].shape
         ident_dims = list(range(len(input_shape)))
         # this mapping helps to handle both transpose and permutations
@@ -918,14 +921,20 @@ def can_fuse_for_chain(
         }
         in_dims = f[producer.target](producer, ident_dims)
         out_dims = f[consumer.target](consumer, in_dims)
-        return out_dims == ident_dims
+        # Filtering out unitary dimensions
+        non_unit_ident_dims = [dim for dim in ident_dims if input_shape[dim] != 1]
+        non_unit_out_dims = [dim for dim in out_dims if input_shape[dim] != 1]
+        return non_unit_out_dims == non_unit_ident_dims

     def get_fused_node(
         self,
         producer: torch.fx.Node,
         consumer: torch.fx.Node,
         graph_module: torch.fx.GraphModule,
     ) -> torch.fx.Node:
+        # This step is important because of how we can fuse transpositions that are not perfectly
+        # reverse one of another but will be fused if there are unitary dimensions.
+        # The fused operation must have the same output shape as the consumer.
         output_shape = consumer.meta["val"].shape
         with graph_module.graph.inserting_after(consumer):
             view = graph_module.graph.call_function(
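
To make the unitary-dimension argument above concrete, here is a small standalone sketch (not part of the commit; transpose_dims is a hypothetical stand-in for the dim tracking that can_fuse_for_chain performs through f[producer.target]):

# Minimal sketch of the "identity modulo unitary dimensions" check, assuming a
# transpose is tracked by swapping two entries of the dim list.
def transpose_dims(dims, d0, d1):
    out = list(dims)
    out[d0], out[d1] = out[d1], out[d0]
    return out

input_shape = [5, 40, 1]                   # index 2 is a unitary (size-1) dimension
ident_dims = list(range(len(input_shape)))

# producer transpose(0, 1), consumer transpose(0, 2): not exact inverses
in_dims = transpose_dims(ident_dims, 0, 1)
out_dims = transpose_dims(in_dims, 0, 2)   # [2, 0, 1]

# Compare only the non-unitary dimensions, as the pass does
non_unit_ident = [d for d in ident_dims if input_shape[d] != 1]
non_unit_out = [d for d in out_dims if input_shape[d] != 1]
print(non_unit_out == non_unit_ident)      # True: memory order is unchanged, fuse to a view

This mirrors the new [5, 40, 1] test case in the file below: the composed transposes only move the size-1 dimension, so the pair can collapse to a single view op.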

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 44 additions & 0 deletions
@@ -584,6 +584,28 @@ def _create_operator(
             exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
             False,
         ),
+        # transpose -> quant -> transpose is not the reverse BUT there is a UNITARY dimension
+        # so it ends up being the same on memory => fuse
+        (
+            True,
+            [0, 1],
+            True,
+            [0, 2],
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            True,
+            [5, 40, 1],
+        ),
+        # transpose -> quant -> transpose is not the reverse, and unitary dimensions
+        # don't help => don't fuse
+        (
+            True,
+            [0, 1],
+            True,
+            [1, 3],
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            False,
+            [5, 40, 1, 4],
+        ),
         # permutation -> quant -> opposite permutation => fuse
         (
             False,
@@ -622,6 +644,28 @@ def _create_operator(
             False,
             [4, 4, 4],
         ),
+        # permutation -> quant -> a non reverse permutation BUT there is a UNITARY dimension
+        # so it ends up being the same on memory => fuse
+        (
+            False,
+            [1, 3, 2, 0],
+            False,
+            [3, 2, 1, 0],
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            True,
+            [3, 1, 8, 10],
+        ),
+        # permutation -> quant -> a non reverse permutation, and unitary dimensions
+        # don't help => don't fuse
+        (
+            False,
+            [1, 3, 2, 0],
+            False,
+            [3, 1, 2, 0],
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            False,
+            [3, 1, 8, 10],
+        ),
         # transpose -> quant -> transpose as a permutation => fuse
         (
             True,
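
The new permutation cases follow the same filtering logic; a rough check of the expected outcomes (illustrative only, not part of the commit, and assuming a permutation is applied to the tracked dim list as dims = [dims[p] for p in perm]):

def permute_dims(dims, perm):
    return [dims[p] for p in perm]

input_shape = [3, 1, 8, 10]              # index 1 is the unitary dimension
ident = list(range(4))

# [1, 3, 2, 0] then [3, 2, 1, 0]: identity once the size-1 dim is ignored => fuse
out_fuse = permute_dims(permute_dims(ident, [1, 3, 2, 0]), [3, 2, 1, 0])     # [0, 2, 3, 1]
# [1, 3, 2, 0] then [3, 1, 2, 0]: still a real reordering => don't fuse
out_no_fuse = permute_dims(permute_dims(ident, [1, 3, 2, 0]), [3, 1, 2, 0])  # [0, 3, 2, 1]

keep = lambda dims: [d for d in dims if input_shape[d] != 1]
print(keep(out_fuse) == keep(ident))     # True
print(keep(out_no_fuse) == keep(ident))  # False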

backends/cortex_m/CMakeLists.txt

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Kernel library for Cortex-M operators. Please keep this file formatted by running:
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+cmake_minimum_required(VERSION 3.19)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
+
+if(NOT PYTHON_EXECUTABLE)
+  resolve_python_executable()
+endif()
+
+# Cortex-M ops kernel sources
+set(_cortex_m_kernels__srcs
+    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
+)
+
+# Generate C++ bindings to register kernels into Executorch (for runtime).
+# Here select all ops in operators.yaml
+set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
+gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")
+
+# Generate bindings for the kernels
+generate_bindings_for_kernels(
+  LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
+)
+message("Generated files ${gen_command_sources}")
+
+# Build a library for _cortex_m_kernels__srcs
+add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
+target_link_libraries(cortex_m_kernels PRIVATE executorch)
+target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})
+
+# cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
+gen_operators_lib(
+  LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
+)
+
+install(
+  TARGETS cortex_m_kernels cortex_m_ops_lib
+  DESTINATION lib
+  PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/ops/
+)

backends/cortex_m/ops/op_dequantize_per_tensor.cpp

Lines changed: 70 additions & 23 deletions
@@ -29,6 +29,7 @@ namespace {
  */
 void check_dequantize_args(
     const Tensor& input,
+    int64_t zero_point,
     int64_t quant_min,
     int64_t quant_max,
     ScalarType dtype,
@@ -39,6 +40,18 @@ void check_dequantize_args(
       "input.scalar_type() %" PRId8 " is not char type",
       static_cast<int8_t>(input.scalar_type()));

+  // Check zp range
+  ET_CHECK_MSG(
+      zero_point >= quant_min,
+      "zero_point must be %" PRId64 " <= quant_min %" PRId64,
+      zero_point,
+      quant_min);
+  ET_CHECK_MSG(
+      zero_point <= quant_max,
+      "zero_point must be %" PRId64 " >= quant_max %" PRId64,
+      zero_point,
+      quant_max);
+
   // Check output dtype is float
   ET_CHECK_MSG(
       out.scalar_type() == ScalarType::Float,
@@ -73,18 +86,10 @@
 /**
  * Scalar implementation of quantization for a single value.
  */
-template <typename K, typename T>
-T dequantize_val(
-    float scale,
-    int32_t zero_point,
-    K value,
-    int64_t quant_min,
-    int64_t quant_max) {
-  (void)quant_min;
-  (void)quant_max;
-  return static_cast<T>((static_cast<int32_t>(value) - zero_point) * scale);
+template <typename Q, typename F>
+F dequantize_val(float scale, int32_t zero_point, Q qvalue) {
+  return static_cast<F>((static_cast<int32_t>(qvalue) - zero_point) * scale);
 }
-
 } // namespace

 Tensor& dequantize_per_tensor_out(
@@ -106,29 +111,71 @@ Tensor& dequantize_per_tensor_out(
       "Failed to resize out Tensor in dequantize_per_tensor_out");

   // Validate input parameters
-  check_dequantize_args(input, quant_min, quant_max, dtype, out);
+  check_dequantize_args(input, zero_point, quant_min, quant_max, dtype, out);

-  // Pre-compute inverse scale for better performance
   int32_t zp = static_cast<int32_t>(zero_point);
-  int32_t qmin = static_cast<int32_t>(quant_min);
-  int32_t qmax = static_cast<int32_t>(quant_max);

   // Get pointers to input and output data
   const int8_t* input_data = input.const_data_ptr<int8_t>();
   float* out_data = out.mutable_data_ptr<float>();
   const size_t numel = input.numel();

+  size_t i = 0;
 #if defined(HAS_HELIUM_SIMD)
-  // Helium MVE implementation for float32 to int8 quantization
-  #Error "Implement MVE version!"
-  #else
-  // Scalar implementation for float32 to int8 quantization
-  for (size_t i = 0; i < numel; i++) {
-    out_data[i] =
-        dequantize_val<int8_t, float>(scale, zp, input_data[i], qmin, qmax);
+  // Helium MVE implementation for int8 to float quantization
+  static uint8x16_t voffset{
+      0x0,
+      0x8,
+      0x4,
+      0xC,
+      0x1,
+      0x9,
+      0x5,
+      0xD,
+      0x2,
+      0xA,
+      0x6,
+      0xE,
+      0x3,
+      0xB,
+      0x7,
+      0xF};
+
+  int16x8_t vzp = vdupq_n_s16(static_cast<int16_t>(zp));
+  float32x4_t vscale = vdupq_n_f32(static_cast<float>(scale));
+
+  for (; i + 15 < numel; i += 16) {
+    int8x16_t in_084C195D2A6E3B7F =
+        vldrbq_gather_offset_s8(input_data, voffset);
+
+    int16x8_t in_04152637 = vsubq_s16(vmovlbq_s8(in_084C195D2A6E3B7F), vzp);
+    int16x8_t in_8C9DAEBF = vsubq_s16(vmovltq_s8(in_084C195D2A6E3B7F), vzp);
+
+    float32x4_t inf_0123 = vcvtq_f32_s32(vmovlbq_s16(in_04152637));
+    float32x4_t inf_4567 = vcvtq_f32_s32(vmovltq_s16(in_04152637));
+    float32x4_t inf_89AB = vcvtq_f32_s32(vmovlbq_s16(in_8C9DAEBF));
+    float32x4_t inf_CDEF = vcvtq_f32_s32(vmovltq_s16(in_8C9DAEBF));
+
+    float32x4_t out_0123 = vmulq_f32(inf_0123, vscale);
+    float32x4_t out_4567 = vmulq_f32(inf_4567, vscale);
+    float32x4_t out_89AB = vmulq_f32(inf_89AB, vscale);
+    float32x4_t out_CDEF = vmulq_f32(inf_CDEF, vscale);
+
+    vstrwq_f32(out_data + 0, out_0123);
+    vstrwq_f32(out_data + 4, out_4567);
+    vstrwq_f32(out_data + 8, out_89AB);
+    vstrwq_f32(out_data + 12, out_CDEF);
+
+    input_data += 16;
+    out_data += 16;
   }
-#endif
+#endif // defined(HAS_HELIUM_SIMD)

+  for (; i < numel; i++) {
+    *out_data = dequantize_val<int8_t, float>(scale, zp, *input_data);
+    *input_data++;
+    *out_data++;
+  }
   return out;
 }
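
For reference, the scalar path above is plain affine dequantization, real = (q - zero_point) * scale, with zero_point required to lie in [quant_min, quant_max]; a small standalone check (not part of the commit):

def dequantize(scale, zero_point, qvalue):
    # real = (q - zero_point) * scale, mirroring dequantize_val in the diff above
    return (qvalue - zero_point) * scale

print(dequantize(0.05, -10, -10))  # 0.0   (the zero point maps back to 0.0)
print(dequantize(0.05, -10, 117))  # 6.35

The gather offsets in the Helium MVE path look arbitrary, but they are chosen so that two rounds of bottom/top widening (even and odd lanes, respectively) leave the sixteen int8 values in their original order across the four float vectors. A quick way to convince yourself, again purely illustrative and assuming the vmovlb*/vmovlt* intrinsics pick the even and odd lanes:

voffset = [0x0, 0x8, 0x4, 0xC, 0x1, 0x9, 0x5, 0xD,
           0x2, 0xA, 0x6, 0xE, 0x3, 0xB, 0x7, 0xF]

gathered = voffset                             # lane k of the gather holds input_data[voffset[k]]
bottom, top = gathered[0::2], gathered[1::2]   # 8-bit -> 16-bit widening (even / odd lanes)
print(bottom[0::2], bottom[1::2])              # [0, 1, 2, 3] [4, 5, 6, 7]       -> out_0123, out_4567
print(top[0::2], top[1::2])                    # [8, 9, 10, 11] [12, 13, 14, 15] -> out_89AB, out_CDEF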
