Commit 1348f0c

Cortex_m backend: Add mul op

Signed-off-by: Adrian Lundell <[email protected]>
Change-Id: Ic116e5294d9362f3a43655629d2a3c0f338a2fd5

1 parent 80c9040 commit 1348f0c

File tree: 11 files changed, +233 −26 lines changed

backends/cortex_m/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -57,6 +57,7 @@ set(_cortex_m_kernels__srcs
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_mul.cpp
 )

 # Generate C++ bindings to register kernels into Executorch
backends/cortex_m/ops/op_quantized_mul.cpp

Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
+/*
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "cortex_m_ops_common.h"
+
+// Include CMSIS-NN headers with C linkage
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
+namespace cortex_m {
+namespace native {
+namespace {
+
+constexpr int32_t kInt8ActivationMin = std::numeric_limits<int8_t>::min();
+constexpr int32_t kInt8ActivationMax = std::numeric_limits<int8_t>::max();
+
+} // namespace
+
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+
+Tensor& quantized_mul_out(
+    KernelRuntimeContext& context,
+    const Tensor& input1_int8,
+    const Scalar& input1_zero_point,
+    const Tensor& input2_int8,
+    const Scalar& input2_zero_point,
+    const Scalar& output_zero_point,
+    const Scalar& output_multiplier,
+    const Scalar& output_shift,
+    Tensor& out) {
+  // Validate tensor types and quantization parameters
+  validate_cmsis_nn_tensor_requirements(input1_int8, input2_int8, out);
+
+  const Scalar kIdentityMultiplier(/*value=*/1);
+  const Scalar kZeroShift(/*value=*/0);
+  validate_quantization_params(
+      input1_zero_point,
+      kIdentityMultiplier,
+      kZeroShift,
+      input2_zero_point,
+      kIdentityMultiplier,
+      kZeroShift,
+      output_zero_point,
+      output_multiplier,
+      output_shift,
+      out);
+
+  // Extract quantization parameters
+  const int32_t zp1 = extractScalarToInt32(input1_zero_point);
+  const int32_t zp2 = extractScalarToInt32(input2_zero_point);
+  const int32_t out_zp = extractScalarToInt32(output_zero_point);
+  const int32_t output_mult = extractScalarToInt32(output_multiplier);
+  const int32_t output_shift_val = extractScalarToInt(output_shift);
+
+  // Call CMSIS-NN elementwise multiply kernel
+  arm_cmsis_nn_status status = arm_elementwise_mul_s8(
+      input1_int8.const_data_ptr<int8_t>(),
+      input2_int8.const_data_ptr<int8_t>(),
+      -static_cast<int32_t>(zp1),
+      -static_cast<int32_t>(zp2),
+      out.mutable_data_ptr<int8_t>(),
+      static_cast<int32_t>(out_zp),
+      output_mult,
+      output_shift_val,
+      kInt8ActivationMin,
+      kInt8ActivationMax,
+      static_cast<int32_t>(out.numel()));
+
+  if (status != ARM_CMSIS_NN_SUCCESS) {
+    ET_LOG(
+        Error,
+        "quantized_mul_out: arm_elementwise_mul_s8 failed with status [%d]",
+        status);
+    context.fail(Error::Internal);
+    return out;
+  }
+
+  return out;
+}

+} // namespace native
+} // namespace cortex_m
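Note that the zero points are passed negated: arm_elementwise_mul_s8 adds its offset arguments to the raw int8 inputs before multiplying. A minimal Python sketch of the per-element arithmetic (a hypothetical helper; it rounds in floating point rather than reproducing CMSIS-NN's bit-exact fixed-point path, which the requantize_cmsis change below models):

import torch

def mul_s8_reference(x1, x2, offset1, offset2, out_zp, mult, shift):
    # offset1/offset2 are the NEGATED input zero points, as in the call
    # above; mult is a Q31 multiplier and a positive shift means left shift.
    acc = (x1.to(torch.int64) + offset1) * (x2.to(torch.int64) + offset2)
    rescaled = torch.round(acc * mult * (2.0 ** shift) / 2 ** 31)
    return torch.clamp(rescaled.to(torch.int64) + out_zp, -128, 127).to(torch.int8)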

backends/cortex_m/ops/operators.py

Lines changed: 54 additions & 0 deletions

@@ -168,6 +168,60 @@ def quantized_add_impl(
     return result


+# ===================================================================
+# QUANTIZED MUL OPERATION DEFINITION
+# ===================================================================
+lib.define(
+    "quantized_mul("
+    "Tensor self, Scalar self_zero_point, "
+    "Tensor other, Scalar other_zero_point, "
+    "Scalar output_zero_point, Scalar output_multiplier, Scalar output_shift) -> Tensor"
+)
+lib.define(
+    "quantized_mul.out("
+    "Tensor self, Scalar self_zero_point, "
+    "Tensor other, Scalar other_zero_point, "
+    "Scalar output_zero_point, Scalar output_multiplier, Scalar output_shift, "
+    "*, Tensor(a!) out) -> Tensor(a!)"
+)
+
+
+@register_fake("cortex_m::quantized_mul")
+def quantized_mul_meta(
+    self: torch.Tensor,
+    self_zero_point: int,
+    other: torch.Tensor,
+    other_zero_point: int,
+    output_zero_point: int,
+    output_multiplier: int,
+    output_shift: int,
+) -> torch.Tensor:
+    # Broadcast to output shape
+    broadcasted_shape = torch.broadcast_shapes(self.shape, other.shape)
+    return torch.empty(broadcasted_shape, dtype=torch.int8, device=self.device)
+
+
+@impl(lib, "quantized_mul", "CompositeExplicitAutograd")
+def quantized_mul_impl(
+    self: torch.Tensor,
+    self_zero_point: int,
+    other: torch.Tensor,
+    other_zero_point: int,
+    output_zero_point: int,
+    output_multiplier: int,
+    output_shift: int,
+) -> torch.Tensor:
+    # CMSIS-NN kernel multiplies raw int8 tensors (after zero-point offset) and
+    # only uses the output multiplier/shift for rescaling. Mirror that here to
+    # keep the composite implementation numerically aligned with the backend.
+    self_int = self.to(torch.int32) - self_zero_point
+    other_int = other.to(torch.int32) - other_zero_point
+    result_fp = self_int * other_int
+    result_quantized = requantize_cmsis(result_fp, output_multiplier, output_shift)
+    result = torch.clamp(result_quantized + output_zero_point, -128, 127).to(torch.int8)
+    return result
+
+
 # ===================================================================
 # QUANTIZED LINEAR OPERATION DEFINITION
 # ===================================================================
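A quick smoke test of the composite path (a sketch: it assumes this module is importable as executorch.backends.cortex_m.ops.operators so the library above is registered, and uses multiplier 1 << 30, i.e. 0.5 in Q31, with zero shift and zero-valued zero points):

import torch
import executorch.backends.cortex_m.ops.operators  # noqa: F401  (registers cortex_m ops)

a = torch.tensor([10, -20], dtype=torch.int8)
b = torch.tensor([3, 4], dtype=torch.int8)
# (10 * 3) * 0.5 = 15 and (-20 * 4) * 0.5 = -40, so:
out = torch.ops.cortex_m.quantized_mul(a, 0, b, 0, 0, 1 << 30, 0)
print(out)  # expected: tensor([ 15, -40], dtype=torch.int8)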

backends/cortex_m/ops/operators.yaml

Lines changed: 7 additions & 1 deletion

@@ -23,8 +23,14 @@
     - arg_meta: null
       kernel_name: cortex_m::quantized_add_out

+- func: cortex_m::quantized_mul.out(Tensor self, Scalar self_zero_point, Tensor other, Scalar other_zero_point, Scalar output_zero_point, Scalar output_multiplier, Scalar output_shift, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: cortex_m::quantized_mul_out
+
 - func: cortex_m::quantized_linear.out(Tensor input, Tensor weights, Tensor? bias, Tensor? kernel_sum, Scalar input_offset, Scalar filter_offset, Scalar output_offset, int[] requantize_multipliers, int[] requantize_shifts, Scalar activation_max, Scalar activation_min, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null
-      kernel_name: cortex_m::quantized_linear_out
+      kernel_name: cortex_m::quantized_linear_out

backends/cortex_m/passes/cortex_m_pass_manager.py

Lines changed: 0 additions & 2 deletions

@@ -5,7 +5,6 @@


 from executorch.backends.arm._passes import (
-    DecorateFp32toInt32CastingPass,
     FoldAndAnnotateQParamsPass,
     ScalarsToAttributePass,
 )
@@ -29,7 +28,6 @@ class CortexMPassManager(XNNPACKPassManager):
         ReplaceQuantNodesPass,
         QuantizedOpFusionPass,
         QuantizedLinearFusionPass,
-        DecorateFp32toInt32CastingPass,
     ]

     pass_list_transform_for_annotation: list[ExportPass] = [

backends/cortex_m/passes/passes_utils.py

Lines changed: 26 additions & 8 deletions

@@ -50,14 +50,32 @@ def requantize_cmsis(
     multiplier: int,
     shift: int,
 ) -> torch.Tensor:
-    """
-    Simulate CMSIS-NN fixed-point requantization:
-    result = round(tensor * multiplier / (2 ^ shift))
-    with double rounding
-    """
-    multiplied = torch.round(tensor.to(torch.int64) * multiplier)
-    shifted = torch.round(multiplied / (2 ** (31 - shift)))
-    return shifted.to(torch.int32)
+    """Simulate CMSIS-NN's arm_nn_requantize helper."""
+
+    tensor_64 = tensor.to(torch.int64)
+    left_shift = max(shift, 0)
+    right_shift = max(-shift, 0)
+
+    # Equivalent to val * (1 << LEFT_SHIFT(shift))
+    value = tensor_64 << left_shift
+
+    # arm_nn_doubling_high_mult_no_sat(value, multiplier)
+    product = value * int(multiplier)
+    product = product + (1 << 30)
+    result = product >> 31
+
+    if right_shift:
+        remainder_mask = (1 << right_shift) - 1
+        remainder = torch.bitwise_and(result, remainder_mask)
+        result = result >> right_shift
+        threshold = remainder_mask >> 1
+        threshold_tensor = torch.full_like(result, threshold, dtype=torch.int64)
+        threshold_tensor = torch.where(
+            result < 0, threshold_tensor + 1, threshold_tensor
+        )
+        result = result + torch.where(remainder > threshold_tensor, 1, 0)
+
+    return result.to(torch.int32)


 def extract_scalar_value(node_arg) -> float:
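A worked example of the new rounding behaviour (a sketch, assuming the import path below): multiplier 1 << 30 encodes 0.5 in Q31, and a negative shift adds a rounding right shift on top of the Q31 scaling.

import torch
from executorch.backends.cortex_m.passes.passes_utils import requantize_cmsis

acc = torch.tensor([3000, -3000], dtype=torch.int64)
print(requantize_cmsis(acc, multiplier=1 << 30, shift=0))   # tensor([ 1500, -1500], dtype=torch.int32)
print(requantize_cmsis(acc, multiplier=1 << 30, shift=-1))  # tensor([ 750, -750], dtype=torch.int32)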

backends/cortex_m/passes/quantized_op_fusion_pass.py

Lines changed: 27 additions & 0 deletions

@@ -64,6 +64,31 @@ def _get_add_replacement(self, args, meta):

         return exir_ops.edge.cortex_m.quantized_add.default, args

+    def _get_mul_replacement(self, args, meta):
+
+        # Extract values
+        scale1 = meta["input_qparams"][0].scale
+        zero_point1 = meta["input_qparams"][0].zp
+        scale2 = meta["input_qparams"][1].scale
+        zero_point2 = meta["input_qparams"][1].zp
+        output_scale = meta["output_qparams"][0].scale
+        output_zero_point = meta["output_qparams"][0].zp
+
+        scale_factor = (scale1 * scale2) / output_scale
+        output_mult, output_shift = quantize_multiplier_aot(scale_factor)
+
+        args = (
+            args[0],
+            zero_point1,
+            args[1],
+            zero_point2,
+            output_zero_point,
+            output_mult,
+            output_shift,
+        )
+
+        return exir_ops.edge.cortex_m.quantized_mul.default, args
+
     def call_operator(
         self,
         op: EdgeOpOverload,
@@ -80,6 +105,8 @@ def call_operator(
         match op:
             case exir_ops.edge.aten.add.Tensor:
                 op, args = self._get_add_replacement(args, meta)
+            case exir_ops.edge.aten.mul.Tensor:
+                op, args = self._get_mul_replacement(args, meta)
             case _:
                 pass
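Here quantize_multiplier_aot factors the float scale_factor into the Q31 multiplier/shift pair consumed by requantize_cmsis and the CMSIS-NN kernel. A sketch of the standard decomposition, in the spirit of TFLite's QuantizeMultiplier (the backend's actual helper may differ in edge-case handling):

import math

def quantize_multiplier_sketch(scale: float) -> tuple[int, int]:
    # Hypothetical stand-in for quantize_multiplier_aot: write
    # scale = m * 2**shift with m in [0.5, 1), then encode m in Q31.
    if scale == 0.0:
        return 0, 0
    m, shift = math.frexp(scale)
    multiplier = round(m * (1 << 31))
    if multiplier == (1 << 31):  # rounding pushed m up to 1.0
        multiplier //= 2
        shift += 1
    return multiplier, shift  # positive shift = left shift

# e.g. quantize_multiplier_sketch(0.5) == (1 << 30, 0)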

backends/cortex_m/quantizer/operator_configs.py

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@
 # ----------------- OPERATOR PATTERN PRESETS -----------------
 BINARY_OP_PATTERNS = [
     [torch.ops.aten.add.Tensor],
+    [torch.ops.aten.mul.Tensor],
 ]

 LINEAR_OP_PATTERNS = [

backends/cortex_m/quantizer/quantizer.py

Lines changed: 2 additions & 3 deletions

@@ -6,13 +6,12 @@

 from typing import Callable, List, Optional

-import torch
-
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor

 from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig
 from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
 from executorch.backends.cortex_m.quantizer.operator_configs import (
+    BINARY_OP_PATTERNS,
     INT8_BINARY_OPS_OPERATOR_CONFIG,
     INT8_LINEAR_OPERATOR_CONFIG,
 )
@@ -37,7 +36,6 @@ def broadcasting_filter(self, node: Optional[Node]) -> bool:
         """
         if node is None:
             return False
-        if node.target not in [torch.ops.aten.add.Tensor]:
+        if [node.target] not in BINARY_OP_PATTERNS:
             return False

         if len(node.all_input_nodes) == 2:

backends/cortex_m/test/ops/test_mul.py

Lines changed: 27 additions & 12 deletions

@@ -4,7 +4,6 @@
 # LICENSE file in the root directory of this source tree.


-import pytest
 import torch
 from executorch.backends.arm.test.common import parametrize
 from executorch.backends.cortex_m.test.tester import (
@@ -60,6 +59,16 @@ class CortexMTensorMul(Model):
     }


+class CortexMTensorMulBroadCast(Model):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1,
+    }
+
+
 test_cases = {
     "self_scalar": McuTestCase(
         CortexMSelfMul(),
@@ -91,22 +100,22 @@ class CortexMTensorMul(Model):
     ),
     "tensor_scalar": McuTestCase(
         CortexMScalarMul(),
-        (torch.ones(2, 2), 1.0),
+        (torch.ones(1), 1.0),
     ),
     "scalar_tensor": McuTestCase(
         CortexMScalarMul(),
-        (1000.0, torch.ones(2, 2)),
+        (1000.0, torch.ones(1)),
     ),
     "broadcast_1": McuTestCase(
-        CortexMTensorMul(),
+        CortexMTensorMulBroadCast(),
         (torch.ones(1), torch.ones(2, 2, 2, 2)),
     ),
     "broadcast_2": McuTestCase(
-        CortexMTensorMul(),
+        CortexMTensorMulBroadCast(),
         (torch.ones((2, 1, 1, 1)), torch.ones(1)),
     ),
     "broadcast_3": McuTestCase(
-        CortexMTensorMul(),
+        CortexMTensorMulBroadCast(),
         (
             ramp_tensor(-2, 2, (2, 1, 2, 1)),
             ramp_tensor(-5, 5, (1, 2, 1, 2)),
@@ -115,17 +124,23 @@ class CortexMTensorMul(Model):
 }


-@pytest.mark.skip(reason="Not implemented yet")
-@parametrize("test_case", test_cases)
+xfail_cases = {
+    "self_scalar": "lift_constant_tensor_pass assumes fake tensors for scalars",
+    "scalar_scalar": "lift_constant_tensor_pass assumes fake tensors for scalars",
+}
+
+
+@parametrize("test_case", test_cases, xfails=xfail_cases)
 def test_dialect_mul(test_case):
     tester = CortexMTester(test_case.model, test_case.example_inputs)
     tester.test_dialect(
-        test_case.model.ops_before_transforms, test_case.model.ops_after_transforms
+        test_case.model.ops_before_transforms,
+        test_case.model.ops_after_transforms,
+        qtol=1,
     )


-@pytest.mark.skip(reason="Not implemented yet")
-@parametrize("test_case", test_cases)
+@parametrize("test_case", test_cases, xfails=xfail_cases)
 def test_implementation_mul(test_case):
     tester = CortexMTester(test_case.model, test_case.example_inputs)
-    tester.test_implementation()
+    tester.test_implementation(qtol=1)
