Skip to content

Commit 105498e

Browse files
Github Executorchpsiddh
authored and committed
Add Cortex-M as a first-class target in aot_arm_compiler
Previously, Cortex-M op conversion was applied as an afterthought to all non-vgf targets via transform_for_cortex_m_backend(). This made the flow hard to follow, used a bare EdgeCompileConfig that decomposed ops like linear into addmm (requiring unnecessary workarounds), and didn't use the CortexMQuantizer or CortexMPassManager. Add a dedicated to_edge_cortex_m() path selected via --target=cortex-m55+int8 that owns the full pipeline: CortexMQuantizer for INT8 quantization, correct EdgeCompileConfig with preserve_ops to prevent premature decomposition, and CortexMPassManager.pass_list for op conversion. Remove the old scattered transform_for_cortex_m_backend() function. Verified all ops fully lowered to cortex_m::quantized_* operators for both MobileNetV2 (70 nodes) and MobileNetV3 (122 nodes). E2E inference tested on Alif E8 board. Test Plan: python3 -m examples.arm.aot_arm_compiler -m mv2 --target=cortex-m55+int8 --quantize --intermediates=./mv2_intermediates --output=./mv2_cortex_m.pte python3 -m examples.arm.aot_arm_compiler -m mv3 --target=cortex-m55+int8 --quantize --intermediates=./mv3_intermediates --output=./mv3_cortex_m.pte Also ran E2E inference on Alif E8 board
1 parent f30d5ed commit 105498e

File tree

2 files changed

+80
-41
lines changed

2 files changed

+80
-41
lines changed

backends/cortex_m/passes/convert_to_cortex_m_pass.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def _get_batch_size_from_conv(self, conv_node: torch.fx.Node):
7171

7272
def _get_linear_replacement(self, node):
7373
"""
74-
Let
74+
Let
7575
- yi be the output activations (y1, ... yn)
7676
- xj be the input activations (x1, ... xm)
7777
- wij be the weights (w11, ... wnm)

examples/arm/aot_arm_compiler.py

Lines changed: 79 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,8 @@
3636
from executorch.backends.arm.util._factory import create_partitioner, create_quantizer
3737

3838
from executorch.backends.arm.vgf import VgfCompileSpec
39-
40-
# To use Cortex-M backend
41-
from executorch.backends.cortex_m.passes.convert_to_cortex_m_pass import (
42-
ConvertToCortexMPass,
43-
)
44-
45-
from executorch.backends.cortex_m.passes.quantized_op_fusion_pass import (
46-
QuantizedOpFusionPass,
47-
)
48-
49-
from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import (
50-
ReplaceQuantNodesPass,
51-
)
39+
from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
40+
from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
5241

5342
from executorch.devtools import generate_etrecord
5443
from executorch.devtools.backend_debug import get_delegation_info
@@ -396,6 +385,7 @@ def forward(self, x):
396385
"TOSA-1.0+INT",
397386
"TOSA-1.0+FP",
398387
"TOSA-1.0+INT+int16",
388+
"cortex-m55+int8",
399389
]
400390

401391

@@ -528,7 +518,7 @@ def get_args():
528518
required=False,
529519
default="ethos-u55-128",
530520
choices=TARGETS,
531-
help=f"For ArmBackend delegated models, pick the target, and therefore the instruction set generated. valid targets are {TARGETS}",
521+
help=f"Target backend. For delegated models: Ethos-U/VGF/TOSA variants. For non-delegated: cortex-m55+int8 (CMSIS-NN portable kernels). Valid targets: {TARGETS}",
532522
)
533523
parser.add_argument(
534524
"-e",
@@ -790,6 +780,75 @@ def to_edge_TOSA_delegate(
790780
return model_quant, edge
791781

792782

783+
def to_edge_cortex_m(
784+
exported_program: ExportedProgram,
785+
args,
786+
model: GraphModule,
787+
example_inputs: Tuple[torch.Tensor],
788+
):
789+
"""Cortex-M/CMSIS-NN compilation path with no delegation."""
790+
logging.info("Using Cortex-M/CMSIS-NN compilation path (no delegation)")
791+
792+
def _to_channels_last(x):
793+
if isinstance(x, torch.Tensor):
794+
if x.dim() == 4 and not x.is_contiguous(memory_format=torch.channels_last):
795+
logging.warning(
796+
"Converting input tensor with shape %s to channels_last",
797+
list(x.shape),
798+
)
799+
return x.to(memory_format=torch.channels_last)
800+
return x
801+
elif isinstance(x, tuple):
802+
return tuple(_to_channels_last(t) for t in x)
803+
return x
804+
805+
if not args.quantize:
806+
logging.warning(
807+
"Quantization is DISABLED. Cortex-M typically requires quantization."
808+
)
809+
else:
810+
model = model.to(memory_format=torch.channels_last)
811+
example_inputs = tuple(_to_channels_last(x) for x in example_inputs)
812+
813+
quantizer = CortexMQuantizer()
814+
prepared = prepare_pt2e(model, quantizer)
815+
816+
dataset = get_calibration_data(
817+
args.model_name, example_inputs, args.evaluate, args.evaluate_config
818+
)
819+
820+
if isinstance(dataset, DataLoader):
821+
for sample, _ in dataset:
822+
prepared(_to_channels_last(sample))
823+
else:
824+
prepared(*tuple(_to_channels_last(x) for x in dataset))
825+
826+
model_quant = convert_pt2e(prepared)
827+
828+
exported_program = torch.export.export(
829+
model_quant, example_inputs, strict=args.strict_export
830+
)
831+
832+
edge = to_edge_transform_and_lower(
833+
exported_program,
834+
compile_config=EdgeCompileConfig(
835+
preserve_ops=[
836+
torch.ops.aten.linear.default,
837+
torch.ops.aten.hardsigmoid.default,
838+
torch.ops.aten.hardsigmoid_.default,
839+
torch.ops.aten.hardswish.default,
840+
torch.ops.aten.hardswish_.default,
841+
],
842+
_check_ir_validity=False,
843+
),
844+
)
845+
846+
pass_manager = CortexMPassManager(edge.exported_program())
847+
edge._edge_programs["forward"] = pass_manager.transform()
848+
849+
return model_quant if args.quantize else None, edge
850+
851+
793852
def to_edge_no_delegate(
794853
exported_program: ExportedProgram,
795854
args,
@@ -825,26 +884,6 @@ def to_edge_no_delegate(
825884
return model_quant, edge
826885

827886

828-
def transform_for_cortex_m_backend(edge_program_manager, args):
829-
# Let's make sure we are using optimized Cortex M backend
830-
# NB: If we can't find and replace ops those are expected to be replaced,
831-
# bad things will happen at runtime, like "missing operator" errors!
832-
833-
# Instantiate the mandatory ReplaceQuantNodesPass
834-
passes = [ReplaceQuantNodesPass]
835-
if args.enable_qdq_fusion_pass:
836-
passes += [ConvertToCortexMPass, QuantizedOpFusionPass]
837-
current_edge = edge_program_manager
838-
for pass_cls in passes:
839-
transform_pass = (
840-
pass_cls(current_edge.exported_program())
841-
if pass_cls.__name__ == "QuantizedLinearFusionPass"
842-
else pass_cls()
843-
)
844-
current_edge = current_edge.transform([transform_pass])
845-
return current_edge
846-
847-
848887
if __name__ == "__main__": # noqa: C901
849888
args = get_args()
850889

@@ -876,7 +915,12 @@ def transform_for_cortex_m_backend(edge_program_manager, args):
876915

877916
# Quantize if required
878917
model_quant = None
879-
if args.delegate:
918+
if args.target == "cortex-m55+int8":
919+
# Cortex-M path: CMSIS-NN portable kernels, no delegation
920+
model_quant, edge = to_edge_cortex_m(
921+
exported_program, args, model, example_inputs
922+
)
923+
elif args.delegate:
880924
model_quant, edge = to_edge_TOSA_delegate(
881925
exported_program, args, model, example_inputs
882926
)
@@ -885,11 +929,6 @@ def transform_for_cortex_m_backend(edge_program_manager, args):
885929
exported_program, args, model, example_inputs
886930
)
887931

888-
# Cortex-m ops are never included in vgf or direct-drive
889-
if args.target != "vgf" and not args.direct_drive:
890-
# Transform so we can use ops from the Cortex M backend
891-
edge = transform_for_cortex_m_backend(edge, args)
892-
893932
dump_delegation_info(edge, args.intermediates)
894933

895934
edge_program_manager_copy = copy.deepcopy(edge)

0 commit comments

Comments
 (0)