Skip to content

Commit a7a30da

Browse files
author
GitHub Executorch
committed
Add Cortex-M as a first-class target in aot_arm_compiler
Previously, Cortex-M op conversion was applied as an afterthought to all non-vgf targets via transform_for_cortex_m_backend(). This made the flow hard to follow, used a bare EdgeCompileConfig that decomposed ops like linear into addmm (requiring unnecessary workarounds), and didn't use the CortexMQuantizer or CortexMPassManager. Add a dedicated to_edge_cortex_m() path, selected via --target=cortex-m, that owns the full pipeline: CortexMQuantizer for INT8 quantization, a correct EdgeCompileConfig with preserve_ops to prevent premature decomposition, and CortexMPassManager.pass_list for op conversion. Remove the old scattered transform_for_cortex_m_backend() function. Verified that all ops are fully lowered to cortex_m::quantized_* operators for both MobileNetV2 (70 nodes) and MobileNetV3 (122 nodes). E2E inference tested on an Alif E8 board. Test Plan: (1) python3 -m examples.arm.aot_arm_compiler -m mv2 --target=cortex-m --quantize --intermediates=./mv2_intermediates --output=./mv2_cortex_m.pte; (2) python3 -m examples.arm.aot_arm_compiler -m mv3 --target=cortex-m --quantize --intermediates=./mv3_intermediates --output=./mv3_cortex_m.pte. Also ran E2E inference on an Alif E8 board.
1 parent f48a600 commit a7a30da

File tree

2 files changed

+88
-40
lines changed

2 files changed

+88
-40
lines changed

backends/cortex_m/passes/convert_to_cortex_m_pass.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def _get_batch_size_from_conv(self, conv_node: torch.fx.Node):
7171

7272
def _get_linear_replacement(self, node):
7373
"""
74-
Let
74+
Let
7575
- yi be the output activations (y1, ... yn)
7676
- xj be the input activations (x1, ... xm)
7777
- wij be the weights (w11, ... wnm)

examples/arm/aot_arm_compiler.py

Lines changed: 87 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import argparse
1111
import copy
12+
import inspect
1213
import logging
1314
import os
1415
import sys
@@ -36,19 +37,8 @@
3637
from executorch.backends.arm.util._factory import create_partitioner, create_quantizer
3738

3839
from executorch.backends.arm.vgf import VgfCompileSpec
39-
40-
# To use Cortex-M backend
41-
from executorch.backends.cortex_m.passes.convert_to_cortex_m_pass import (
42-
ConvertToCortexMPass,
43-
)
44-
45-
from executorch.backends.cortex_m.passes.quantized_op_fusion_pass import (
46-
QuantizedOpFusionPass,
47-
)
48-
49-
from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import (
50-
ReplaceQuantNodesPass,
51-
)
40+
from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
41+
from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
5242

5343
from executorch.devtools import generate_etrecord
5444
from executorch.devtools.backend_debug import get_delegation_info
@@ -399,6 +389,7 @@ def forward(self, x):
399389
"TOSA-1.0+INT",
400390
"TOSA-1.0+FP",
401391
"TOSA-1.0+INT+int16",
392+
"cortex-m",
402393
]
403394

404395

@@ -795,6 +786,83 @@ def to_edge_TOSA_delegate(
795786
return model_quant, edge
796787

797788

789+
def to_edge_cortex_m(
790+
exported_program: ExportedProgram,
791+
args,
792+
model: GraphModule,
793+
example_inputs: Tuple[torch.Tensor],
794+
):
795+
"""Cortex-M/CMSIS-NN compilation path with no delegation."""
796+
logging.info("Using Cortex-M/CMSIS-NN compilation path (no delegation)")
797+
798+
model_quant = None
799+
800+
if args.quantize:
801+
model_channels_last = model.to(memory_format=torch.channels_last)
802+
example_inputs_cl = tuple(
803+
x.to(memory_format=torch.channels_last) if x.dim() == 4 else x
804+
for x in example_inputs
805+
)
806+
807+
quantizer = CortexMQuantizer()
808+
prepared = prepare_pt2e(model_channels_last, quantizer)
809+
810+
dataset = get_calibration_data(
811+
args.model_name, example_inputs_cl, args.evaluate, args.evaluate_config
812+
)
813+
814+
if isinstance(dataset, DataLoader):
815+
for sample, _ in dataset:
816+
if isinstance(sample, torch.Tensor) and sample.dim() == 4:
817+
sample = sample.to(memory_format=torch.channels_last)
818+
prepared(sample)
819+
else:
820+
dataset_cl = tuple(
821+
(
822+
x.to(memory_format=torch.channels_last)
823+
if isinstance(x, torch.Tensor) and x.dim() == 4
824+
else x
825+
)
826+
for x in dataset
827+
)
828+
prepared(*dataset_cl)
829+
830+
model_quant = convert_pt2e(prepared)
831+
832+
exported_program = torch.export.export(
833+
model_quant, example_inputs_cl, strict=args.strict_export
834+
)
835+
else:
836+
logging.warning(
837+
"Quantization is DISABLED. Cortex-M typically requires quantization."
838+
)
839+
840+
edge = to_edge_transform_and_lower(
841+
exported_program,
842+
compile_config=EdgeCompileConfig(
843+
preserve_ops=[
844+
torch.ops.aten.linear.default,
845+
torch.ops.aten.hardsigmoid.default,
846+
torch.ops.aten.hardsigmoid_.default,
847+
torch.ops.aten.hardswish.default,
848+
torch.ops.aten.hardswish_.default,
849+
],
850+
_check_ir_validity=False,
851+
),
852+
)
853+
854+
pass_instances = []
855+
for pass_cls in CortexMPassManager.pass_list:
856+
sig = inspect.signature(pass_cls.__init__)
857+
if "exported_program" in sig.parameters:
858+
pass_instances.append(pass_cls(edge.exported_program()))
859+
else:
860+
pass_instances.append(pass_cls())
861+
edge = edge.transform(pass_instances)
862+
863+
return model_quant, edge
864+
865+
798866
def to_edge_no_delegate(
799867
exported_program: ExportedProgram,
800868
args,
@@ -830,26 +898,6 @@ def to_edge_no_delegate(
830898
return model_quant, edge
831899

832900

833-
def transform_for_cortex_m_backend(edge_program_manager, args):
834-
# Let's make sure we are using optimized Cortex M backend
835-
# NB: If we can't find and replace ops those are expected to be replaced,
836-
# bad things will happen at runtime, like "missing operator" errors!
837-
838-
# Instantiate the mandatory ReplaceQuantNodesPass
839-
passes = [ReplaceQuantNodesPass]
840-
if args.enable_qdq_fusion_pass:
841-
passes += [ConvertToCortexMPass, QuantizedOpFusionPass]
842-
current_edge = edge_program_manager
843-
for pass_cls in passes:
844-
transform_pass = (
845-
pass_cls(current_edge.exported_program())
846-
if pass_cls.__name__ == "QuantizedLinearFusionPass"
847-
else pass_cls()
848-
)
849-
current_edge = current_edge.transform([transform_pass])
850-
return current_edge
851-
852-
853901
if __name__ == "__main__": # noqa: C901
854902
args = get_args()
855903

@@ -881,7 +929,12 @@ def transform_for_cortex_m_backend(edge_program_manager, args):
881929

882930
# Quantize if required
883931
model_quant = None
884-
if args.delegate:
932+
if args.target == "cortex-m":
933+
# Cortex-M path: CMSIS-NN portable kernels, no delegation
934+
model_quant, edge = to_edge_cortex_m(
935+
exported_program, args, model, example_inputs
936+
)
937+
elif args.delegate:
885938
model_quant, edge = to_edge_TOSA_delegate(
886939
exported_program, args, model, example_inputs
887940
)
@@ -890,11 +943,6 @@ def transform_for_cortex_m_backend(edge_program_manager, args):
890943
exported_program, args, model, example_inputs
891944
)
892945

893-
# Cortex-m ops are never included in vgf or direct-drive
894-
if args.target != "vgf" and not args.direct_drive:
895-
# Transform so we can use ops from the Cortex M backend
896-
edge = transform_for_cortex_m_backend(edge, args)
897-
898946
dump_delegation_info(edge, args.intermediates)
899947

900948
edge_program_manager_copy = copy.deepcopy(edge)

0 commit comments

Comments
 (0)