Skip to content

Commit c2be5db

Browse files
author
Github Executorch
committed
Title:
[cortex_m] Fix linear weight layout: transpose in AOT pass, align meta/ref impl. Summary: The linear path in ConvertToCortexMPass was not transposing weights (unlike conv2d), causing an inconsistency with the C++ runtime, which expects weights in [in_features, out_features] format per CMSIS-NN. Changes: - convert_to_cortex_m_pass.py: Transpose linear weights [out, in] -> [in, out]. - operators.py: Update the meta function to use weights.shape[1] for the output dimension. - operators.py: Remove .T from the reference implementation (weights are pre-transposed by the pass). Fixes the MV2 output shape mismatch: [1, 1280] -> [1, 1000] (verified with MV2 on Corstone-300/E8 with CMSIS-NN kernels). This fix ensures the AOT-compiled .pte file has correctly shaped output tensors for any model using quantized_linear (MV2, ResNet, MV3, etc.).
1 parent 5b3d9fc commit c2be5db

File tree

2 files changed

+17
-4
lines changed

2 files changed

+17
-4
lines changed

backends/cortex_m/ops/operators.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ def quantized_linear_meta(
352352
activation_min,
353353
) -> torch.Tensor:
354354

355-
shape = (*input.shape[:-1], weights.shape[0])
355+
shape = (*input.shape[:-1], weights.shape[1])
356356
return torch.empty(shape, dtype=input.dtype, device=input.device)
357357

358358

@@ -386,7 +386,7 @@ def quantized_linear_impl(
386386
input_reshaped = input_int32.reshape(new_shape)
387387

388388
lhs_sum = torch.sum(input_reshaped, dim=-1, keepdim=True) * filter_offset
389-
output = torch.mm(input_reshaped, weights_int32.T) + lhs_sum + kernel_sum
389+
output = torch.mm(input_reshaped, weights_int32) + lhs_sum + kernel_sum
390390
output_shape = (*input.shape[:-1], output.shape[-1])
391391
output_reshaped = output.reshape(output_shape)
392392
else:
@@ -396,7 +396,7 @@ def quantized_linear_impl(
396396
new_shape = (prod(input.shape[:-1]), input.shape[-1])
397397
input_reshaped = input_int32.reshape(new_shape)
398398

399-
output = torch.mm(input_reshaped, weights_int32.T)
399+
output = torch.mm(input_reshaped, weights_int32)
400400
if bias is not None:
401401
output = output + bias
402402
output_shape = (*input.shape[:-1], output.shape[-1])

backends/cortex_m/passes/convert_to_cortex_m_pass.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,11 @@ def _get_linear_replacement(self, node):
113113
kernel_sum_tensor = self._compute_kernel_sum(
114114
weights_tensor, bias_tensor, -input_zp, -weight_zp
115115
)
116+
117+
# Transpose weights from PyTorch format [out_features, in_features]
118+
# to CMSIS-NN format [in_features, out_features]
119+
weights_transposed = weights_tensor.T.contiguous()
120+
116121
with node.graph.inserting_after(weights):
117122
kernel_sum = create_constant_placeholder(
118123
self.exported_program,
@@ -122,9 +127,17 @@ def _get_linear_replacement(self, node):
122127
kernel_sum_tensor,
123128
)
124129

130+
weights_transposed_node = create_constant_placeholder(
131+
self.exported_program,
132+
node.graph,
133+
node.name + "_weights_transposed",
134+
InputKind.PARAMETER,
135+
weights_transposed,
136+
)
137+
125138
args = (
126139
node.args[0],
127-
weights,
140+
weights_transposed_node,
128141
None,
129142
kernel_sum,
130143
-input_zp,

0 commit comments

Comments (0)