
Commit b757fb9

[MoE training] Assert expert weights are column-major; preserve subclass with transpose (#2663)
* assert B is col-major
* preserve subclass with transpose
Parent: 6bb2baf

File tree: 2 files changed, +2 −4 lines changed


torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 1 addition & 4 deletions

@@ -95,10 +95,7 @@ def forward(
         assert not _is_column_major(A), "A must be row-major"

         # Due to hardware requirements, the right operand in a scaled grouped GEMM must be column-major.
-        if not _is_column_major(B_t):
-            # FSDP will complain if B_t (weights) is not contiguous, we can't require B_t to be column-major.
-            # TODO: figure out better solution than transposing for each forward pass.
-            B_t = B_t.transpose(-2, -1).contiguous().transpose(-2, -1)
+        assert _is_column_major(B_t), "B must be column-major"

         # Convert high precision input tensor to float8, row-major for left operand of grouped GEMM.
         # A shape: (M, K) or (B, M, K)
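For context on what this hunk changes: the removed fallback silently re-laid out B_t into column-major storage on every forward pass; the new assert pushes that cost onto the caller instead. A minimal sketch of how a stride-based layout check and the removed transpose-contiguous-transpose round trip behave (the helper name and stride test here are assumptions, not necessarily torchao's exact _is_column_major):

import torch

def is_column_major(x: torch.Tensor) -> bool:
    # Column-major in the last two dims: elements within a column are
    # contiguous, i.e. the stride of the second-to-last dim is 1.
    return x.stride(-2) == 1

w = torch.randn(64, 32)  # row-major: strides (32, 1)
assert not is_column_major(w)

# The pattern removed by this commit: transpose -> contiguous -> transpose
# keeps the logical shape (64, 32) but re-lays out storage column-major,
# giving strides (1, 64).
w_cm = w.transpose(-2, -1).contiguous().transpose(-2, -1)
assert is_column_major(w_cm)
assert torch.equal(w, w_cm)

With the fallback gone, expert weights must already be materialized column-major once up front rather than re-copied inside the hot path of each scaled grouped GEMM.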

torchao/prototype/moe_training/tensor.py

Lines changed: 1 addition & 0 deletions

@@ -30,6 +30,7 @@
     torch.ops.aten._pin_memory.default,
     torch.ops.aten.split.Tensor,
     torch.ops.aten.clone.default,
+    torch.ops.aten.transpose.int,
 }

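This set is the allowlist of ATen ops for which the MoE training tensor subclass in this file re-wraps outputs; adding aten.transpose.int is what lets a transposed expert weight stay a subclass instance instead of decaying to a plain tensor. A minimal sketch of the dispatch pattern, assuming a wrapper subclass: the names WrapperTensor and _data are illustrative, not torchao's implementation.

import torch
from torch.utils._pytree import tree_map_only

_ops_to_preserve_subclass = {
    torch.ops.aten.detach.default,
    torch.ops.aten.clone.default,
    torch.ops.aten.transpose.int,  # the overload this commit adds
}

class WrapperTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, data: torch.Tensor):
        return torch.Tensor._make_wrapper_subclass(
            cls,
            data.shape,
            strides=data.stride(),
            dtype=data.dtype,
            device=data.device,
            requires_grad=data.requires_grad,
        )

    def __init__(self, data: torch.Tensor):
        self._data = data

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        # Run the op on the plain inner tensors.
        args = tree_map_only(WrapperTensor, lambda t: t._data, args)
        kwargs = tree_map_only(WrapperTensor, lambda t: t._data, kwargs)
        out = func(*args, **kwargs)
        # Only allowlisted ops keep the subclass on their outputs;
        # everything else falls back to plain tensors.
        if func in _ops_to_preserve_subclass:
            return tree_map_only(torch.Tensor, WrapperTensor, out)
        return out

w = WrapperTensor(torch.randn(16, 8))
w_t = w.transpose(-2, -1)  # dispatches aten.transpose.int
assert isinstance(w_t, WrapperTensor)

Without transpose.int in the set, w.transpose(-2, -1) would return a plain torch.Tensor, and the subclass behavior would be lost after the transpose.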
