Commit ed288bc

[llama4] store expert weights such that we can transpose before grouped mm to have col-major memory layout (#1517)
# Summary

Rather than storing the expert weights pre-transposed as (E, in_dim, out_dim), we should store them non-transposed as (E, out_dim, in_dim) and then transpose before the grouped GEMM, for (1) compatible dims for the GEMM, and (2) the column-major memory layout required for the right operand of the grouped GEMM. This simple transpose (a metadata-only change) is much more efficient than doing this [inefficient memory layout transformation before every GEMM in fp8](https://github.com/pytorch/ao/blob/6e941c87c4d9fb9a74e6f979dd522605c696ca42/torchao/prototype/moe_training/scaled_grouped_mm.py#L96).

# Eager Performance

Llama4 debug model with FSDP=8, using config:

```python
"debugmodel": TransformerModelArgs(
    dim=5120,
    n_layers=4,
    n_heads=40,
    n_kv_heads=8,
    ffn_dim_multiplier=1.2,
    multiple_of=2048,
    rope_theta=500000,
    max_seq_len=10485760,
    num_experts=16,
    interleave_moe_layer_step=1,
),
```

### bfloat16

With change:

```
=====================================================
Calculating training performance metrics
=====================================================
Median Tokens/Second (excluding step 1): 2147.0
Max Memory Usage: 92.67 GiB
```

Without change:

```
=====================================================
Calculating training performance metrics
=====================================================
Median Tokens/Second (excluding step 1): 1711.0
Max Memory Usage: 92.67 GiB
```

### fp8 rowwise

With change:

```
(torchtitan) [[email protected] ~/ao/benchmarks/float8/training (metdata)]$ TORCHTITAN_ROOT=/home/danvm/torchtitan NGPU=8 EXTRA_ARGS="--model.converters="float8" --float8.recipe_name="rowwise" --float8.filter_fqns="output,auto_filter_small_kn" --float8.moe_fqns_prototype="experts"" ./llama4.sh
=====================================================
Calculating training performance metrics
=====================================================
Median Tokens/Second (excluding step 1): 2675.0
Max Memory Usage: 90.35 GiB
```

Without change:

```
(torchtitan) [[email protected] ~/ao/benchmarks/float8/training (metdata)]$ TORCHTITAN_ROOT=/home/danvm/torchtitan NGPU=8 EXTRA_ARGS="--model.converters="float8" --float8.recipe_name="rowwise" --float8.filter_fqns="output,auto_filter_small_kn" --float8.moe_fqns_prototype="experts"" ./llama4.sh
=====================================================
Calculating training performance metrics
=====================================================
Median Tokens/Second (excluding step 1): 2360.0
Max Memory Usage: 90.35 GiB
```
1 parent 004162a commit ed288bc
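For context, a minimal sketch (illustrative only, not part of the commit; the sizes are made up) of why the transpose is only a metadata change while still giving the column-major per-expert layout that the grouped GEMM needs for its right operand:

```python
import torch

E, in_dim, out_dim = 4, 128, 256

# Store the expert weight non-transposed, as the commit does: (E, out_dim, in_dim), row-major.
w = torch.randn(E, out_dim, in_dim)
assert w.is_contiguous()

# Transposing the last two dims only swaps strides; the underlying storage is untouched.
wt = w.transpose(-2, -1)                              # shape (E, in_dim, out_dim)
assert wt.data_ptr() == w.data_ptr()                  # no copy was made
assert wt.stride() == (out_dim * in_dim, 1, in_dim)   # column-major within each expert matrix
```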

File tree: 2 files changed (+36, -18 lines)

torchtitan/experiments/llama4/infra/expert_parallel.py

Lines changed: 16 additions & 6 deletions
@@ -54,16 +54,21 @@ def set_token_group_alignment_size_m(
 # implementation of Tensor Parallel for the GroupedExperts in MoE
 class TensorParallel(ParallelStyle):
     def _partition_fn(self, name, module, device_mesh):
+        # w1 shape = (experts, out_dim, in_dim)
         module.register_parameter(
-            "w1", nn.Parameter(distribute_tensor(module.w1, device_mesh, [Shard(2)]))
+            "w1", nn.Parameter(distribute_tensor(module.w1, device_mesh, [Shard(1)]))
         )  # Column-wise sharding
+
+        # w2 shape = (experts, in_dim, out_dim)
         module.register_parameter(
             "w2",
-            nn.Parameter(distribute_tensor(module.w2, device_mesh, [Shard(1)])),
+            nn.Parameter(distribute_tensor(module.w2, device_mesh, [Shard(2)])),
         )  # Row-wise sharding
+
+        # w3 shape = (experts, out_dim, in_dim)
         module.register_parameter(
             "w3",
-            nn.Parameter(distribute_tensor(module.w3, device_mesh, [Shard(2)])),
+            nn.Parameter(distribute_tensor(module.w3, device_mesh, [Shard(1)])),
         )  # Column-wise sharding
 
     def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
@@ -223,17 +228,22 @@ def _token_dispatch(self, mod, inputs, device_mesh):
         return super()._token_dispatch(mod, inputs, self.ep_mesh)
 
     def _partition_fn_2d(self, name, mod, ep_tp_mesh):
+        # w1 shape = (experts, out_dim, in_dim)
         mod.register_parameter(
             "w1",
-            nn.Parameter(distribute_tensor(mod.w1, ep_tp_mesh, [Shard(0), Shard(2)])),
+            nn.Parameter(distribute_tensor(mod.w1, ep_tp_mesh, [Shard(0), Shard(1)])),
         )  # Column-wise sharding
+
+        # w2 shape = (experts, in_dim, out_dim)
         mod.register_parameter(
             "w2",
-            nn.Parameter(distribute_tensor(mod.w2, ep_tp_mesh, [Shard(0), Shard(1)])),
+            nn.Parameter(distribute_tensor(mod.w2, ep_tp_mesh, [Shard(0), Shard(2)])),
         )  # Row-wise sharding
+
+        # w3 shape = (experts, out_dim, in_dim)
         mod.register_parameter(
             "w3",
-            nn.Parameter(distribute_tensor(mod.w3, ep_tp_mesh, [Shard(0), Shard(2)])),
+            nn.Parameter(distribute_tensor(mod.w3, ep_tp_mesh, [Shard(0), Shard(1)])),
         )  # Column-wise sharding
 
     def _token_combine(self, mod, routed_output, device_mesh):
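As an illustration only (plain single-process tensors standing in for `distribute_tensor`, with made-up sizes): a sketch of why, under the new (experts, out_dim, in_dim) storage, `Shard(1)` on w1/w3 is still column-wise tensor parallelism and `Shard(2)` on w2 is still row-wise tensor parallelism.

```python
import torch

tokens, dim, hidden_dim, tp_degree = 8, 16, 32, 2

x = torch.randn(tokens, dim)
w1 = torch.randn(hidden_dim, dim)   # one expert's w1, stored (out_dim, in_dim)
w2 = torch.randn(dim, hidden_dim)   # one expert's w2, stored (in_dim=dim, out_dim=hidden_dim)

# Column-wise sharding of w1: Shard(1) on (experts, hidden_dim, dim) splits hidden_dim,
# i.e. dim 0 of the per-expert matrix. Each rank computes a disjoint slice of h.
h = x @ w1.t()
h_shards = [x @ w1_shard.t() for w1_shard in w1.chunk(tp_degree, dim=0)]
assert torch.allclose(h, torch.cat(h_shards, dim=-1))

# Row-wise sharding of w2: Shard(2) on (experts, dim, hidden_dim) splits hidden_dim,
# the contraction dim of h @ w2.T. Ranks produce partial outputs that sum to the full result.
out = h @ w2.t()
partials = [
    h_shard @ w2_shard.t()
    for h_shard, w2_shard in zip(h.chunk(tp_degree, dim=-1), w2.chunk(tp_degree, dim=1))
]
assert torch.allclose(out, sum(partials), atol=1e-5)
```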

torchtitan/experiments/llama4/model/moe.py

Lines changed: 20 additions & 12 deletions
@@ -23,9 +23,9 @@ def __init__(
     ):
         super().__init__()
         self.num_experts = num_experts
-        self.w1 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
-        self.w2 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
-        self.w3 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
+        self.w1 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
+        self.w2 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
+        self.w3 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
         self.use_grouped_mm = use_grouped_mm
 
     def forward(
@@ -69,9 +69,9 @@ def _run_experts_for_loop(
         )
         out_experts_splits = []
         for expert_idx, x_expert in enumerate(x):
-            h = F.silu(torch.matmul(x_expert, w1[expert_idx]))
-            h = h * torch.matmul(x_expert, w3[expert_idx])
-            h = torch.matmul(h, w2[expert_idx])
+            h = F.silu(torch.matmul(x_expert, w1[expert_idx].transpose(-2, -1)))
+            h = h * torch.matmul(x_expert, w3[expert_idx].transpose(-2, -1))
+            h = torch.matmul(h, w2[expert_idx].transpose(-2, -1))
             # h shape (tokens_per_expert(varying), dim)
             out_experts_splits.append(h)
         out = torch.cat(out_experts_splits, dim=0)
@@ -80,10 +80,10 @@ def _run_experts_for_loop(
             out = torch.vstack((out, out.new_zeros((num_padding, out.shape[-1]))))
     else:
         # x shape (num_experts, tokens_per_expert, dim)
-        h = F.silu(torch.bmm(x, w1))
-        h = h * torch.bmm(x, w3)
+        h = F.silu(torch.bmm(x, w1.transpose(-2, -1)))
+        h = h * torch.bmm(x, w3.transpose(-2, -1))
         # out shape (num_experts, tokens_per_expert, dim)
-        out = torch.bmm(h, w2)
+        out = torch.bmm(h, w2.transpose(-2, -1))
 
     return out
 
@@ -105,9 +105,17 @@ def _run_experts_grouped_mm(
         # fall back to regular bmm between 3D tensors
         assert x.dim() == 3
 
-    h = F.silu(torch._grouped_mm(x.bfloat16(), w1.bfloat16(), offs=offsets))
-    h = h * torch._grouped_mm(x.bfloat16(), w3.bfloat16(), offs=offsets)
-    out = torch._grouped_mm(h, w2.bfloat16(), offs=offsets).type_as(x)
+    h = F.silu(
+        torch._grouped_mm(
+            x.bfloat16(), w1.bfloat16().transpose(-2, -1), offs=offsets
+        )
+    )
+    h = h * torch._grouped_mm(
+        x.bfloat16(), w3.bfloat16().transpose(-2, -1), offs=offsets
+    )
+    out = torch._grouped_mm(
+        h, w2.bfloat16().transpose(-2, -1), offs=offsets
+    ).type_as(x)
 
     return out
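A small sanity sketch (hypothetical sizes, bmm path only, not from the repo) showing that the new non-transposed storage plus `.transpose(-2, -1)` computes the same SwiGLU expert MLP as the old pre-transposed storage:

```python
import torch
import torch.nn.functional as F

num_experts, tokens_per_expert, dim, hidden_dim = 2, 4, 8, 16

x = torch.randn(num_experts, tokens_per_expert, dim)

# Old storage: pre-transposed (experts, dim, hidden_dim) / (experts, hidden_dim, dim).
w1_old = torch.randn(num_experts, dim, hidden_dim)
w2_old = torch.randn(num_experts, hidden_dim, dim)
w3_old = torch.randn(num_experts, dim, hidden_dim)

# New storage: non-transposed; the same values laid out as (experts, out_dim, in_dim).
w1_new = w1_old.transpose(-2, -1).contiguous()  # (experts, hidden_dim, dim)
w2_new = w2_old.transpose(-2, -1).contiguous()  # (experts, dim, hidden_dim)
w3_new = w3_old.transpose(-2, -1).contiguous()  # (experts, hidden_dim, dim)

def swiglu_old(x, w1, w2, w3):
    h = F.silu(torch.bmm(x, w1)) * torch.bmm(x, w3)
    return torch.bmm(h, w2)

def swiglu_new(x, w1, w2, w3):
    # Transpose is a view; bmm then sees the weights in the shapes the math expects.
    h = F.silu(torch.bmm(x, w1.transpose(-2, -1))) * torch.bmm(x, w3.transpose(-2, -1))
    return torch.bmm(h, w2.transpose(-2, -1))

assert torch.allclose(
    swiglu_old(x, w1_old, w2_old, w3_old),
    swiglu_new(x, w1_new, w2_new, w3_new),
    atol=1e-5,
)
```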
