test/prototype/moe_training (1 file changed, +8 -0 lines changed)
```diff
@@ -20,5 +20,8 @@
 # this test requires torchtitan
 try:
+    from torchtitan.experiments.llama4.infra.expert_parallel import (
+        set_token_group_alignment_size_m,
+    )
     from torchtitan.experiments.llama4.model.args import TransformerModelArgs
     from torchtitan.experiments.llama4.model.moe import MoE
 except ImportError:
@@ -36,6 +39,11 @@
 )
 @pytest.mark.parametrize("compile", [False, True])
 def test_moe_float8_training(target_fqns: list[str], compile: bool):
+    # Set token group alignment size to 16. This is required so that
+    # each logically distinct gemm in the grouped gemm `grad_weight = grad_output_t @ input`
+    # has the contraction dim be divisible by 16. 16 byte alignment is required
+    # for the slowest moving dim (stride 1), so 16 bytes / 1 byte per element in fp8 = 16 elements.
+    set_token_group_alignment_size_m(16)
     model_args = TransformerModelArgs(
         moe_enabled=True,
         num_experts=8,
```
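The arithmetic in the new comment can be sanity-checked with a short sketch. The helpers below (`alignment_elems`, `round_up`) are illustrative only and are not part of torchtitan or torchao; they just show why a 16-byte alignment requirement on the stride-1 dim translates to token groups aligned to multiples of 16 elements in fp8 (1 byte per element), versus 8 elements in bf16 (2 bytes per element).

```python
# Illustrative only: these helpers are not part of torchtitan or torchao.
ALIGNMENT_BYTES = 16  # byte alignment required for the stride-1 dim of each grouped gemm operand


def alignment_elems(dtype_bytes: int, alignment_bytes: int = ALIGNMENT_BYTES) -> int:
    """Minimum token group multiple so the contraction dim meets the byte alignment."""
    return alignment_bytes // dtype_bytes


def round_up(n: int, multiple: int) -> int:
    """Round a token group size up to the next multiple of `multiple`."""
    return ((n + multiple - 1) // multiple) * multiple


# fp8 is 1 byte per element: 16 bytes / 1 byte = 16 elements, matching
# set_token_group_alignment_size_m(16) in the test above.
assert alignment_elems(dtype_bytes=1) == 16
# bf16 is 2 bytes per element, so the same requirement only needs multiples of 8.
assert alignment_elems(dtype_bytes=2) == 8

# Uneven per-expert token group sizes rounded up to the fp8 alignment multiple.
group_sizes = [5, 17, 32]
print([round_up(g, alignment_elems(dtype_bytes=1)) for g in group_sizes])  # [16, 32, 32]
```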