Commit 7dbc816

[MoE training] torch.compile support for ScaledGroupedMMTensor (#2509)
Parent: 0935f66

File tree: 4 files changed, +24 −13 lines

test/prototype/moe_training/test_scaled_grouped_mm.py

8 additions, 9 deletions

```diff
@@ -8,24 +8,18 @@
 import torch
 from torch.nn import functional as F
 
-pytest.importorskip("triton", reason="Triton required to run this test")
-
-from torchao.prototype.moe_training.utils import (
-    _to_mxfp8_per_group_colwise,
-    _to_mxfp8_per_group_rowwise,
-    generate_jagged_offs,
-)
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_7
 
 # We need to skip before doing any imports which would use triton, since
 # triton won't be available on CPU builds and torch < 2.5
 if not (
-    TORCH_VERSION_AT_LEAST_2_5
+    TORCH_VERSION_AT_LEAST_2_7
     and torch.cuda.is_available()
     and torch.cuda.get_device_capability()[0] >= 9
 ):
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
 
+pytest.importorskip("triton", reason="Triton required to run this test")
 
 from torchao.float8.config import (
     Float8LinearConfig,
@@ -39,6 +33,11 @@
     _emulated_mxfp8_scaled_grouped_mm_2d_3d,
     _scaled_grouped_mm,
 )
+from torchao.prototype.moe_training.utils import (
+    _to_mxfp8_per_group_colwise,
+    _to_mxfp8_per_group_rowwise,
+    generate_jagged_offs,
+)
 from torchao.prototype.mx_formats.mx_tensor import to_mx
 from torchao.testing.utils import skip_if_rocm
 
```
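The reordering here matters because `pytest.importorskip("triton")` itself attempts the triton import, which fails outright on CPU-only builds; per the file's own comment, the version/CUDA gate must run before any import that would pull in triton. A minimal, self-contained sketch of that gating pattern, with a crude stand-in for torchao's `TORCH_VERSION_AT_LEAST_2_7` flag:

```python
# Sketch of the module-level gating pattern, assuming a pytest test file.
import pytest
import torch

# Crude stand-in for torchao.utils.TORCH_VERSION_AT_LEAST_2_7 (plain string
# comparison is not robust across versions like "2.10"; illustration only).
TORCH_VERSION_OK = torch.__version__ >= "2.7"

if not (
    TORCH_VERSION_OK
    and torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 9  # SM90 (Hopper) or newer
):
    # Skips the entire module at collection time, before any
    # triton-dependent import below has a chance to run.
    pytest.skip("Unsupported PyTorch version", allow_module_level=True)

# Only now is it safe to probe for triton itself.
pytest.importorskip("triton", reason="Triton required to run this test")
```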

test/prototype/moe_training/test_training.py

7 additions, 1 deletion

```diff
@@ -34,7 +34,8 @@
         ["does.not.exist"],
     ],
 )
-def test_moe_float8_training(target_fqns: list[str]):
+@pytest.mark.parametrize("compile", [False, True])
+def test_moe_float8_training(target_fqns: list[str], compile: bool):
     model_args = TransformerModelArgs(
         moe_enabled=True,
         num_experts=8,
@@ -72,6 +73,11 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         target_fqns=target_fqns,
     )
 
+    if compile:
+        # TODO: compile with fullgraph=True when torchtitan llama4 moe supports it
+        model = torch.compile(model, fullgraph=False)
+        ref_model = torch.compile(ref_model, fullgraph=False)
+
     # inputs
     batch, seq, dim = 8, 2048, 256
     ref_x = torch.randn(
```
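The pattern this diff adds, running one test body in both eager and compiled mode via a `compile` parameter, is easy to reuse elsewhere. A minimal sketch under the assumption of a plain `nn.Linear` rather than the torchao MoE model:

```python
# Hypothetical standalone example of the eager-vs-compiled parametrization;
# not the torchao test itself.
import pytest
import torch
import torch.nn as nn


@pytest.mark.parametrize("compile", [False, True])
def test_linear_matches_eager(compile: bool):
    torch.manual_seed(0)
    model = nn.Linear(16, 16)
    ref_model = nn.Linear(16, 16)
    ref_model.load_state_dict(model.state_dict())

    if compile:
        # fullgraph=False tolerates graph breaks, mirroring the diff's TODO.
        model = torch.compile(model, fullgraph=False)

    x = torch.randn(4, 16)
    # Compiled and eager paths should agree numerically.
    torch.testing.assert_close(model(x), ref_model(x))
```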

torchao/prototype/moe_training/kernels/jagged_float8_scales.py

6 additions, 2 deletions

```diff
@@ -42,7 +42,10 @@
     for block_size_cols in block_sizes
 ]
 
+from torch.library import triton_op, wrap_triton
 
+
+@triton_op("torchao::triton_fp8_row_major_jagged_rowwise_scales", mutates_args={})
 def triton_fp8_row_major_jagged_rowwise_scales(
     hp_tensor: torch.Tensor,
     offsets: torch.Tensor,
@@ -90,7 +93,7 @@ def triton_fp8_row_major_jagged_rowwise_scales(
         triton.cdiv(m, meta["BLOCK_SIZE_ROWS"]),
         offsets.numel(),
     )
-    _triton_fp8_row_major_jagged_rowwise_scales[grid](
+    wrap_triton(_triton_fp8_row_major_jagged_rowwise_scales)[grid](
         hp_tensor,
         offsets,
         output_buffer,
@@ -204,6 +207,7 @@ def _triton_fp8_row_major_jagged_rowwise_scales(
     tl.store(out_ptr + out_offs, fp8_data, mask=block_mask)
 
 
+@triton_op("torchao::triton_fp8_col_major_jagged_colwise_scales", mutates_args={})
def triton_fp8_col_major_jagged_colwise_scales(
     hp_tensor: torch.Tensor,
     offsets: torch.Tensor,
@@ -251,7 +255,7 @@ def triton_fp8_col_major_jagged_colwise_scales(
         triton.cdiv(n, meta["BLOCK_SIZE_COLS"]),
         offsets.numel(),
     )
-    _triton_fp8_col_major_jagged_colwise_scales[grid](
+    wrap_triton(_triton_fp8_col_major_jagged_colwise_scales)[grid](
         hp_tensor,
         offsets,
         output_buffer,
```
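The `triton_op`/`wrap_triton` pair from `torch.library` is what makes these raw Triton kernels compatible with `torch.compile`: `triton_op` registers the Python wrapper as a custom op, and `wrap_triton` wraps the kernel launch so the tracer can capture it instead of graph-breaking. A minimal sketch with a hypothetical `mylib::add` op; the kernel and names are illustrative, not torchao code:

```python
import torch
import triton
import triton.language as tl
from torch.library import triton_op, wrap_triton


@triton.jit
def _add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Standard elementwise-add Triton kernel.
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < n_elements
    x = tl.load(x_ptr + offs, mask=mask)
    y = tl.load(y_ptr + offs, mask=mask)
    tl.store(out_ptr + offs, x + y, mask=mask)


@triton_op("mylib::add", mutates_args={})
def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    n = out.numel()
    grid = (triton.cdiv(n, 1024),)
    # wrap_triton makes the raw kernel launch visible to torch.compile
    # tracing, mirroring the change in the diff above.
    wrap_triton(_add_kernel)[grid](x, y, out, n, BLOCK_SIZE=1024)
    return out
```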

torchao/prototype/moe_training/tensor.py

3 additions, 1 deletion

```diff
@@ -123,7 +123,9 @@ def __repr__(self):
         return f"ScaledGroupedMMTensor(data={self._data})"
 
     def __tensor_flatten__(self):
-        return ["_data"]
+        # Metadata is empty but needed to make the subclass traceable for torch.compile.
+        metadata = {}
+        return ["_data"], metadata
 
     @staticmethod
     def __tensor_unflatten__(inner_tensors, flatten_spec, outer_size, outer_stride):
```
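Returning a `(tensor_attr_names, metadata)` pair from `__tensor_flatten__`, even with empty metadata, is part of the contract `torch.compile` uses to decompose and rebuild wrapper subclasses. A minimal sketch of the round-trip with a hypothetical `MyWrapperTensor`; ScaledGroupedMMTensor's real implementation carries additional logic:

```python
import torch


class MyWrapperTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, data: torch.Tensor):
        # Wrapper subclass whose outer shape/dtype/device mirror the inner tensor.
        return torch.Tensor._make_wrapper_subclass(
            cls, data.shape, dtype=data.dtype, device=data.device
        )

    def __init__(self, data: torch.Tensor):
        self._data = data

    def __tensor_flatten__(self):
        # (names of inner-tensor attributes, extra metadata); torch.compile
        # requires both elements even when the metadata dict is empty.
        return ["_data"], {}

    @staticmethod
    def __tensor_unflatten__(inner_tensors, metadata, outer_size, outer_stride):
        # Rebuild the subclass from the flattened inner tensors.
        return MyWrapperTensor(inner_tensors["_data"])
```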
