
Commit f600b83

[moe training] use llama4 shapes for kernel benchmarks (#2756)
Parent: 478c5f2

6 files changed (+22 −19 lines)

benchmarks/float8/bench_grouped_mm.py

Lines changed: 3 additions & 8 deletions

```diff
@@ -64,7 +64,7 @@ def run(
 
     # Run bf16 torch._grouped_mm baseline.
     A = torch.randn(M, K, device=device, dtype=dtype)
-    B = torch.randn(E, K, N, device=device, dtype=dtype)
+    B = torch.randn(E, N, K, device=device, dtype=dtype)
     offs = generate_jagged_offs(E, M)
     print(f"offs: {offs}")
     ref_time_sec, ref_tops_sec, ref_pct_top_peak = do_benchmarks(
@@ -73,7 +73,7 @@ def run(
         use_gpu_kernel_time,
         torch._grouped_mm,
         A,
-        B,
+        B.transpose(-2, -1),
         offs,
     )
     print(
@@ -84,12 +84,7 @@ def run(
 
     # Run scaled_grouped_mm.
     A_hp = torch.randn(M, K, device=device)
-    B_hp_t = (
-        torch.randn(E, K, N, device=device)
-        .transpose(-2, -1)
-        .contiguous()
-        .transpose(-2, -1)
-    )
+    B_hp_t = torch.randn(E, N, K, device=device).transpose(-2, -1)
 
     if recipe == "rowwise":
         # TODO: add e5m2
```
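Note on the layout change above: allocating B as (E, N, K) and taking a transposed view produces the same strides as the old allocate, transpose, contiguous, transpose chain, without the extra copy. A minimal sketch with toy sizes (not the benchmark's shapes):

```python
import torch

E, K, N = 2, 8, 4

# New approach: allocate (E, N, K), then view as (E, K, N) via transpose.
# Each expert's (K, N) matrix ends up column-major, the layout the
# benchmark passes as B to torch._grouped_mm and scaled_grouped_mm.
b_new = torch.randn(E, N, K).transpose(-2, -1)

# Old approach: allocate (E, K, N) and round-trip through a contiguous copy.
b_old = torch.randn(E, K, N).transpose(-2, -1).contiguous().transpose(-2, -1)

print(b_new.shape, b_old.shape)        # torch.Size([2, 8, 4]) for both
print(b_new.stride(), b_old.stride())  # (32, 1, 8) for both
```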

benchmarks/float8/utils.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -219,7 +219,7 @@ def get_name_to_moe_shapes_iter(
     N: Optional[int] = None,
     E: Optional[int] = None,
 ):
-    M = 8192 if M is None else M
+    M = 16640 if M is None else M
     if shape_gen_name == "llama4_17bx16e":
         # num_experts=16, dim=5120
         names_to_shapes = {
@@ -232,8 +232,8 @@ def get_name_to_moe_shapes_iter(
         # num_experts=128, dim=5120
         names_to_shapes = {
             # M, K, N, E
-            "moe.experts.w1": (M, 5120, 8192, 128),
-            "moe.experts.w2": (M, 8192, 5120, 128),
+            "moe.experts.w1": (M, 5120, 4 * 5120, 128),
+            "moe.experts.w2": (M, 4 * 5120, 5120, 128),
         }
         return names_to_shapes.items()
     elif shape_gen_name == "custom":
```
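With these changes, the default M is 16640 total tokens and the 128-expert hidden dim is spelled 4 * 5120 (= 20480). A quick sketch of the shapes the iterator now yields for llama4_17bx128e, plus the grouped-GEMM FLOPs they imply (illustrative only):

```python
# Sketch: shapes yielded for "llama4_17bx128e" at the new default M
# (total tokens routed across all experts).
M = 16640
names_to_shapes = {
    # M, K, N, E
    "moe.experts.w1": (M, 5120, 4 * 5120, 128),
    "moe.experts.w2": (M, 4 * 5120, 5120, 128),
}
for name, (m, k, n, e) in names_to_shapes.items():
    tflop = 2 * m * k * n / 1e12  # FLOPs for one grouped GEMM at these dims
    print(f"{name}: M={m} K={k} N={n} E={e} -> {tflop:.2f} TFLOP")
```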

benchmarks/prototype/moe_training/benchmark_kernels.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -49,8 +49,8 @@ class Experiment:
 
 
 def get_configs() -> List[ExperimentConfig]:
-    input_shapes = [(2**8, 4096), (2**12, 4096), (2**16, 4096)]
-    n_groups_list = [4, 8, 16]
+    input_shapes = [(16640, 5120)]  # (Mg, K)
+    n_groups_list = [16, 128]
     high_precision_dtypes = [torch.bfloat16]
     configs = []
     for input_shape, n_groups, high_precision_dtype in itertools.product(
@@ -129,6 +129,7 @@ def run_triton(
 
     # bench torch
    compiled_run_torch = torch.compile(run_torch)
+    warmup(compiled_run_torch, input_row_major, input_col_major, offs)
     torch_time_us = benchmark_cuda_function_in_microseconds(
         compiled_run_torch, input_row_major, input_col_major, offs
     )
@@ -152,6 +153,7 @@ def print_results(experiments: List[Experiment]):
         "high_precision_dtype",
         "torch_time_us",
         "triton_time_us",
+        "triton_speedup",
     ]
     rows = []
     for experiment in experiments:
@@ -165,6 +167,7 @@ def print_results(experiments: List[Experiment]):
                 experiment.config.high_precision_dtype,
                 experiment.result.torch_time_us,
                 experiment.result.triton_time_us,
+                f"{experiment.result.torch_time_us / experiment.result.triton_time_us:.2f}x",
             ]
         )
     print(tabulate(rows, headers=headers))
```
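The added warmup call runs the compiled function before timing, keeping torch.compile's first-call compilation and any autotuning out of the measured window. A minimal stand-in for the helper this benchmark uses (hypothetical signature, for illustration):

```python
import torch

def warmup(fn, *args, n_iters: int = 3):
    # Execute a few untimed iterations so one-time compilation and
    # autotuning costs are paid before measurement, then sync the GPU.
    for _ in range(n_iters):
        fn(*args)
    torch.cuda.synchronize()
```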

benchmarks/prototype/moe_training/benchmark_rowwise_3d_quant_kernels.py

Lines changed: 7 additions & 2 deletions

```diff
@@ -46,8 +46,11 @@ class Experiment:
 
 
 def get_configs() -> List[ExperimentConfig]:
-    # Llama4 and DeepSeekV3 shapes
-    input_shapes = [(8, 4096, 1024), (16, 5120 * 4, 5120)]
+    # Llama4 shapes
+    input_shapes = [
+        (16, 8192, 5120),  # w1, w3
+        (16, 5120, 8192),  # w2
+    ]
     high_precision_dtypes = [torch.bfloat16]
     configs = []
     for input_shape, high_precision_dtype in itertools.product(
@@ -117,6 +120,7 @@ def print_results(experiments: List[Experiment]):
         "input_shape",
         "torch_time_us",
         "triton_time_us",
+        "triton_speedup",
     ]
     rows = []
     for experiment in experiments:
@@ -126,6 +130,7 @@ def print_results(experiments: List[Experiment]):
                 input_shape,
                 experiment.result.torch_time_us,
                 experiment.result.triton_time_us,
+                f"{experiment.result.torch_time_us / experiment.result.triton_time_us:.2f}x",
             ]
         )
     print(tabulate(rows, headers=headers))
```
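The new input shapes are per-expert (E, N, K) weight tensors at Llama4-style dims. For orientation, the math of rowwise float8 quantization over such a tensor can be sketched in plain PyTorch; the e4m3 target dtype and the 1e-12 epsilon are assumptions of this sketch, and the repo's Triton kernel fuses the equivalent computation with tiling:

```python
import torch

# One scale per row (per expert, per output feature), derived from the
# row's absolute maximum over the K dimension.
E, N, K = 16, 8192, 5120
w = torch.randn(E, N, K, dtype=torch.bfloat16, device="cuda")

f8_max = torch.finfo(torch.float8_e4m3fn).max
row_amax = w.abs().amax(dim=-1, keepdim=True).float()  # (E, N, 1)
scale = f8_max / row_amax.clamp(min=1e-12)             # per-row scales
w_f8 = (w.float() * scale).clamp(-f8_max, f8_max).to(torch.float8_e4m3fn)
```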

torchao/prototype/moe_training/kernels/float8_rowwise.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -29,7 +29,7 @@
 block_sizes_n = [32, 128, 512]  # large dim (output_features)
 block_sizes_k = [32, 128, 512]  # small dim (input_features)
 num_warps = [8]
-num_stages = [2, 3]
+num_stages = [2, 4]
 kernel_configs_2D = [
     triton.Config(
         {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k},
```

torchao/prototype/moe_training/kernels/jagged_float8_scales.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -32,9 +32,9 @@
 }
 
 block_sizes = [1, 16, 32, 64]
-block_sizes_iter = [32, 64, 128, 256]
-num_warps = [1, 4]
-num_stages = [2, 3]
+block_sizes_iter = [64, 128, 256]
+num_warps = [4]
+num_stages = [3]
 kernel_configs_2D = [
     triton.Config(
         {"BLOCK_SIZE": block_size, "BLOCK_SIZE_ITER": block_size_iter},
```
