pytorch
diff --git a/benchmarks/prototype/moe_training/benchmark_kernels.py b/benchmarks/prototype/moe_training/benchmark_kernels.py
Lines changed: 0 additions & 193 deletions
diff --git a/benchmarks/prototype/moe_training/benchmark_per_group_scaling_kernels.py b/benchmarks/prototype/moe_training/benchmark_per_group_scaling_kernels.py
Lines changed: 5 additions & 2 deletions
@@ -49,8 +49,8 @@ class Experiment:
 
 
 def get_configs() -> List[ExperimentConfig]:
-    input_shapes = [(2**8, 4096), (2**12, 4096), (2**16, 4096)]
-    n_groups_list = [4, 8, 16]
+    input_shapes = [(16640, 5120)]  # (Mg, K)
+    n_groups_list = [16, 128]
     high_precision_dtypes = [torch.bfloat16]
     configs = []
     for input_shape, n_groups, high_precision_dtype in itertools.product(
@@ -129,6 +129,7 @@ def run_triton(
 
     # bench torch
     compiled_run_torch = torch.compile(run_torch)
+    warmup(compiled_run_torch, input_row_major, input_col_major, offs)
     torch_time_us = benchmark_cuda_function_in_microseconds(
         compiled_run_torch, input_row_major, input_col_major, offs
     )
@@ -152,6 +153,7 @@ def print_results(experiments: List[Experiment]):
         "high_precision_dtype",
         "torch_time_us",
         "triton_time_us",
+        "triton_speedup",
     ]
     rows = []
     for experiment in experiments:
@@ -165,6 +167,7 @@ def print_results(experiments: List[Experiment]):
                 experiment.config.high_precision_dtype,
                 experiment.result.torch_time_us,
                 experiment.result.triton_time_us,
+                f"{experiment.result.torch_time_us / experiment.result.triton_time_us:.2f}x",
             ]
         )
     print(tabulate(rows, headers=headers))