
Commit 9df1f8d

jananisriram authored and facebook-github-bot committed
Refactor fp8_gemm benchmark to simplify addition of new scaling modes (#500)
Summary: Refactor the `fp8_gemm` benchmark in TritonBench to accept scaling modes as an argument. This diff enables us to extend the `fp8_gemm` benchmark to new scaling modes without adding new benchmarking arguments.

Reviewed By: NikhilAPatel

Differential Revision: D83617233
1 parent d8b41f2 commit 9df1f8d
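
The user-visible change is that the boolean `--scaling_rowwise` flag becomes a string-valued `--scaling-mode` argument mapped onto `ScalingMode` by a small helper, so a future mode only needs a new branch in that helper rather than a new CLI flag. A minimal self-contained sketch of the pattern (the `ScalingMode` enum below is a stand-in; the real `torch._inductor.kernel.mm.ScalingMode` may use different member values):

import argparse
from enum import IntEnum


class ScalingMode(IntEnum):  # stand-in for torch._inductor.kernel.mm.ScalingMode
    TENSOR = 0
    ROW = 1


def get_scaling_mode_int(scaling_mode: str) -> int:
    # Mirrors the helper added in this diff: one branch per supported mode.
    if scaling_mode == "tensor":
        return ScalingMode.TENSOR
    elif scaling_mode == "row":
        return ScalingMode.ROW
    else:
        raise ValueError(f"Invalid scaling mode: {scaling_mode}")


parser = argparse.ArgumentParser()
parser.add_argument("--scaling-mode", type=str, default="tensor")
args = parser.parse_args(["--scaling-mode", "row"])
# argparse converts the dash to an underscore: the value lands on args.scaling_mode.
print(get_scaling_mode_int(args.scaling_mode).value)  # 1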

File tree

2 files changed: +28 −14 lines


tritonbench/operators/fp8_gemm/fp8_gemm.py

Lines changed: 20 additions & 7 deletions
@@ -1,4 +1,5 @@
 import argparse
+
 import logging
 
 from typing import Any, Callable, List, Optional
@@ -7,6 +8,8 @@
 import torch._inductor.config as inductor_config
 import triton
 
+from torch._inductor.kernel.mm import ScalingMode
+
 from tritonbench.operators.fp8_gemm.persistent import blackwell_persistent_tma
 from tritonbench.utils.env_utils import get_nvidia_gpu_model, is_cuda
 
@@ -46,7 +49,7 @@
 def parse_args(args):
     parser = argparse.ArgumentParser(description="TritonBench fp8_gemm")
     parser.add_argument("--llama", action="store_true")
-    parser.add_argument("--scaling_rowwise", action="store_true")
+    parser.add_argument("--scaling-mode", type=str, default="tensor")
     parser.add_argument("--m", type=int)
     parser.add_argument("--k", type=int)
     parser.add_argument("--n", type=int)
@@ -55,6 +58,15 @@ def parse_args(args):
     return parser.parse_args(args)
 
 
+def get_scaling_mode_int(scaling_mode: str) -> int:
+    if scaling_mode == "tensor":
+        return ScalingMode.TENSOR
+    elif scaling_mode == "row":
+        return ScalingMode.ROW
+    else:
+        raise ValueError(f"Invalid scaling mode: {scaling_mode}")
+
+
 class Operator(BenchmarkOperator):
     DEFAULT_METRICS = ["tflops", "gbps", "latency"]
     DEFAULT_PRECISION = "fp8"
@@ -65,11 +77,12 @@ def __init__(
         super().__init__(tb_args, extra_args)
         self.extra_args = parse_args(extra_args)
 
+        self.scaling_mode_int = get_scaling_mode_int(self.extra_args.scaling_mode).value
+
     def _get_dtype(self):
-        if self.extra_args.scaling_rowwise:
-            return torch.bfloat16
-        else:
+        if self.scaling_mode_int == ScalingMode.TENSOR:
             return torch.float16
+        return torch.bfloat16
 
     def get_input_iter(self):
         def _get_scale_per_tensor(
@@ -102,10 +115,10 @@ def args(m, n, k):
            a = torch.randn(m, k, device=self.device).to(self._get_dtype())
            b = torch.randn(n, k, device=self.device).to(self._get_dtype())
 
-           if self.extra_args.scaling_rowwise:
+           if self.scaling_mode_int == ScalingMode.ROW:
                scale_a = _get_scale_per_row(a)
                scale_b = _get_scale_per_row(b)
-           else:
+           else:  # self.scaling_mode_int == ScalingMode.TENSOR
                scale_a = _get_scale_per_tensor(
                    a, custom_scale=self.extra_args.per_tensor_scale_a
                )
@@ -191,7 +204,7 @@ def blackwell_persistent_tma_fp8_gemm(self, a, b, scale_a, scale_b):
            scale_a,
            scale_b,
            self._get_dtype(),
-           self.extra_args.scaling_rowwise,
+           self.scaling_mode_int,
        )
 
    @register_benchmark(enabled=True)
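
One subtlety worth noting in the hunks above: `__init__` stores the enum's `.value` (a plain int) in `self.scaling_mode_int`, yet `_get_dtype` and `get_input_iter` compare that int directly against `ScalingMode.TENSOR` and `ScalingMode.ROW`. Those comparisons only behave as intended if `ScalingMode` is an `IntEnum` (which this diff implies); with a plain `Enum`, an int never compares equal to a member. A quick illustration:

from enum import Enum, IntEnum


class PlainMode(Enum):
    TENSOR = 0


class IntMode(IntEnum):
    TENSOR = 0


# A plain Enum member never equals its underlying int value...
print(PlainMode.TENSOR.value == PlainMode.TENSOR)  # False
# ...but an IntEnum member does, which is what the diff's comparisons rely on.
print(IntMode.TENSOR.value == IntMode.TENSOR)  # True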

tritonbench/operators/fp8_gemm/persistent.py

Lines changed: 8 additions & 7 deletions
@@ -1,10 +1,13 @@
 from functools import lru_cache
+
 from typing import Optional
 
 import torch
 import triton
 import triton.language as tl
 
+from torch._inductor.kernel.mm import ScalingMode
+
 from tritonbench.utils.env_utils import is_cuda
 from tritonbench.utils.triton_utils import has_experimental_descriptor
 
@@ -410,9 +413,7 @@ def matmul_tma_persistent(a, b, c, desc_a, desc_b, desc_c):
 # - 1 warp = 32 threads, so each thread block requires 128 / 32 = 4 warps
 
 
-def blackwell_persistent_tma(
-    a, b, scale_a_ptr, scale_b_ptr, acc_dtype, scaling_rowwise
-):
+def blackwell_persistent_tma(a, b, scale_a_ptr, scale_b_ptr, acc_dtype, scaling_mode):
     configs = matmul_configs_blackwell()
 
     # Check constraints.
@@ -471,7 +472,7 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
        NUM_SMS=NUM_SMS,  #
        num_stages=configs[shape_dtype]["num_stages"],  #
        num_warps=configs[shape_dtype]["num_warps"],  #
-       SCALING_ROWWISE=scaling_rowwise,
+       SCALING_MODE=scaling_mode,  #
        WARP_SPECIALIZE=configs[shape_dtype]["WARP_SPECIALIZE"],  #
        EPILOGUE_SUBTILE=configs[shape_dtype]["EPILOGUE_SUBTILE"],  #
    )
@@ -504,7 +505,7 @@ def blackwell_persistent_tma_kernel(
    GROUP_SIZE_M: tl.constexpr,  #
    ACC_TYPE: tl.constexpr,
    NUM_SMS: tl.constexpr,
-   SCALING_ROWWISE: tl.constexpr,  #
+   SCALING_MODE: tl.constexpr,  #
    WARP_SPECIALIZE: tl.constexpr,
    EPILOGUE_SUBTILE: tl.constexpr,
 ):  #
@@ -538,7 +539,7 @@ def blackwell_persistent_tma_kernel(
    tile_id_c = start_pid - NUM_SMS
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
 
-   if SCALING_ROWWISE:
+   if SCALING_MODE == ScalingMode.ROW:
        # For row-wise scaling, we'll use the pointers as-is
        scale_a = scale_a_ptr
        scale_b = scale_b_ptr
@@ -563,7 +564,7 @@ def blackwell_persistent_tma_kernel(
        b_block = b_desc.load([offs_bn, offs_k])
        accumulator = tl.dot(a_block, b_block.T, accumulator, out_dtype=tl.float32)
 
-       if SCALING_ROWWISE:
+       if SCALING_MODE == ScalingMode.ROW:
            offs_scale_m = offs_am + tl.arange(0, BLOCK_SIZE_M)
            offs_scale_n = offs_bn + tl.arange(0, BLOCK_SIZE_N)
 
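
On the kernel side, `SCALING_MODE` replaces the boolean `SCALING_ROWWISE` as a `tl.constexpr`, so `if SCALING_MODE == ScalingMode.ROW:` is still resolved at compile time and the untaken branch is compiled out; adding a mode just adds another constexpr-specialized kernel variant. A toy sketch of the pattern (hypothetical kernel, not the benchmark's; requires a CUDA device):

import torch
import triton
import triton.language as tl


@triton.jit
def scale_kernel(x_ptr, out_ptr, N, SCALING_MODE: tl.constexpr, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < N
    x = tl.load(x_ptr + offs, mask=mask)
    # constexpr comparison: Triton specializes the kernel per SCALING_MODE value,
    # so only the selected arm exists in the compiled binary.
    if SCALING_MODE == 1:  # e.g. ScalingMode.ROW passed down as its int value
        x = x * 2.0
    else:  # e.g. ScalingMode.TENSOR
        x = x * 0.5
    tl.store(out_ptr + offs, x, mask=mask)


x = torch.randn(1024, device="cuda")
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 256),)
scale_kernel[grid](x, out, x.numel(), SCALING_MODE=1, BLOCK=256)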
