    HAS_TMA = False
    logger.warning(f"Failed to import TMA: {e}")

+HAS_CUDA_129 = (
+    torch.cuda.is_available() and torch.version.cuda and torch.version.cuda >= "12.9"
+)
+

def parse_args(args):
    parser = argparse.ArgumentParser(description="TritonBench fp8_gemm")
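Not part of the diff: a minimal sketch of the same CUDA-version gate written as a numeric comparison rather than a string comparison, so that a hypothetical version such as "12.10" would still compare correctly against "12.9". The helper name cuda_at_least is illustrative only; the PR itself keeps the simpler string comparison above.

import torch

def _version_tuple(v: str):
    # torch.version.cuda is a dotted string such as "12.4" (None on CPU-only builds).
    return tuple(int(part) for part in v.split("."))

def cuda_at_least(required: str) -> bool:
    cuda = torch.version.cuda
    if not (torch.cuda.is_available() and cuda):
        return False
    return _version_tuple(cuda) >= _version_tuple(required)

HAS_CUDA_129 = cuda_at_least("12.9")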
@@ -63,6 +67,10 @@ def get_scaling_recipe_int(scaling_recipe: str) -> int:
        return ScalingType.TensorWise
    elif scaling_recipe == "RowWise":
        return ScalingType.RowWise
+    elif scaling_recipe == "BlockWise1x128":
+        return ScalingType.BlockWise1x128
+    elif scaling_recipe == "BlockWise128x128":
+        return ScalingType.BlockWise128x128
    else:
        raise ValueError(f"Invalid scaling recipe: {scaling_recipe}")

@@ -97,11 +105,25 @@ def _get_scale_per_row(x: torch.Tensor, transpose: bool = False) -> torch.Tensor
            torch.float32
        )  # For row-wise scaling, kernel requires a float32 scale tensor

+    def _get_scale_per_block(x: torch.Tensor, block_outer: int, block_inner: int):
+        x = x.unflatten(1, (-1, block_inner)).unflatten(0, (-1, block_outer))
+        amax = x.abs().amax(dim=[1, 3], keepdim=True).float()
+        scale = torch.finfo(torch.float8_e4m3fn).max / amax
+        x = (
+            x.mul(scale).flatten(2, 3).flatten(0, 1)
+        )  # scale input up to dynamic range of float8_e4m3fn
+        scale = scale.flatten(2, 3).flatten(0, 1)
+        return x, scale.to(torch.float32)
+
    match scaling_recipe_int:
        case ScalingType.TensorWise:
            return _get_scale_per_tensor(x, custom_scale=custom_scale)
        case ScalingType.RowWise:
            return _get_scale_per_row(x, transpose=transpose)
+        case ScalingType.BlockWise1x128:
+            return _get_scale_per_block(x, 1, 128)
+        case ScalingType.BlockWise128x128:
+            return _get_scale_per_block(x, 128, 128)
        case _:
            raise AssertionError(f"Unsupported scaling type {scaling_recipe_int}")

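Not part of the diff: a small, standalone sketch of the block-wise scaling added above, showing the scale-tensor shapes the two new recipes produce. The 256x512 input size is an illustrative assumption, and the helper is restated at module level so the snippet runs on its own.

import torch

def get_scale_per_block(x: torch.Tensor, block_outer: int, block_inner: int):
    # Reshape (M, K) into (M/block_outer, block_outer, K/block_inner, block_inner) tiles.
    x = x.unflatten(1, (-1, block_inner)).unflatten(0, (-1, block_outer))
    # One amax, and hence one scale, per tile.
    amax = x.abs().amax(dim=[1, 3], keepdim=True).float()
    scale = torch.finfo(torch.float8_e4m3fn).max / amax
    # Scale the input up to the fp8 e4m3 dynamic range and restore the 2-D layout.
    x = x.mul(scale).flatten(2, 3).flatten(0, 1)
    scale = scale.flatten(2, 3).flatten(0, 1)
    return x, scale.to(torch.float32)

x = torch.randn(256, 512)
_, s_1x128 = get_scale_per_block(x, 1, 128)      # one scale per 1x128 strip
_, s_128x128 = get_scale_per_block(x, 128, 128)  # one scale per 128x128 tile
print(s_1x128.shape)    # torch.Size([256, 4])
print(s_128x128.shape)  # torch.Size([2, 4])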
@@ -127,6 +149,19 @@ def __init__(
        self.scaling_recipe_a_int = get_scaling_recipe_int(scaling_recipe_a).value
        self.scaling_recipe_b_int = get_scaling_recipe_int(scaling_recipe_b).value

+        blockwise_scaling_types = [
+            ScalingType.BlockWise1x128,
+            ScalingType.BlockWise128x128,
+        ]
+        self.contains_blockwise_scaling = (
+            self.scaling_recipe_a_int in blockwise_scaling_types
+            or self.scaling_recipe_b_int in blockwise_scaling_types
+        )
+
+        self.use_fast_accum = (
+            False if self.contains_blockwise_scaling else True
+        )  # BlockWise scaled_gemm does not support use_fast_accum=True
+
    def _get_dtype(self):
        if (
            self.scaling_recipe_a_int == ScalingType.TensorWise
@@ -189,12 +224,16 @@ def get_x_val(self, example_inputs) -> float:

    @register_benchmark(baseline=True)
    def torch_fp8_gemm(self, a, b, scale_a, scale_b):
+        assert (
+            not self.contains_blockwise_scaling or HAS_CUDA_129
+        ), "BlockWise scaling variants for scaled_gemm require CUDA 12.9+"
+
        return lambda: torch._scaled_mm(
            a,
            b.t(),
            scale_a,
            scale_b.t(),
-            use_fast_accum=True,
+            use_fast_accum=self.use_fast_accum,
            out_dtype=self._get_dtype(),
        )

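Not part of the diff: a hedged, self-contained sketch of a torch._scaled_mm call with tensor-wise scales, mirroring the argument order used by torch_fp8_gemm above. The shapes, the dequantization-scale convention, and the fp8-capable-GPU requirement are assumptions for illustration, not taken from this PR.

import torch

M, K, N = 256, 512, 128  # divisible by 16, as fp8 GEMM requires
a = torch.randn(M, K, device="cuda")
b = torch.randn(N, K, device="cuda")

# Tensor-wise quantization: scale inputs up to the fp8 e4m3 dynamic range.
q_a = torch.finfo(torch.float8_e4m3fn).max / a.abs().max().float()
q_b = torch.finfo(torch.float8_e4m3fn).max / b.abs().max().float()
a_fp8 = (a * q_a).to(torch.float8_e4m3fn)
b_fp8 = (b * q_b).to(torch.float8_e4m3fn)

# _scaled_mm takes dequantization scales (reciprocals of the quantization scales)
# and expects the second operand in column-major layout, hence .t().
# Requires a GPU with fp8 support (e.g. sm89/sm90).
out = torch._scaled_mm(
    a_fp8,
    b_fp8.t(),
    1.0 / q_a,
    1.0 / q_b,
    out_dtype=torch.bfloat16,
    use_fast_accum=True,
)
print(out.shape)  # torch.Size([256, 128])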
@@ -211,7 +250,7 @@ def pt2_fp8_gemm(self, a, b, scale_a, scale_b) -> Callable:
            b.t(),
            scale_a,
            scale_b.t(),
-            use_fast_accum=True,
+            use_fast_accum=self.use_fast_accum,
            out_dtype=self._get_dtype(),
        )
        compiled = torch.compile(f, dynamic=False)