    HAS_TMA = False
    logger.warning(f"Failed to import TMA: {e}")

+HAS_CUDA_129 = (
+    torch.cuda.is_available() and torch.version.cuda and torch.version.cuda >= "12.9"
+)
+

def parse_args(args):
    parser = argparse.ArgumentParser(description="TritonBench fp8_gemm")
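For context, a hedged, standalone sketch (not part of the diff) of how the HAS_CUDA_129 gate above resolves; has_cuda_129 and its arguments are illustrative stand-ins, not benchmark code:

    def has_cuda_129(cuda_available: bool, cuda_version) -> bool:
        # torch.version.cuda is None on CPU-only builds, so the gate short-circuits to
        # False there; otherwise it falls back to a string comparison against "12.9".
        return bool(cuda_available and cuda_version and cuda_version >= "12.9")

    assert has_cuda_129(True, "12.9")      # CUDA 12.9 runtime passes
    assert not has_cuda_129(True, "12.4")  # older CUDA fails the check
    assert not has_cuda_129(False, None)   # CPU-only build: torch.version.cuda is None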
@@ -65,6 +69,8 @@ def get_scaling_mode_int(scaling_mode: str) -> int:
        return ScalingMode.TENSOR
    elif scaling_mode == "row":
        return ScalingMode.ROW
+    elif scaling_mode == "deepseek":
+        return ScalingMode.DEEPSEEK
    else:
        raise ValueError(f"Invalid scaling mode: {scaling_mode}")

@@ -113,11 +119,40 @@ def _get_scale_per_row(
                torch.float32
            )  # For row-wise scaling, kernel requires a float32 scale tensor

+        def _get_scale_deepseek(
+            x: torch.Tensor,
+            block_outer: int,
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            """
+            DeepSeek-style scaling on matmul A @ B uses a combination of block- and tile-wise scaling:
+            - activation tensor A: 1x128 tile-wise scaling
+            - weight tensor B: 128x128 block-wise scaling
+            """
+            block_inner = 128
+            x = x.unflatten(1, (-1, block_inner)).unflatten(0, (-1, block_outer))
+            amax = x.abs().amax(dim=[1, 3], keepdim=True).float()
+            scale = torch.finfo(torch.float8_e4m3fn).max / amax
+            x = (
+                x.mul(scale).flatten(2, 3).flatten(0, 1)
+            )  # scale input up to dynamic range of float8_e4m3fn
+            scale = scale.flatten(2, 3).flatten(0, 1)
+            return x, scale.to(torch.float32)
+
        def args(m, n, k):
            a = torch.randn(m, k, device=self.device).to(self._get_dtype())
            b = torch.randn(n, k, device=self.device).to(self._get_dtype())

-            if self.scaling_mode_int == ScalingMode.ROW:
+            if self.scaling_mode_int == ScalingMode.DEEPSEEK:
+                activations_block_outer = 1
+                weights_block_outer = 128
+
+                a, scale_a = _get_scale_deepseek(a, activations_block_outer)
+                b, scale_b = _get_scale_deepseek(b, weights_block_outer)
+
+                scale_a = (
+                    scale_a.t().contiguous().t()
+                )  # 1x128 blocks need scales to be outer-dim-major
+            elif self.scaling_mode_int == ScalingMode.ROW:
                scale_a = _get_scale_per_row(a)
                scale_b = _get_scale_per_row(b)
            else:  # self.scaling_mode_int == ScalingMode.TENSOR
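As a worked illustration of the docstring above, here is a hedged, self-contained sketch (not part of the diff) of the scale shapes the DeepSeek-style helper produces; the sizes and the standalone re-statement of the helper are assumptions for demonstration only:

    import torch

    def get_scale_deepseek(x: torch.Tensor, block_outer: int):
        # Same arithmetic as the nested helper in the diff, lifted to module scope.
        block_inner = 128
        x = x.unflatten(1, (-1, block_inner)).unflatten(0, (-1, block_outer))
        amax = x.abs().amax(dim=[1, 3], keepdim=True).float()
        scale = torch.finfo(torch.float8_e4m3fn).max / amax
        x = x.mul(scale).flatten(2, 3).flatten(0, 1)
        scale = scale.flatten(2, 3).flatten(0, 1)
        return x, scale.to(torch.float32)

    m, n, k = 256, 512, 1024               # illustrative sizes, all multiples of 128
    a = torch.randn(m, k)                  # activation: 1x128 tile-wise scaling
    b = torch.randn(n, k)                  # weight: 128x128 block-wise scaling
    _, scale_a = get_scale_deepseek(a, block_outer=1)
    _, scale_b = get_scale_deepseek(b, block_outer=128)
    print(scale_a.shape)  # torch.Size([256, 8]): one scale per 1x128 tile of A
    print(scale_b.shape)  # torch.Size([4, 8]):   one scale per 128x128 block of B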
@@ -166,12 +201,20 @@ def get_x_val(self, example_inputs) -> float:

    @register_benchmark(baseline=True)
    def torch_fp8_gemm(self, a, b, scale_a, scale_b):
+        is_scaling_deepseek = self.scaling_mode_int == ScalingMode.DEEPSEEK
+
+        assert (
+            not is_scaling_deepseek or HAS_CUDA_129
+        ), "Deepseek-style scaling (BlockWise128x128) for scaled_gemm requires CUDA 12.9+"
+
+        use_fast_accum = False if is_scaling_deepseek else True  # blockwise scaled_gemm does not support use_fast_accum=True
+
        return lambda: torch._scaled_mm(
            a,
            b.t(),
            scale_a,
            scale_b.t(),
-            use_fast_accum=True,
+            use_fast_accum=use_fast_accum,
            out_dtype=self._get_dtype(),
        )

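To make the tile/block geometry concrete, a hedged pure-PyTorch sketch (not part of the diff) of a blockwise-dequantized matmul: each scale is broadcast back over the 1x128 tile or 128x128 block it covers before a plain float32 matmul. Shapes follow the sketch above; tensor names and values are illustrative, and no CUDA 12.9 runtime or torch._scaled_mm call is required here:

    import torch

    m, n, k = 256, 512, 1024
    a_scaled = torch.randn(m, k)                    # stands in for the scaled-up activation
    b_scaled = torch.randn(n, k)                    # stands in for the scaled-up weight
    scale_a = torch.rand(m, k // 128) + 0.5         # one scale per 1x128 tile of A
    scale_b = torch.rand(n // 128, k // 128) + 0.5  # one scale per 128x128 block of B

    # Undo the scaling by broadcasting each scale over the elements it covers.
    a_deq = a_scaled / scale_a.repeat_interleave(128, dim=1)
    b_deq = b_scaled / scale_b.repeat_interleave(128, dim=0).repeat_interleave(128, dim=1)

    ref = a_deq @ b_deq.t()   # float32 reference output, shape (m, n)
    print(ref.shape)          # torch.Size([256, 512])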