@@ -1,9 +1,10 @@
 import argparse
 import logging
 
-from typing import Any, List, Optional
+from typing import Any, Callable, List, Optional
 
 import torch
+import torch._inductor.config as inductor_config
 import triton
 
 from tritonbench.utils.triton_op import (
@@ -90,6 +91,24 @@ def torch_fp8_gemm(self, a, b):
             a, b, scale_a, scale_b, use_fast_accum=True, out_dtype=torch.float16
         )
 
+    @register_benchmark()
+    def pt2_fp8_gemm(self, a, b) -> Callable:
+        torch._dynamo.reset()
+        with inductor_config.patch(
+            max_autotune=True,
+            max_autotune_gemm_backends="TRITON",
+            autotune_fallback_to_aten=False,
+        ):
+            scale_a = torch.tensor(1.0, device=a.device)
+            scale_b = torch.tensor(1.0, device=a.device)
+            f = lambda a, b: torch._scaled_mm(
+                a, b, scale_a, scale_b, use_fast_accum=True, out_dtype=torch.float16
+            )
+            compiled = torch.compile(f, dynamic=False)
+            compiled(a, b)
+
+        return lambda: compiled(a, b)
+
     @register_benchmark()
     def triton_fp8_gemm(self, a, b):
         return lambda: tutorial_matmul(a, b)
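For context, the new pt2_fp8_gemm variant compiles the same torch._scaled_mm call through Inductor with max-autotune restricted to the Triton GEMM backend, and issues one warm-up call so the lambda returned to the benchmark harness times only the already-compiled kernel. Below is a minimal standalone sketch of that pattern outside tritonbench. The device, shapes, and FP8 dtype are illustrative assumptions; it needs a GPU with FP8 support, and torch._scaled_mm is a private API whose signature (positional scales, single-tensor return) matches recent PyTorch releases. The diff's autotune_fallback_to_aten flag is omitted here since it is not available in every PyTorch version.

# Minimal standalone sketch (assumes a CUDA GPU with FP8 support and a recent
# PyTorch where torch._scaled_mm takes positional scale tensors).
import torch
import torch._inductor.config as inductor_config

M, K, N = 1024, 1024, 1024  # illustrative shapes, not taken from the benchmark

# torch._scaled_mm expects a row-major and b column-major FP8 operands.
a = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn).t()
scale_a = torch.tensor(1.0, device="cuda")
scale_b = torch.tensor(1.0, device="cuda")

torch._dynamo.reset()  # drop any previously cached compiled graphs
with inductor_config.patch(
    max_autotune=True,                    # autotune GEMM configurations
    max_autotune_gemm_backends="TRITON",  # consider only Triton templates
):
    f = lambda x, y: torch._scaled_mm(
        x, y, scale_a, scale_b, use_fast_accum=True, out_dtype=torch.float16
    )
    compiled = torch.compile(f, dynamic=False)
    compiled(a, b)  # warm-up call triggers compilation and autotuning

out = compiled(a, b)  # later calls reuse the compiled Triton kernel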