
Commit c88ebe8

fix float8 training benchmarks on AMD (#2737)
Summary: Small fixes to make the float8 training rowwise benchmarks work properly on AMD GPUs, ensuring the right float8 flavor (`torch.float8_e4m3fnuz` instead of `torch.float8_e4m3fn`) is used on MI300-class hardware.

Test Plan:

```bash
python benchmarks/float8/float8_roofline.py ~/local/tmp/20250811_amd_mi300x_rowwise_with_gw_hp.csv --float8_recipe_name rowwise_with_gw_hp --shape_gen_name pow2_extended
```

MI300x results: https://gist.github.com/vkuzo/586af24b4c9a90f107590ba5e96dd7eb

H100 results: https://gist.github.com/vkuzo/586af24b4c9a90f107590ba5e96dd7eb
1 parent d7f7bf2 commit c88ebe8
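
The essence of the fix is selecting the float8 "flavor" the hardware actually supports: NVIDIA GPUs use `torch.float8_e4m3fn`, while AMD MI300-series GPUs use the `fnuz` variant. The sketch below pulls the check added in both benchmark files into a standalone helper; the helper name `pick_e4m3_dtype` is hypothetical, but `torch.version.hip`, `torch.cuda.is_available()`, `is_MI300` from `torchao.utils`, and the two dtypes are exactly what the patch uses.

```python
import torch

from torchao.utils import is_MI300


def pick_e4m3_dtype() -> torch.dtype:
    # Hypothetical helper illustrating the check added in both benchmark files:
    # default to the standard e4m3 format, and switch to the fnuz variant when
    # running under ROCm on an MI300-class GPU.
    e4m3_dtype = torch.float8_e4m3fn
    if torch.version.hip and torch.cuda.is_available() and is_MI300():
        e4m3_dtype = torch.float8_e4m3fnuz
    return e4m3_dtype
```

In the commit itself this check is inlined in each file rather than shared as a helper; the sketch just makes the branch easier to read.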

File tree

2 files changed: +16 -5 lines

benchmarks/float8/bench_matmul.py

Lines changed: 7 additions & 2 deletions

```diff
@@ -18,6 +18,7 @@
 from torchao.ops import mx_fp4_bf16
 from torchao.prototype.mx_formats.mx_tensor import to_mx
 from torchao.testing.training.roofline_utils import get_specs
+from torchao.utils import is_MI300


 @torch.inference_mode()
@@ -46,6 +47,7 @@ def run(
     bf16_peak_tops = specs["bf16_peak_tops"]
     fp8_peak_tops = specs["fp8_peak_tops"]
     fp4_peak_tops = specs.get("fp4_peak_tops", 0.0)  # only on sm120
+    print(f"recipe: {recipe}")
     print(f"gpu_name: {torch.cuda.get_device_name(0)}")
     print(
         f"peak tops: bf16 {bf16_peak_tops:.2e}, fp8 {fp8_peak_tops:.2e}, fp4 {fp4_peak_tops:.2e}"
@@ -56,8 +58,8 @@ def run(
         "M",
         "K",
         "N",
+        "ref_time_s",
         "time_s",
-        "speedup",
         "fp8_speedup",
     )
     results = []
@@ -106,7 +108,10 @@ def run(
         else:
             # raw float8 matmul (upper bound for what we can achive in eager mode)
             # TODO(future): add e5m2
-            d1, d2, d3 = torch.float8_e4m3fn, torch.float8_e4m3fn, dtype
+            e4m3_dtype = torch.float8_e4m3fn
+            if torch.version.hip and torch.cuda.is_available() and is_MI300():
+                e4m3_dtype = torch.float8_e4m3fnuz
+            d1, d2, d3 = e4m3_dtype, e4m3_dtype, dtype
             A = A_hp.to(d1)
             B = B_hp_t.to(d2).contiguous().T
             peak_tops = fp8_peak_tops
```
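
For context, the `else:` branch above times a raw float8 matmul as an eager-mode upper bound. A rough sketch of such a measurement on a recent PyTorch build is shown below, using the platform-appropriate e4m3 dtype; the shapes, unit scales, and the direct `torch._scaled_mm` call are illustrative assumptions, not necessarily the benchmark's exact code path.

```python
import torch

M, K, N = 4096, 4096, 4096
device = "cuda"

# high-precision operands, as in the benchmark
A_hp = torch.randn(M, K, device=device, dtype=torch.bfloat16)
B_hp_t = torch.randn(N, K, device=device, dtype=torch.bfloat16)

# pick the e4m3 flavor for this GPU (see the earlier sketch); hardcoded here
d1 = d2 = torch.float8_e4m3fn  # torch.float8_e4m3fnuz on MI300

A = A_hp.to(d1)                   # row-major M x K float8 operand
B = B_hp_t.to(d2).contiguous().T  # column-major K x N, as _scaled_mm expects

# unit per-tensor scales keep the sketch simple; a real run would derive the
# scales from the tensors' absolute maxima
scale_a = torch.tensor(1.0, device=device)
scale_b = torch.tensor(1.0, device=device)

out = torch._scaled_mm(A, B, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
print(out.shape)  # torch.Size([4096, 4096])
```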

benchmarks/float8/float8_roofline.py

Lines changed: 9 additions & 3 deletions

```diff
@@ -67,6 +67,7 @@
     get_float8_mem_sympy,
     get_gemm_time_sympy,
 )
+from torchao.utils import is_MI300


 class LNLinearSigmoid(torch.nn.Module):
@@ -161,7 +162,10 @@ def get_gemm_times(
     if float8_recipe_name == "rowwise_with_gw_hp" and gemm_role == "grad_weight":
         f8_time_s = bf16_time_s
     else:
-        d1, d2, d3 = torch.float8_e4m3fn, torch.float8_e4m3fn, torch.bfloat16
+        e4m3_dtype = torch.float8_e4m3fn
+        if torch.version.hip and torch.cuda.is_available() and is_MI300():
+            e4m3_dtype = torch.float8_e4m3fnuz
+        d1, d2, d3 = e4m3_dtype, e4m3_dtype, torch.bfloat16
         A = torch.zeros(M, K, device=device, dtype=d1)
         B = torch.zeros(K, N, device=device, dtype=d2).t().contiguous().t()
         if float8_recipe_name == "tensorwise":
@@ -236,9 +240,11 @@ def run(
         mx_recipe_name,
         enable_fusion_modeling,
     )
-    bf16_gemm_time_sympy = get_gemm_time_sympy(M, K, N, torch.bfloat16, None, None)
+    bf16_gemm_time_sympy = get_gemm_time_sympy(
+        M, K, N, torch.bfloat16, None, None, None
+    )
     fp8_gemm_time_sympy = get_gemm_time_sympy(
-        M, K, N, torch.float8_e4m3fn, float8_recipe_name, mx_recipe_name
+        M, K, N, torch.float8_e4m3fn, float8_recipe_name, mx_recipe_name, None
     )
     print("bf16_gemm_time_sympy", bf16_gemm_time_sympy)
     print("fp8_gemm_time_sympy", fp8_gemm_time_sympy)
```
