@@ -407,17 +407,15 @@ def matmul(a, b, activation=""):
 else:
     exit("❌ Triton and Torch differ")
 
-TORCH_HAS_FP8 = hasattr(torch, "float8_e4m3fn") or hasattr(torch, "float8_e4m3fnuz")
-
-if TORCH_HAS_FP8:
-    fp8_dtype = torch.float8_e4m3fn if is_cuda() else torch.float8_e4m3fnuz
+TORCH_HAS_FP8 = hasattr(torch, "float8_e5m2")
+if TORCH_HAS_FP8 and is_cuda():
     torch.manual_seed(0)
     a = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
     b = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
-    a = a.to(fp8_dtype)
+    a = a.to(torch.float8_e5m2)
     # pre-transpose b for efficiency.
     b = b.T
-    b = b.to(fp8_dtype)
+    b = b.to(torch.float8_e5m2)
     triton_output = matmul(a, b)
     torch_output = torch.matmul(a.to(torch.float16), b.to(torch.float16))
     print(f"triton_output_with_fp8_inputs={triton_output}")
@@ -441,7 +439,7 @@ def matmul(a, b, activation=""):
 
 configs = []
 for fp8_inputs in [False, True]:
-    if fp8_inputs and (not TORCH_HAS_FP8):
+    if fp8_inputs and (not TORCH_HAS_FP8 or not is_cuda()):
         continue
     configs.append(
         triton.testing.Benchmark(
@@ -450,8 +448,8 @@ def matmul(a, b, activation=""):
             line_arg="provider",  # Argument name whose value corresponds to a different line in the plot
             # Possible values for `line_arg`
             # Don't compare to cublas for fp8 cases as torch.matmul doesn't support fp8 at the moment.
-            line_vals=[ref_lib.lower(), "triton"],  # Label name for the lines
-            line_names=[ref_lib, "Triton"],  # Line styles
+            line_vals=["triton"] if fp8_inputs else [ref_lib.lower(), "triton"],  # Label name for the lines
+            line_names=["Triton"] if fp8_inputs else [ref_lib, "Triton"],  # Line styles
             styles=[("green", "-"), ("blue", "-")],
             ylabel="TFLOPS",  # Label name for the y-axis
             plot_name="matmul-performance-" +
@@ -465,19 +463,12 @@ def benchmark(M, N, K, provider, fp8_inputs):
     a = torch.randn((M, K), device=DEVICE, dtype=torch.float16)
     b = torch.randn((K, N), device=DEVICE, dtype=torch.float16)
     if TORCH_HAS_FP8 and fp8_inputs:
-        fp8_dtype = torch.float8_e4m3fn if is_cuda() else torch.float8_e4m3fnuz
-        a = a.to(fp8_dtype)
+        a = a.to(torch.float8_e5m2)
         b = b.T
-        b = b.to(fp8_dtype)
+        b = b.to(torch.float8_e5m2)
     quantiles = [0.5, 0.2, 0.8]
     if provider == ref_lib.lower():
-        if fp8_inputs:
-            one_device = torch.tensor(1., device=a.device, dtype=torch.float32)
-            ref_fn = lambda: torch._scaled_mm(a, b, scale_a=one_device, scale_b=one_device, out_dtype=torch.float16,
-                                              use_fast_accum=True)
-        else:
-            ref_fn = lambda: torch.matmul(a, b)
-        ms, min_ms, max_ms = triton.testing.do_bench(ref_fn, quantiles=quantiles)
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), quantiles=quantiles)
     if provider == 'triton':
         ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b), quantiles=quantiles)
     perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
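
For context, here is a minimal standalone sketch (not taken from this commit) of the gating this change relies on. It assumes a PyTorch build that exposes `torch.float8_e5m2` and a CUDA device, and it illustrates why the cuBLAS reference line is dropped for the fp8 benchmark: plain `torch.matmul` doesn't support fp8 inputs, so the reference result is computed after up-casting back to fp16.

```python
# Standalone sketch of the fp8 gating used above (assumptions: a PyTorch build
# exposing torch.float8_e5m2 and a CUDA device; behaviour may vary by version).
import torch

TORCH_HAS_FP8 = hasattr(torch, "float8_e5m2")
print(f"float8_e5m2 available: {TORCH_HAS_FP8}")

if TORCH_HAS_FP8 and torch.cuda.is_available():
    # Cast fp16 inputs down to fp8 e5m2, pre-transposing b as in the tutorial.
    a = torch.randn((64, 64), device="cuda", dtype=torch.float16).to(torch.float8_e5m2)
    b = torch.randn((64, 64), device="cuda", dtype=torch.float16).T.to(torch.float8_e5m2)
    try:
        torch.matmul(a, b)  # plain torch.matmul has no fp8 path, so this is expected to raise
    except RuntimeError as e:
        print(f"torch.matmul rejected fp8 inputs: {e}")
    # The tutorial therefore builds the reference result by up-casting back to fp16.
    ref = torch.matmul(a.to(torch.float16), b.to(torch.float16))
    print(ref.shape)
```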