@@ -54,18 +54,24 @@ def scale_dim0_dim1_reference(
     return x_hp_d0_normalized, x_hp_d1_normalized.t(), amax_dim0, amax_dim1


-def to_mx_dim0_reference(x_hp, block_size, scaling_mode=ScaleCalculationMode.FLOOR):
-    scale_d0, data_d0 = to_mx(
-        x_hp, torch.float8_e4m3fn, block_size, scaling_mode=scaling_mode
-    )
+def to_mx_dim0_reference(
+    x_hp,
+    block_size,
+    scaling_mode=ScaleCalculationMode.FLOOR,
+    target_dtype=torch.float8_e4m3fn,
+):
+    scale_d0, data_d0 = to_mx(x_hp, target_dtype, block_size, scaling_mode=scaling_mode)
     return data_d0, scale_d0


-def to_mx_dim1_reference(x_hp, block_size, scaling_mode=ScaleCalculationMode.FLOOR):
+def to_mx_dim1_reference(
+    x_hp,
+    block_size,
+    scaling_mode=ScaleCalculationMode.FLOOR,
+    target_dtype=torch.float8_e4m3fn,
+):
     x_hp = x_hp.t().contiguous()
-    scale_d1, data_d1 = to_mx(
-        x_hp, torch.float8_e4m3fn, block_size, scaling_mode=scaling_mode
-    )
+    scale_d1, data_d1 = to_mx(x_hp, target_dtype, block_size, scaling_mode=scaling_mode)
     return data_d1.t(), scale_d1

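A quick usage sketch for the updated helpers (illustrative only, not part of the diff; the block size of 32 is the standard MX block size and an assumption here):

    x = torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda")
    # default target_dtype keeps the previous hard-coded mxfp8 behavior
    data_fp8, scale_fp8 = to_mx_dim0_reference(x, 32)
    # new: the same helper can target mxfp4 via the packed fp4 dtype
    data_fp4, scale_fp4 = to_mx_dim0_reference(x, 32, target_dtype=torch.float4_e2m1fn_x2)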
@@ -88,13 +94,14 @@ def run(
         "dim0",
         "dim1",
         "dim0_dim1",
-        "dim0_mx_floor",
-        "dim0_mx_rceil",
-        "dim1_mx_floor",
-        "dim1_mx_rceil",
-        "dim1_mx_triton_floor",
-        "dim1_mx_cuda_floor",
-        "dim1_mx_cuda_rceil",
+        "dim0_mxfp8_floor",
+        "dim0_mxfp4_floor",
+        "dim0_mxfp8_rceil",
+        "dim1_mxfp8_floor",
+        "dim1_mxfp8_rceil",
+        "dim1_mxfp8_triton_floor",
+        "dim1_mxfp8_cuda_floor",
+        "dim1_mxfp8_cuda_rceil",
     )

     x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda") * 1000
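With the renames above, exercising the new mxfp4 path would look roughly like this (hypothetical direct call of the entry point; the M/K values are arbitrary and run()'s full signature is not shown in this diff):

    run(M=16384, K=16384, mode="dim0_mxfp4_floor")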
@@ -154,7 +161,7 @@ def run(
         )
         bps = bytes_rw / (time_us / 1e6)

-    elif mode == "dim0_mx_floor":
+    elif mode == "dim0_mxfp8_floor":
         to_mx_dim0_reference_c = torch.compile(to_mx_dim0_reference)
         y_d0, s_d0 = to_mx_dim0_reference_c(x, BLOCK_SIZE)

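Each compiled branch in this file follows the same measurement recipe; pulled out in isolation it reads (a sketch reusing the file's own names, nothing new):

    # compile the reference op, trigger compilation with one eager call,
    # warm up twice, then time with the CUDA-event-based helper
    to_mx_dim0_reference_c = torch.compile(to_mx_dim0_reference)
    y_d0, s_d0 = to_mx_dim0_reference_c(x, BLOCK_SIZE)
    for _ in range(2):
        __ = to_mx_dim0_reference_c(x, BLOCK_SIZE)
    time_us = benchmark_cuda_function_in_microseconds(
        lambda x, b: to_mx_dim0_reference_c(x, BLOCK_SIZE), x, BLOCK_SIZE
    )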
@@ -172,7 +179,32 @@ def run(
         bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)

-    elif mode == "dim0_mx_rceil":
+    elif mode == "dim0_mxfp4_floor":
+        to_mx_dim0_reference_c = torch.compile(to_mx_dim0_reference)
+        y_d0, s_d0 = to_mx_dim0_reference_c(
+            x, BLOCK_SIZE, target_dtype=torch.float4_e2m1fn_x2
+        )
+
+        for _ in range(2):
+            __ = to_mx_dim0_reference_c(
+                x, BLOCK_SIZE, target_dtype=torch.float4_e2m1fn_x2
+            )
+        time_us = benchmark_cuda_function_in_microseconds(
+            lambda x, b: to_mx_dim0_reference_c(
+                x, BLOCK_SIZE, target_dtype=torch.float4_e2m1fn_x2
+            ),
+            x,
+            BLOCK_SIZE,
+        )
+
+        # TODO(future PR): make to_mx return float4 directly
+        assert y_d0.dtype == torch.uint8
+        assert s_d0.dtype == torch.float8_e8m0fnu
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
+    elif mode == "dim0_mxfp8_rceil":
         to_mx_dim0_reference_c = torch.compile(to_mx_dim0_reference)
         y_d0, s_d0 = to_mx_dim0_reference_c(x, BLOCK_SIZE, ScaleCalculationMode.RCEIL)

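Why the mxfp4 branch can reuse bytes_per_el_fp8 for its write traffic (reasoning from the asserts above; the packing detail is an assumption about to_mx's current uint8 output): float4_e2m1fn_x2 packs two 4-bit values per byte, so y_d0 holds half as many uint8 elements as x holds bf16 elements, and one byte per element is the correct multiplier:

    M, K, BLOCK_SIZE = 16384, 16384, 32
    fp4_data_bytes = (M * K) // 2         # two e2m1 values per uint8 element
    scale_bytes = (M * K) // BLOCK_SIZE   # one e8m0 scale byte per block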
@@ -190,7 +222,7 @@ def run(
         bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)

-    elif mode == "dim1_mx_floor":
+    elif mode == "dim1_mxfp8_floor":
         to_mx_dim1_reference_c = torch.compile(to_mx_dim1_reference)
         y_d1, s_d1 = to_mx_dim1_reference_c(x, BLOCK_SIZE)

@@ -208,7 +240,7 @@ def run(
         bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)

-    elif mode == "dim1_mx_rceil":
+    elif mode == "dim1_mxfp8_rceil":
         to_mx_dim1_reference_c = torch.compile(to_mx_dim1_reference)
         y_d1, s_d1 = to_mx_dim1_reference_c(x, BLOCK_SIZE, ScaleCalculationMode.RCEIL)

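Throughout, bps is achieved memory bandwidth in bytes per second: total bytes read plus written, divided by the measured time converted from microseconds to seconds. A quick worked example:

    bytes_rw = 2**30                  # 1 GiB of total traffic
    time_us = 500.0                   # measured kernel time in microseconds
    bps = bytes_rw / (time_us / 1e6)  # ~2.15e12 bytes/s, i.e. ~2.15 TB/s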
@@ -226,7 +258,7 @@ def run(
         bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)

-    elif mode == "dim1_mx_triton_floor":
+    elif mode == "dim1_mxfp8_triton_floor":
         y_d1, s_d1 = triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE)

         for _ in range(2):
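A hedged correctness sketch for the fused Triton path (assumes triton_to_mxfp8_dim1 returns (data, scale) in the same layout as the reference helper; this diff does not assert that):

    y_ref, s_ref = to_mx_dim1_reference(x, BLOCK_SIZE)
    y_tri, s_tri = triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE)
    torch.testing.assert_close(
        y_tri.to(torch.float32), y_ref.to(torch.float32), atol=0, rtol=0
    )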
@@ -243,7 +275,7 @@ def run(
         bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)

-    elif mode == "dim1_mx_cuda_floor":
+    elif mode == "dim1_mxfp8_cuda_floor":
         from torchao.prototype import mxfp8_cuda

         _, y_d1, _, s_d1 = mxfp8_cuda.quantize(
@@ -269,7 +301,7 @@ def run(
         bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)

-    elif mode == "dim1_mx_cuda_rceil":
+    elif mode == "dim1_mxfp8_cuda_rceil":
         from torchao.prototype import mxfp8_cuda

         _, y_d1, _, s_d1 = mxfp8_cuda.quantize(