2 | 2 | import triton |
3 | 3 | import triton.language as tl |
4 | 4 |
| 5 | +from triton_kernels_benchmark import Benchmark, do_bench, perf_report |
| 6 | + |
| 7 | +TYPES = { |
| 8 | + tl.float8e4nv: torch.float8_e4m3fn, tl.float8e5: torch.float8_e5m2, tl.float16: torch.float16, tl.bfloat16: |
| 9 | + torch.bfloat16, tl.float32: torch.float32 |
| 10 | +} |
| 11 | + |
5 | 12 |
6 | 13 | @triton.jit |
7 | | -def float_trunc_kernel( |
| 14 | +def float_conversion_kernel( |
8 | 15 | x_ptr, |
| 16 | + y_ptr, |
9 | 17 | n_elements, |
10 | 18 | BLOCK_SIZE: tl.constexpr, |
11 | | - target_type: tl.constexpr, |
| 19 | + x_type: tl.constexpr, |
| 20 | + y_type: tl.constexpr, |
| 21 | + rnd: tl.constexpr, |
12 | 22 | ): |
13 | 23 | pid = tl.program_id(axis=0) |
14 | 24 | block_start = pid * BLOCK_SIZE |
15 | 25 | offsets = block_start + tl.arange(0, BLOCK_SIZE) |
16 | 26 | mask = offsets < n_elements |
| 27 | + x_itype = tl.int8 if x_type.itemsize == 1 else tl.int16 if x_type.itemsize == 2 else tl.int32 |
| 28 | + y_itype = tl.int8 if y_type.itemsize == 1 else tl.int16 if y_type.itemsize == 2 else tl.int32 |
17 | 29 |
18 | 30 | x = tl.load(x_ptr + offsets, mask=mask) |
| 31 | + converted = x.to(y_type, fp_downcast_rounding=rnd) |
| 32 | + x = tl.cast(x, x_itype, bitcast=True) |
| 33 | + y = tl.cast(converted, y_itype, bitcast=True) |
| 34 | + for i in range(99): |
| 35 | + x += tl.full(x.shape, i, x_itype) |
| 36 | + converted = tl.cast(x, x_type, bitcast=True).to(y_type, fp_downcast_rounding=rnd) |
| 37 | + y += tl.cast(converted, y_itype, bitcast=True) |
| 38 | + y = tl.cast(y, y_type, bitcast=True) |
| 39 | + tl.store(y_ptr + offsets, y, mask=mask) |
| 40 | + |
| 41 | + |
| 42 | +def get_bench(x_type, y_type): |
| 43 | + assert x_type.itemsize < y_type.itemsize |
| 44 | + plot_name = f'{x_type}-{y_type}' |
| 45 | + line_vals = [(x_type, y_type, None), (y_type, x_type, 'rtne')] |
| 46 | + line_names = [f'{x_type}->{y_type}', f'{y_type}->{x_type}-rtne'] |
| 47 | + if y_type == tl.float32: |
| 48 | + line_vals.append((y_type, x_type, 'rtz')) |
| 49 | + line_names.append(f'{y_type}->{x_type}-rtz') |
| 50 | + |
| 51 | + @perf_report( |
| 52 | + Benchmark( |
| 53 | + x_names=['N'], |
| 54 | + x_vals=[2**i for i in range(12, 28, 2)], |
| 55 | + line_arg='args', |
| 56 | + line_vals=line_vals, |
| 57 | + line_names=line_names, |
| 58 | + styles=[(c, s) for c in 'bgry' for s in ('-', '--', '-.', ':')], |
| 59 | + ylabel=('GB/s', ), |
| 60 | + plot_name=plot_name, |
| 61 | + args={}, |
| 62 | + )) |
| 63 | + def bench(N, args): |
| 64 | + quantiles = [0.5, 0.2, 0.8] |
| 65 | + x_type = args[0] |
| 66 | + y_type = args[1] |
| 67 | + if x_type.itemsize == 1: |
| 68 | + x = torch.rand(N, dtype=torch.float16, device='xpu', requires_grad=True).to(TYPES[x_type]) |
| 69 | + else: |
| 70 | + x = torch.rand(N, dtype=TYPES[x_type], device='xpu', requires_grad=True) |
| 71 | + y = torch.empty_like(x, dtype=TYPES[y_type], device='xpu') |
| 72 | + rnd = args[2] if x_type.itemsize > y_type.itemsize else None |
| 73 | + |
| 74 | + def fwd(): |
| 75 | + BLOCK_SIZE = 4096 |
| 76 | + grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']), ) |
| 77 | + float_conversion_kernel[grid](x, y, N, BLOCK_SIZE, x_type, y_type, rnd) |
| 78 | + return x |
| 79 | + |
| 80 | + _, min_ms, max_ms, mean_ms, cv = do_bench(fwd, n_warmup=10, n_repeat=10, quantiles=quantiles) |
| 81 | + gbps = lambda ms: (N * x.element_size() * 1e-9) / (ms * 1e-3) |
| 82 | + return (gbps(mean_ms), gbps(max_ms), gbps(min_ms)), cv |
| 83 | + |
| 84 | + return bench |
| 85 | + |
| 86 | + |
| 87 | +def get_benchmarks(): |
| 88 | + return [get_bench(s, t) for s in TYPES for t in TYPES if s.itemsize < t.itemsize] |
| 89 | + |
19 | 90 |
20 | | - as_target = x.to(target_type) |
21 | | - as_f32 = as_target.to(tl.float32) |
22 | | - for _ in range(100): |
23 | | - as_f32 += 1 # plus one ensures that there are no redundant conversions that can be removed |
24 | | - as_target = as_f32.to(target_type) |
25 | | - as_f32 = as_target.to(tl.float32) |
26 | | - |
27 | | - tl.store(x_ptr + offsets, as_f32, mask=mask) |
28 | | - |
29 | | - |
30 | | -def launch_conversion(x: torch.Tensor, target_type: type): |
31 | | - assert x.is_xpu |
32 | | - n_elements = x.numel() |
33 | | - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), ) |
34 | | - float_trunc_kernel[grid](x, n_elements, BLOCK_SIZE=1024, target_type=target_type) |
35 | | - return x |
36 | | - |
37 | | - |
38 | | -@triton.testing.perf_report( |
39 | | - triton.testing.Benchmark( |
40 | | - x_names=['N'], |
41 | | - x_vals=[2**i for i in range(12, 28, 2)], |
42 | | - line_arg='target_type', |
43 | | - line_vals=['bfloat16', 'float16'], |
44 | | - line_names=['BF16', 'FP16'], |
45 | | - styles=[('blue', '-'), ('green', '-'), ('orange', '-')], |
46 | | - ylabel='GB/s', |
47 | | - plot_name='float-conversion', |
48 | | - args={}, |
49 | | - )) |
50 | | -def benchmark(N, target_type): |
51 | | - quantiles = [0.5, 0.2, 0.8] |
52 | | - inputs = torch.rand(N, dtype=torch.float32, device='xpu', requires_grad=True) |
53 | | - |
54 | | - if target_type == 'bfloat16': |
55 | | - fwd = lambda: launch_conversion(inputs, tl.bfloat16) |
56 | | - elif target_type == 'float16': |
57 | | - fwd = lambda: launch_conversion(inputs, tl.float16) |
58 | | - else: |
59 | | - raise NotImplementedError(f'Type {target_type} is not supported') |
60 | | - |
61 | | - ms, min_ms, max_ms = triton.testing.do_bench(fwd, quantiles=quantiles) |
62 | | - gbps = lambda ms: (inputs.numel() * inputs.element_size() * 1e-9) / (ms * 1e-3) |
63 | | - |
64 | | - return gbps(ms), gbps(max_ms), gbps(min_ms) |
| 91 | +def run_benchmarks(): |
| 92 | + for bench in get_benchmarks(): |
| 93 | + bench.run(print_data=True) |
65 | 94 |
66 | 95 |
67 | 96 | if __name__ == '__main__': |
68 | | - benchmark.run(print_data=True) |
| 97 | + run_benchmarks() |
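
For reference, here is a minimal sketch of launching the new `float_conversion_kernel` once, outside the `perf_report` harness. It is illustrative only and not part of the change: it assumes an XPU-enabled PyTorch build and that the kernel is importable from the patched benchmark module (the module path is not shown in this diff). Note that the kernel performs 100 perturbed conversions per element, so the output is benchmark work rather than a plain cast of the input.

```python
# Hypothetical smoke-test driver (not part of the patch): launch
# float_conversion_kernel once for a float32 -> bfloat16 downcast.
# float_conversion_kernel is assumed to be in scope, e.g. imported
# from the patched benchmark module.
import torch
import triton
import triton.language as tl

N = 1 << 20
BLOCK_SIZE = 4096

x = torch.rand(N, dtype=torch.float32, device='xpu')
y = torch.empty(N, dtype=torch.bfloat16, device='xpu')

# Downcasts take an explicit rounding mode ('rtne' or 'rtz'); an upcast
# (e.g. bfloat16 -> float32) would pass None instead, as bench() does.
grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']), )
float_conversion_kernel[grid](x, y, N, BLOCK_SIZE, tl.float32, tl.bfloat16, 'rtne')

# y holds the kernel's accumulated result (100 bit-perturbed conversions),
# which is only meaningful as benchmark work, not as a reference cast of x.
print(y.dtype, y.shape)
```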