Commit b5381cf

[benchmarks] Reworked the conversion benchmark and added more tests for up/down casts
1 parent bae3356 commit b5381cf

3 files changed: +95 −57 lines
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-from .float_conversion import benchmark  # type: ignore  # noqa: F401
+from .float_conversion import bench_upcast, bench_downcast  # type: ignore  # noqa: F401
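With this re-export change, callers import the two perf-report objects instead of the single `benchmark`. A minimal sketch of an updated call site; the enclosing package path is not shown in the diff, so `conversion_benchmark` below is only a placeholder name:

# Hypothetical call site after this commit; `conversion_benchmark` stands in
# for the package whose __init__.py is edited above.
from conversion_benchmark import bench_upcast, bench_downcast

bench_upcast.run(print_data=True)    # sweeps the widening (up-cast) dtype pairs
bench_downcast.run(print_data=True)  # sweeps the narrowing (down-cast) pairs and rounding modes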
Lines changed: 92 additions & 55 deletions

@@ -1,68 +1,105 @@
+from functools import lru_cache
+
 import torch
 import triton
 import triton.language as tl
 
+TYPES = {
+    tl.float8e4nv: torch.float8_e4m3fn, tl.float8e5: torch.float8_e5m2, tl.float16: torch.float16, tl.bfloat16:
+    torch.bfloat16, tl.float32: torch.float32
+}
+UP_VALS = [(s, t) for s in TYPES for t in TYPES if s.itemsize < t.itemsize]
+DOWN_VALS = [(s, t, r) for s in TYPES for t in TYPES if s.itemsize > t.itemsize for r in ('rtne', 'rtz')
+             if r == 'rtne' or s == tl.float32]
+
+
+@lru_cache
+def _make_kernel(name):
+
+    def kernel(
+        x_ptr,
+        y_ptr,
+        n_elements,
+        BLOCK_SIZE: tl.constexpr,
+        x_type: tl.constexpr,
+        y_type: tl.constexpr,
+        rnd: tl.constexpr,
+    ):
+        pid = tl.program_id(axis=0)
+        block_start = pid * BLOCK_SIZE
+        offsets = block_start + tl.arange(0, BLOCK_SIZE)
+        mask = offsets < n_elements
+        x_itype = tl.int8 if x_type.itemsize == 1 else tl.int16 if x_type.itemsize == 2 else tl.int32
+        y_itype = tl.int8 if y_type.itemsize == 1 else tl.int16 if y_type.itemsize == 2 else tl.int32
+
+        x = tl.load(x_ptr + offsets, mask=mask)
+        converted = x.to(y_type, fp_downcast_rounding=rnd)
+        x = tl.cast(x, x_itype, bitcast=True)
+        y = tl.cast(converted, y_itype, bitcast=True)
+        for i in range(99):
+            x += tl.full(x.shape, i, x_itype)
+            converted = tl.cast(x, x_type, bitcast=True).to(y_type, fp_downcast_rounding=rnd)
+            y += tl.cast(converted, y_itype, bitcast=True)
+        y = tl.cast(y, y_type, bitcast=True)
+        tl.store(y_ptr + offsets, y, mask=mask)
+
+    kernel.__name__ = kernel.__qualname__ = name
+    return triton.jit(kernel)
 
-@triton.jit
-def float_trunc_kernel(
-    x_ptr,
-    n_elements,
-    BLOCK_SIZE: tl.constexpr,
-    target_type: tl.constexpr,
-):
-    pid = tl.program_id(axis=0)
-    block_start = pid * BLOCK_SIZE
-    offsets = block_start + tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_elements
-
-    x = tl.load(x_ptr + offsets, mask=mask)
-
-    as_target = x.to(target_type)
-    as_f32 = as_target.to(tl.float32)
-    for _ in range(100):
-        as_f32 += 1  # plus one ensures that there are no redundant conversions that can be removed
-        as_target = as_f32.to(target_type)
-        as_f32 = as_target.to(tl.float32)
-
-    tl.store(x_ptr + offsets, as_f32, mask=mask)
-
-
-def launch_conversion(x: torch.Tensor, target_type: type):
-    assert x.is_xpu
-    n_elements = x.numel()
-    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
-    float_trunc_kernel[grid](x, n_elements, BLOCK_SIZE=1024, target_type=target_type)
-    return x
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=['N'],
-        x_vals=[2**i for i in range(12, 28, 2)],
-        line_arg='target_type',
-        line_vals=['bfloat16', 'float16'],
-        line_names=['BF16', 'FP16'],
-        styles=[('blue', '-'), ('green', '-'), ('orange', '-')],
-        ylabel='GB/s',
-        plot_name='float-conversion',
-        args={},
-    ))
-def benchmark(N, target_type):
-    quantiles = [0.5, 0.2, 0.8]
-    inputs = torch.rand(N, dtype=torch.float32, device='xpu', requires_grad=True)
 
-    if target_type == 'bfloat16':
-        fwd = lambda: launch_conversion(inputs, tl.bfloat16)
-    elif target_type == 'float16':
-        fwd = lambda: launch_conversion(inputs, tl.float16)
+def _benchmark(N, args):
+    quantiles = [0.5, 0.2, 0.8]
+    x_type = args[0]
+    y_type = args[1]
+    if x_type.itemsize == 1:
+        x = torch.rand(N, dtype=torch.float16, device='xpu', requires_grad=True).to(TYPES[x_type])
     else:
-        raise NotImplementedError(f'Type {target_type} is not supported')
+        x = torch.rand(N, dtype=TYPES[x_type], device='xpu', requires_grad=True)
+    y = torch.empty_like(x, dtype=TYPES[y_type], device='xpu')
+    rnd = args[2] if x_type.itemsize > y_type.itemsize else None
+    name = f"{x_type}_to_{y_type}_conversion_kernel"
+    if rnd:
+        name = f"{rnd}_{name}"
+    kernel = _make_kernel(name)
 
-    ms, min_ms, max_ms = triton.testing.do_bench(fwd, quantiles=quantiles)
-    gbps = lambda ms: (inputs.numel() * inputs.element_size() * 1e-9) / (ms * 1e-3)
+    def fwd():
+        BLOCK_SIZE = 4096
+        grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']), )
+        kernel[grid](x, y, N, BLOCK_SIZE, x_type, y_type, rnd)
+        return x
 
+    ms, min_ms, max_ms = triton.testing.do_bench(fwd, quantiles=quantiles)
+    gbps = lambda ms: (N * x.element_size() * 1e-9) / (ms * 1e-3)
     return gbps(ms), gbps(max_ms), gbps(min_ms)
 
 
+def _report(plot_name, line_names, line_vals):
+    report = triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=['N'],
+            x_vals=[2**i for i in range(12, 28, 2)],
+            line_arg='args',
+            line_vals=line_vals,
+            line_names=line_names,
+            styles=[(c, s) for c in 'bgry' for s in ('-', '--', '-.', ':')],
+            ylabel='GB/s',
+            plot_name=plot_name,
+            args={},
+        ))
+    return report(_benchmark)
+
+
+bench_upcast = _report(
+    plot_name='float-upcast',
+    line_names=[f"{s}->{t}" for s, t in UP_VALS],
+    line_vals=UP_VALS,
+)
+bench_downcast = _report(
+    plot_name='float-downcast',
+    line_names=[f"{s}->{t}/{r}" for s, t, r in DOWN_VALS],
+    line_vals=DOWN_VALS,
+)
+
 if __name__ == '__main__':
-    benchmark.run(print_data=True)
+    bench_upcast.run(print_data=True)
+    bench_downcast.run(print_data=True)
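For reference, the `UP_VALS` and `DOWN_VALS` comprehensions above enumerate every ordered dtype pair by item size: a pair is an up-cast when the source is narrower than the target, and a down-cast otherwise, with the rounding mode fixed to 'rtne' except for float32 sources, which are also measured with 'rtz'. A plain-Python sketch of the same enumeration, assuming the Triton dtype item sizes are 1, 1, 2, 2 and 4 bytes, so it runs without Triton or an XPU:

# Illustrative only: reproduce the UP_VALS / DOWN_VALS enumeration in plain
# Python to list the conversion pairs the two benchmarks cover.
ITEMSIZE = {'float8e4nv': 1, 'float8e5': 1, 'float16': 2, 'bfloat16': 2, 'float32': 4}

up_vals = [(s, t) for s in ITEMSIZE for t in ITEMSIZE if ITEMSIZE[s] < ITEMSIZE[t]]
down_vals = [(s, t, r) for s in ITEMSIZE for t in ITEMSIZE if ITEMSIZE[s] > ITEMSIZE[t]
             for r in ('rtne', 'rtz') if r == 'rtne' or s == 'float32']

print(f"{len(up_vals)} up-cast lines, e.g. {up_vals[:2]}")
print(f"{len(down_vals)} down-cast lines, e.g. {down_vals[:2]}")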

benchmarks/micro_benchmarks/run_benchmarks.py

Lines changed: 2 additions & 1 deletion

@@ -12,5 +12,6 @@
     help='directory to save reports',
 )
 args = parser.parse_args()
-float_conversion.benchmark.run(print_data=True, save_path=args.reports)
+float_conversion.bench_upcast.run(print_data=True, save_path=args.reports)
+float_conversion.bench_downcast.run(print_data=True, save_path=args.reports)
 dot_scaled.benchmark.run(print_data=True, save_path=args.reports)
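The GB/s figures that this runner and the module's `__main__` block print come from the `gbps` lambda in `_benchmark`, which counts only the bytes of the input tensor. A worked example of that arithmetic under assumed numbers (2**26 is the largest point in the sweep; the 1 ms timing is hypothetical):

# Illustrative arithmetic for the gbps() formula in _benchmark; the timing
# value below is made up purely to show the unit conversion.
N = 2**26           # largest N in the sweep: x_vals = [2**i for i in range(12, 28, 2)]
element_size = 4    # bytes per element for a float32 source tensor
ms = 1.0            # hypothetical median kernel time from do_bench, in milliseconds

gbps = (N * element_size * 1e-9) / (ms * 1e-3)
print(f"{gbps:.1f} GB/s")  # ~268.4 GB/s for these assumed numbers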
