import torch
import triton
import triton.language as tl
+from torch.library import triton_op, wrap_triton

from torchao.prototype.moe_training.utils import (
    _is_column_major,
@@ -119,7 +120,7 @@ def triton_fp8_gemm_1x128_128x128(
        triton.cdiv(M, META["BLOCK_SIZE_M"]),
        triton.cdiv(N, META["BLOCK_SIZE_N"]),
    )
-    triton_fp8_gemm_1x128_128x128_kernel[grid](
+    wrap_triton(triton_fp8_gemm_1x128_128x128_kernel)[grid](
        a,
        a.stride(0),
        a.stride(1),
@@ -234,7 +235,7 @@ def triton_fp8_gemm_1x128_128x1(
        triton.cdiv(M, META["BLOCK_SIZE_M"]),
        triton.cdiv(N, META["BLOCK_SIZE_N"]),
    )
-    triton_fp8_gemm_1x128_128x1_kernel[grid](
+    wrap_triton(triton_fp8_gemm_1x128_128x1_kernel)[grid](
        a,
        a.stride(0),
        a.stride(1),
@@ -281,7 +282,7 @@ def triton_fp8_gemm_1x128_128x1(

@triton.autotune(configs=quant_kernel_configs_with_groups, key=["K"])
@triton.jit
-def fp8_blockwise_act_quant_lhs_kernel(
+def triton_fp8_blockwise_act_quant_lhs_kernel(
    x_ptr,
    x_stride_dim_0,
    x_stride_dim_1,
@@ -327,7 +328,8 @@ def fp8_blockwise_act_quant_lhs_kernel(
    tl.store(s_ptr + scale_offs, tl.div_rn(1.0, scale))


-def fp8_blockwise_act_quant_lhs(
+@triton_op("torchao::triton_fp8_blockwise_act_quant_lhs", mutates_args={})
+def triton_fp8_blockwise_act_quant_lhs(
    x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
@@ -352,7 +354,7 @@ def fp8_blockwise_act_quant_lhs(
        triton.cdiv(M, meta["NUM_GROUPS"]),
        triton.cdiv(K, meta["BLOCK_SIZE"]),
    )
-    fp8_blockwise_act_quant_lhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_act_quant_lhs_kernel)[grid](
        x,
        x.stride(0),
        x.stride(1),
@@ -372,7 +374,7 @@ def fp8_blockwise_act_quant_lhs(

@triton.autotune(configs=quant_kernel_configs_with_groups, key=["K"])
@triton.jit
-def fp8_blockwise_act_quant_rhs_kernel(
+def triton_fp8_blockwise_act_quant_rhs_kernel(
    x_ptr,
    x_stride_dim_0,
    x_stride_dim_1,
@@ -420,7 +422,8 @@ def fp8_blockwise_act_quant_rhs_kernel(
    tl.store(s_ptr + scale_offs, tl.div_rn(1.0, scale))


-def fp8_blockwise_act_quant_rhs(
+@triton_op("torchao::triton_fp8_blockwise_act_quant_rhs", mutates_args={})
+def triton_fp8_blockwise_act_quant_rhs(
    x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
@@ -444,7 +447,7 @@ def fp8_blockwise_act_quant_rhs(
        triton.cdiv(M, meta["BLOCK_SIZE"]),
        triton.cdiv(K, meta["NUM_GROUPS"]),
    )
-    fp8_blockwise_act_quant_rhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_act_quant_rhs_kernel)[grid](
        x,
        x.stride(0),
        x.stride(1),
@@ -464,7 +467,7 @@ def fp8_blockwise_act_quant_rhs(

@triton.autotune(configs=quant_kernel_configs_with_groups, key=["K"])
@triton.jit
-def fp8_blockwise_act_quant_transposed_lhs_kernel(
+def triton_fp8_blockwise_act_quant_transposed_lhs_kernel(
    x_ptr,
    x_stride_dim_0,
    x_stride_dim_1,
@@ -524,7 +527,8 @@ def fp8_blockwise_act_quant_transposed_lhs_kernel(
    tl.store(s_ptr + scale_offs, tl.div_rn(1.0, scale), mask=scale_mask)


-def fp8_blockwise_act_quant_transposed_lhs(
+@triton_op("torchao::triton_fp8_blockwise_act_quant_transposed_lhs", mutates_args={})
+def triton_fp8_blockwise_act_quant_transposed_lhs(
    x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
) -> Tuple[torch.Tensor, torch.Tensor]:
    assert x.is_contiguous(), "Input tensor must be contiguous"
@@ -550,7 +554,7 @@ def fp8_blockwise_act_quant_transposed_lhs(
        triton.cdiv(K, meta["NUM_GROUPS"]),
    )

-    fp8_blockwise_act_quant_transposed_lhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_act_quant_transposed_lhs_kernel)[grid](
        x,
        x.stride(0),
        x.stride(1),
@@ -570,7 +574,7 @@ def fp8_blockwise_act_quant_transposed_lhs(

@triton.autotune(configs=quant_kernel_configs, key=["M", "N"])
@triton.jit
-def fp8_blockwise_weight_quant_rhs_kernel(
+def triton_fp8_blockwise_weight_quant_rhs_kernel(
    x_ptr,
    x_stride_dim_0,
    x_stride_dim_1,
@@ -615,8 +619,9 @@ def fp8_blockwise_weight_quant_rhs_kernel(
    tl.store(s_ptr + scale_m_off + scale_n_off, tl.div_rn(1.0, scale))


-def fp8_blockwise_weight_quant_rhs(
-    x: torch.Tensor, block_size: int = 128, dtype=torch.float8_e4m3fn
+@triton_op("torchao::triton_fp8_blockwise_weight_quant_rhs", mutates_args={})
+def triton_fp8_blockwise_weight_quant_rhs(
+    x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
) -> Tuple[torch.Tensor, torch.Tensor]:
    assert x.is_contiguous(), "Input tensor must be contiguous"
    assert x.dim() == 2, "Input tensor must have 2 dimensions"
@@ -638,7 +643,7 @@ def fp8_blockwise_weight_quant_rhs(
        triton.cdiv(M, meta["BLOCK_SIZE"]),
        triton.cdiv(N, meta["BLOCK_SIZE"]),
    )
-    fp8_blockwise_weight_quant_rhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_weight_quant_rhs_kernel)[grid](
        x,
        x.stride(0),
        x.stride(1),
@@ -658,7 +663,7 @@ def fp8_blockwise_weight_quant_rhs(

@triton.autotune(configs=quant_kernel_configs, key=["M", "N"])
@triton.jit
-def fp8_blockwise_weight_quant_transposed_rhs_kernel(
+def triton_fp8_blockwise_weight_quant_transposed_rhs_kernel(
    x_ptr,
    x_stride_dim_0,
    x_stride_dim_1,
@@ -719,8 +724,9 @@ def fp8_blockwise_weight_quant_transposed_rhs_kernel(
    tl.store(s_ptr + scale_offs, tl.div_rn(1.0, scale), mask=scale_mask)


-def fp8_blockwise_weight_quant_transposed_rhs(
-    x: torch.Tensor, block_size: int = 128, dtype=torch.float8_e4m3fn
+@triton_op("torchao::triton_fp8_blockwise_weight_quant_transposed_rhs", mutates_args={})
+def triton_fp8_blockwise_weight_quant_transposed_rhs(
+    x: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
) -> Tuple[torch.Tensor, torch.Tensor]:
    assert x.is_contiguous(), "Input tensor must be contiguous"
    assert x.dim() == 2, "Input tensor must have 2 dimensions"
@@ -742,7 +748,7 @@ def fp8_blockwise_weight_quant_transposed_rhs(
        triton.cdiv(M, meta["BLOCK_SIZE"]),
        triton.cdiv(N, meta["BLOCK_SIZE"]),
    )
-    fp8_blockwise_weight_quant_transposed_rhs_kernel[grid](
+    wrap_triton(triton_fp8_blockwise_weight_quant_transposed_rhs_kernel)[grid](
        x,
        x.stride(0),
        x.stride(1),
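
For reference, the pattern this change applies throughout the file is: register each Python launcher as a custom op with `@triton_op` and route the raw kernel launch through `wrap_triton`, so `torch.compile` can trace the op rather than graph-breaking on the Triton call. Below is a minimal, self-contained sketch of that pattern under illustrative assumptions; the kernel, the `mylib::scale` op name, and the block size are examples, not part of this patch.

```python
import torch
import triton
import triton.language as tl
from torch.library import triton_op, wrap_triton


@triton.jit
def _scale_kernel(x_ptr, out_ptr, factor, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance scales one BLOCK_SIZE chunk of the flattened input.
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x * factor, mask=mask)


# triton_op registers the wrapper as a custom op (here under a hypothetical
# "mylib" namespace), and wrap_triton makes the kernel launch visible to the
# tracer, so torch.compile can see through the call instead of breaking on it.
@triton_op("mylib::scale", mutates_args={})
def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    out = torch.empty_like(x)
    n_elements = x.numel()
    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    wrap_triton(_scale_kernel)[grid](x, out, factor, n_elements, BLOCK_SIZE=1024)
    return out


if __name__ == "__main__":
    x = torch.randn(4096, device="cuda")
    y = torch.compile(scale)(x, 2.0)
    torch.testing.assert_close(y, x * 2.0)
```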