import triton.language as tl
import pytest

+import pathlib
+import uuid
+from triton._internal_testing import is_cuda
+

def do_bench(kernel_call, quantiles, use_cuda_graph=False):
    if use_cuda_graph:
@@ -169,6 +173,203 @@ def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):
    assert records['capture_named_args']


+@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 9,
+                    reason="Requires compute capability >= 9 on NVIDIA")
+def test_override_ttir(device):
+    N = 1024
+    src = torch.randn(N, device=device)
+    dst = torch.empty(N, device=device)
+
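+    # Hand-written TTIR for the kernel below. Note the dense<1.000000e+01>
+    # constant and the arith.mulf: this override multiplies the input by 10,
+    # while the original Python kernel only copies src to dst, so the assert
+    # at the end of the test can only pass if the override was actually used.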
+    ir_src = r"""
+module {
+  tt.func public @_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<1.000000e+01> : tensor<32xf32>
+    %c32_i32 = arith.constant 32 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c32_i32 : i32
+    %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
+    %3 = tt.splat %1 : i32 -> tensor<32xi32>
+    %4 = arith.addi %3, %2 : tensor<32xi32>
+    %5 = tt.splat %arg2 : i32 -> tensor<32xi32>
+    %6 = arith.cmpi slt, %4, %5 : tensor<32xi32>
+    %7 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<32x!tt.ptr<f32>>
+    %8 = tt.addptr %7, %4 : tensor<32x!tt.ptr<f32>>, tensor<32xi32>
+    %9 = tt.load %8, %6 : tensor<32x!tt.ptr<f32>>
+    %10 = arith.mulf %9, %cst : tensor<32xf32>
+    %11 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<32x!tt.ptr<f32>>
+    %12 = tt.addptr %11, %4 : tensor<32x!tt.ptr<f32>>, tensor<32xi32>
+    tt.store %12, %10, %6 : tensor<32x!tt.ptr<f32>>
+    tt.return
+  }
+}
+    """
+    temp_file = pathlib.Path(f"/tmp/test_override_{uuid.uuid4()}.ttir")
+    temp_file.write_text(ir_src)
+
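+    # 'ir_override' points compilation at the dumped file; the file
+    # extension (.ttir here) selects which stage of the pipeline is replaced.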
+    configs = [triton.Config(kwargs={'BLOCK_SIZE': 32, 'ir_override': str(temp_file)})]
+
+    @triton.autotune(configs=configs, key=['N'], do_bench=do_bench)
+    @triton.jit
+    def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):
+        offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        x = tl.load(src + offsets, mask=offsets < N)
+        tl.store(dst + offsets, x, mask=offsets < N)
+
+    grid = lambda META: (triton.cdiv(N, META['BLOCK_SIZE']), )
+    _kernel[grid](dst, src, N=N)
+
+    # Change the behavior of the kernel by overriding its TTIR
+    torch.testing.assert_close(src * 10, dst)
+
+
+@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 9,
+                    reason="Requires compute capability >= 9 on NVIDIA")
+def test_override_ttgir(device):
+    N = 1024
+    src = torch.randn(N, device=device)
+    dst = torch.empty(N, device=device)
+
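+    # Same computation as the TTIR test, but written at the TTGIR level:
+    # every tensor now carries an explicit #blocked layout, and the module
+    # attributes (num-warps, target, threads-per-warp) must be consistent
+    # with how the kernel is compiled.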
+    ir_src = r"""
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<1.000000e+01> : tensor<32xf32, #blocked>
+    %c32_i32 = arith.constant 32 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c32_i32 : i32
+    %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked>
+    %3 = tt.splat %1 : i32 -> tensor<32xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<32xi32, #blocked>
+    %5 = tt.splat %arg2 : i32 -> tensor<32xi32, #blocked>
+    %6 = arith.cmpi slt, %4, %5 : tensor<32xi32, #blocked>
+    %7 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<32x!tt.ptr<f32>, #blocked>
+    %8 = tt.addptr %7, %4 : tensor<32x!tt.ptr<f32>, #blocked>, tensor<32xi32, #blocked>
+    %9 = tt.load %8, %6 : tensor<32x!tt.ptr<f32>, #blocked>
+    %10 = arith.mulf %9, %cst : tensor<32xf32, #blocked>
+    %11 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<32x!tt.ptr<f32>, #blocked>
+    %12 = tt.addptr %11, %4 : tensor<32x!tt.ptr<f32>, #blocked>, tensor<32xi32, #blocked>
+    tt.store %12, %10, %6 : tensor<32x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
+    """
+    temp_file = pathlib.Path(f"/tmp/test_override_{uuid.uuid4()}.ttgir")
+    temp_file.write_text(ir_src)
+
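+    # A .ttgir suffix substitutes the IR after the TTIR->TTGIR lowering,
+    # so the layouts and module attributes above are taken as-is.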
+    configs = [triton.Config(kwargs={'BLOCK_SIZE': 32, 'ir_override': str(temp_file)})]
+
+    @triton.autotune(configs=configs, key=['N'], do_bench=do_bench)
+    @triton.jit
+    def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):
+        offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        x = tl.load(src + offsets, mask=offsets < N)
+        tl.store(dst + offsets, x, mask=offsets < N)
+
+    grid = lambda META: (triton.cdiv(N, META['BLOCK_SIZE']), )
+    _kernel[grid](dst, src, N=N)
+
+    # Change the behavior of the kernel by overriding its TTGIR
+    torch.testing.assert_close(src * 10, dst)
+
+
+@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] != 9,
+                    reason="PTX file in this unit test is only for SM90")
+def test_override_ptx(device):
+    N = 1024
+    src = torch.randn(N, device=device)
+    dst = torch.empty(N, device=device)
+
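+    # Hand-written PTX for the compiled kernel, targeting sm_90a (hence the
+    # exact capability check above). The entry signature must match what
+    # Triton's launcher passes, including the extra trailing pointer
+    # parameter appended after the user-visible arguments.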
+    ir_src = r"""
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.7
+.target sm_90a
+.address_size 64
+
+    // .globl _kernel    // -- Begin function _kernel
+    // @_kernel
+.visible .entry _kernel(
+    .param .u64 .ptr .global .align 1 _kernel_param_0,
+    .param .u64 .ptr .global .align 1 _kernel_param_1,
+    .param .u32 _kernel_param_2,
+    .param .u64 .ptr .global .align 1 _kernel_param_3
+)
+.reqntid 128
+{
+    .reg .pred %p<4>;
+    .reg .b32 %r<10>;
+    .reg .b32 %f<3>;
+    .reg .b64 %rd<6>;
+    .loc 1 180 0
+$L__func_begin0:
+    .loc 1 180 0
+
+// %bb.0:
+    ld.param.u64 %rd3, [_kernel_param_0];
+    ld.param.u64 %rd4, [_kernel_param_1];
+$L__tmp0:
+    .loc 1 181 28
+    mov.u32 %r3, %ctaid.x;
+    .loc 1 181 33
+    shl.b32 %r4, %r3, 5;
+    ld.param.u32 %r5, [_kernel_param_2];
+    .loc 1 181 59
+    mov.u32 %r6, %tid.x;
+    and.b32 %r7, %r6, 31;
+    .loc 1 181 46
+    or.b32 %r8, %r4, %r7;
+    .loc 1 182 46
+    setp.lt.s32 %p1, %r8, %r5;
+    .loc 1 182 22
+    mul.wide.s32 %rd5, %r8, 4;
+    add.s64 %rd1, %rd4, %rd5;
+    .loc 1 182 16
+    // begin inline asm
+    mov.u32 %r1, 0x0;
+    @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];
+    // end inline asm
+    mov.b32 %f1, %r1;
+    .loc 1 183 12
+    mul.f32 %f2, %f1, 0f41200000;
+    .loc 1 184 19
+    add.s64 %rd2, %rd3, %rd5;
+    .loc 1 184 28
+    and.b32 %r9, %r6, 96;
+    setp.eq.s32 %p3, %r9, 0;
+    mov.b32 %r2, %f2;
+    and.pred %p2, %p3, %p1;
+    // begin inline asm
+    @%p2 st.global.b32 [ %rd2 + 0 ], { %r2 };
+    // end inline asm
+    .loc 1 184 4
+    ret;
+$L__tmp1:
+$L__func_end0:
+    // -- End function
+}
+    """
+    temp_file = pathlib.Path(f"/tmp/test_override_{uuid.uuid4()}.ptx")
+    temp_file.write_text(ir_src)
+
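+    # Note the kernel below already computes x * 10, and the PTX does the
+    # same (mul.f32 with 0f41200000, i.e. 10.0), so this test exercises the
+    # override path rather than changing the kernel's result.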
+    configs = [triton.Config(kwargs={'BLOCK_SIZE': 32, 'ir_override': str(temp_file)})]
+
+    @triton.autotune(configs=configs, key=['N'], do_bench=do_bench)
+    @triton.jit
+    def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):
+        offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        x = tl.load(src + offsets, mask=offsets < N)
+        x = x * 10
+        tl.store(dst + offsets, x, mask=offsets < N)
+
+    grid = lambda META: (triton.cdiv(N, META['BLOCK_SIZE']), )
+    _kernel[grid](dst, src, N=N)
+
+    # The kernel ran from the overridden PTX, which also computes x * 10
+    torch.testing.assert_close(src * 10, dst)
+
+
def test_exceed_tmem(device):
    if not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] == 10:
        pytest.skip("Test requires tensor memory.")