Commit cbab5f4
[Gluon] Change gl.warp_specialize API (#8527)
Partition functions and their individual arguments are now passed together as a single list of (function, args) entries. All the arguments are simply appended together in MLIR, but the `WarpSpecializeOp::canonicalize` method will clean up duplicate arguments.
1 parent 7bdcc6b commit cbab5f4
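
For orientation, a minimal before/after sketch of one call site under the new API. The function names and argument tuple are lifted from the test_consan changes below; the comments are interpretation of this commit, not part of it.

# Old API: the default partition, its args, the worker args, and the worker
# function list were passed as separate parameters.
ttgl.warp_specialize((smem, bar, FAILURE, blocked_layout), ws_default,
                     (smem, bar, FAILURE, blocked_layout), [ws_1], [4], [32])

# New API: every partition, default first, is a (function, args) tuple in a
# single list; worker_num_warps and worker_num_regs appear to keep one entry
# per non-default partition. Repeating the same args tuple is fine because
# duplicate captures are removed by WarpSpecializeOp::canonicalize.
ttgl.warp_specialize([
    (ws_default, (smem, bar, FAILURE, blocked_layout)),
    (ws_1, (smem, bar, FAILURE, blocked_layout)),
], [4], [32])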

6 files changed: +134 additions, −80 deletions
python/examples/gluon/01-attention-forward.py

Lines changed: 7 additions & 6 deletions
@@ -840,12 +840,13 @@ def attention_kernel( #
 
     chnls = (q_chnl, kv_chnl, o_chnl, epi_chnl, s0_chnl, s1_chnl, c0_chnl, c1_chnl, exp_turnstile)
     descs = (desc_q, desc_k, desc_v, desc_o)
-    gl.warp_specialize((config, chnls, descs, M, STAGE), _attn_fwd_correction, (config, chnls, descs, M, STAGE), [
-        _attn_fwd_softmax0,
-        _attn_fwd_softmax1,
-        _attn_fwd_mma,
-        _attn_fwd_load,
-        _attn_fwd_epilogue,
+    gl.warp_specialize([
+        (_attn_fwd_correction, (config, chnls, descs, M, STAGE)),
+        (_attn_fwd_softmax0, (config, chnls, descs, M, STAGE)),
+        (_attn_fwd_softmax1, (config, chnls, descs, M, STAGE)),
+        (_attn_fwd_mma, (config, chnls, descs, M, STAGE)),
+        (_attn_fwd_load, (config, chnls, descs, M, STAGE)),
+        (_attn_fwd_epilogue, (config, chnls, descs, M, STAGE)),
     ], [4, 4, 1, 1, 1], [192, 192, 24, 24, 24])
 
     q_chnl.release()

python/test/gluon/test_consan.py

Lines changed: 70 additions & 27 deletions
@@ -742,8 +742,10 @@ def ws_kernel(output, FAILURE: ttgl.constexpr):
     bar = ttgl.allocate_shared_memory(ttgl.int64, [2, 1], mbarrier.MBarrierLayout())
     for i in range(2):
         mbarrier.init(bar.index(i), count=1)
-    ttgl.warp_specialize((smem, bar, FAILURE, blocked_layout), ws_default, (smem, bar, FAILURE, blocked_layout),
-                         [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (smem, bar, FAILURE, blocked_layout)),
+        (ws_1, (smem, bar, FAILURE, blocked_layout)),
+    ], [4], [32])
     mbarrier.wait(bar.index(1), phase=0)
     val = smem.index(0).load(blocked_layout)
     output_ptrs = output + ttgl.arange(0, XBLOCK, blocked_layout)
@@ -796,8 +798,10 @@ def ws_kernel(output, FAILURE: ttgl.constexpr):
     bar = ttgl.allocate_shared_memory(ttgl.int64, [2, 1], mbarrier.MBarrierLayout())
     for i in range(2):
         mbarrier.init(bar.index(i), count=1)
-    ttgl.warp_specialize((smem, bar, FAILURE, blocked_layout), ws_default, (smem, bar, FAILURE, blocked_layout),
-                         [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (smem, bar, FAILURE, blocked_layout)),
+        (ws_1, (smem, bar, FAILURE, blocked_layout)),
+    ], [4], [32])
     mbarrier.wait(bar.index(1), phase=0)
     val = smem.index(0).load(blocked_layout)
     output_ptrs = output + ttgl.arange(0, XBLOCK, blocked_layout)
@@ -859,8 +863,11 @@ def kernel(output, MISSING_BAR: ttgl.constexpr):
     bar = ttgl.allocate_shared_memory(ttgl.int64, [3, 1], mbarrier.MBarrierLayout())
     for i in range(3):
         mbarrier.init(bar.index(i), count=1)
-    ttgl.warp_specialize((smem, bar, MISSING_BAR, blocked_layout), ws_default,
-                         (smem, bar, MISSING_BAR, blocked_layout), [ws_1, ws_2], [4, 4], [32, 32])
+    ttgl.warp_specialize([
+        (ws_default, (smem, bar, MISSING_BAR, blocked_layout)),
+        (ws_1, (smem, bar, MISSING_BAR, blocked_layout)),
+        (ws_2, (smem, bar, MISSING_BAR, blocked_layout)),
+    ], [4, 4], [32, 32])
     mbarrier.wait(bar.index(2), phase=0)
     val = smem.index(0).load(blocked_layout)
     output_ptrs = output + ttgl.arange(0, XBLOCK, blocked_layout)
@@ -919,8 +926,11 @@ def kernel(output, FAILURE: ttgl.constexpr):
     bar = ttgl.allocate_shared_memory(ttgl.int64, [2, 1], mbarrier.MBarrierLayout())
     mbarrier.init(bar.index(0), count=2)
     mbarrier.init(bar.index(1), count=1)
-    ttgl.warp_specialize((smem, bar, FAILURE, blocked_layout), ws_default, (smem, bar, FAILURE, blocked_layout),
-                         [ws_1, ws_2], [4, 4], [32, 32])
+    ttgl.warp_specialize([
+        (ws_default, (smem, bar, FAILURE, blocked_layout)),
+        (ws_1, (smem, bar, FAILURE, blocked_layout)),
+        (ws_2, (smem, bar, FAILURE, blocked_layout)),
+    ], [4, 4], [32, 32])
     mbarrier.wait(bar.index(1), phase=0)
     val = smem.index(0).load(blocked_layout)
     output_ptrs = output + ttgl.arange(0, XBLOCK, blocked_layout)
@@ -1007,8 +1017,11 @@ def kernel(output, MISSING_BAR: ttgl.constexpr):
     mbarrier.arrive(bar.index(2), count=1)
     mbarrier.arrive(bar.index(3), count=1)
 
-    ttgl.warp_specialize((smem, bar, MISSING_BAR, blocked_layout), ws_default,
-                         (smem, bar, MISSING_BAR, blocked_layout), [ws_1, ws_2], [4, 4], [32, 32])
+    ttgl.warp_specialize([
+        (ws_default, (smem, bar, MISSING_BAR, blocked_layout)),
+        (ws_1, (smem, bar, MISSING_BAR, blocked_layout)),
+        (ws_2, (smem, bar, MISSING_BAR, blocked_layout)),
+    ], [4, 4], [32, 32])
 
     output = torch.empty((XBLOCK, ), device=device, dtype=torch.float16)
     kernel[(1, )](output, MISSING_BAR=MISSING_BAR, num_warps=4)
@@ -1072,8 +1085,10 @@ def kernel(output, FAILURE: ttgl.constexpr):
 
     mbarrier.arrive(bar.index(2), count=1)
 
-    ttgl.warp_specialize((smem, bar, FAILURE, blocked_layout), ws_default, (smem, bar, FAILURE, blocked_layout),
-                         [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (smem, bar, FAILURE, blocked_layout)),
+        (ws_1, (smem, bar, FAILURE, blocked_layout)),
+    ], [4], [32])
 
     output = torch.empty((XBLOCK, ), device=device, dtype=torch.float16)
     kernel[(1, )](output, FAILURE=FAILURE, num_warps=4)
@@ -1160,8 +1175,12 @@ def kernel(output, MISSING_BAR: ttgl.constexpr):
     mbarrier.arrive(bar.index(2), count=2)
     mbarrier.arrive(bar.index(3), count=2)
 
-    ttgl.warp_specialize((smem, bar, MISSING_BAR, blocked_layout), ws_default,
-                         (smem, bar, MISSING_BAR, blocked_layout), [ws_1, ws_2, ws_3], [4, 4, 4], [32, 32, 32])
+    ttgl.warp_specialize([
+        (ws_default, (smem, bar, MISSING_BAR, blocked_layout)),
+        (ws_1, (smem, bar, MISSING_BAR, blocked_layout)),
+        (ws_2, (smem, bar, MISSING_BAR, blocked_layout)),
+        (ws_3, (smem, bar, MISSING_BAR, blocked_layout)),
+    ], [4, 4, 4], [32, 32, 32])
 
     output = torch.empty((XBLOCK, ), device=device, dtype=torch.float16)
     kernel[(1, )](output, MISSING_BAR=MISSING_BAR, num_warps=4)
@@ -1225,8 +1244,11 @@ def kernel(output, MISSING_BAR: ttgl.constexpr):
     bar = ttgl.allocate_shared_memory(ttgl.int64, [3, 1], mbarrier.MBarrierLayout())
    for i in range(3):
         mbarrier.init(bar.index(i), count=1)
-    ttgl.warp_specialize((smem, bar, MISSING_BAR), ws_default, (smem, bar, MISSING_BAR), [ws_1, ws_2], [2, 8],
-                         [32, 32])
+    ttgl.warp_specialize([
+        (ws_default, (smem, bar, MISSING_BAR)),
+        (ws_1, (smem, bar, MISSING_BAR)),
+        (ws_2, (smem, bar, MISSING_BAR)),
+    ], [2, 8], [32, 32])
     mbarrier.wait(bar.index(2), phase=0)
     val = smem.index(0).load(blocked_layout)
     output_ptrs = output + ttgl.arange(0, XBLOCK, blocked_layout)
@@ -1291,8 +1313,10 @@ def kernel(input, FAILURE: ttgl.constexpr):
     smem = ttgl.allocate_shared_memory(ttgl.float16, [4, XBLOCK], smem_layout)
     blocked_layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[XBLOCK], threads_per_warp=[32],
                                                         warps_per_cta=[4], order=[0])
-    ttgl.warp_specialize((input, smem, FAILURE, blocked_layout, 0), ws_prog,
-                         (input, smem, FAILURE, blocked_layout, 2), [ws_prog], [4], [32])
+    ttgl.warp_specialize([
+        (ws_prog, (input, smem, FAILURE, blocked_layout, 0)),
+        (ws_prog, (input, smem, FAILURE, blocked_layout, 2)),
+    ], [4], [32])
 
     input = torch.randn((XBLOCK, ), device=device, dtype=torch.float16)
     kernel[(1, )](input, FAILURE=FAILURE, num_warps=4)
@@ -1346,8 +1370,10 @@ def kernel(input, FAILURE: ttgl.constexpr):
     smem = ttgl.allocate_shared_memory(ttgl.float16, [2, XBLOCK], smem_layout)
     bar = ttgl.allocate_shared_memory(ttgl.int64, [1, 1], mbarrier.MBarrierLayout())
     mbarrier.init(bar.index(0), count=1)
-    ttgl.warp_specialize((input, smem, bar, FAILURE, blocked_layout), ws_default,
-                         (input, smem, bar, FAILURE, blocked_layout), [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (input, smem, bar, FAILURE, blocked_layout)),
+        (ws_1, (input, smem, bar, FAILURE, blocked_layout)),
+    ], [4], [32])
 
     input = torch.randn((XBLOCK, ), device=device, dtype=torch.float16)
     kernel[(1, )](input, FAILURE=FAILURE, num_warps=4)
@@ -1402,8 +1428,10 @@ def kernel(FAILURE: ttgl.constexpr):
     smem = ttgl.allocate_shared_memory(ttgl.float16, [2, XBLOCK, XBLOCK], smem_layout)
     bar = ttgl.allocate_shared_memory(ttgl.int64, [1, 1], mbarrier.MBarrierLayout())
     mbarrier.init(bar.index(0), count=1)
-    ttgl.warp_specialize((smem, bar, FAILURE, blocked_layout, mma_layout), ws_default,
-                         (smem, bar, FAILURE, blocked_layout), [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (smem, bar, FAILURE, blocked_layout, mma_layout)),
+        (ws_1, (smem, bar, FAILURE, blocked_layout)),
+    ], [4], [32])
 
     kernel[(1, )](FAILURE=FAILURE, num_warps=4)
 
@@ -1438,7 +1466,10 @@ def kernel():
     bar = ttgl.allocate_shared_memory(ttgl.int64, [2, 1], mbarrier.MBarrierLayout())
     mbarrier.init(bar.index(0), count=1)
     mbarrier.init(bar.index(1), count=1)
-    ttgl.warp_specialize((bar, ), ws_default, (bar, ), [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (bar, )),
+        (ws_1, (bar, )),
+    ], [4], [32])
 
     kernel[(1, )](num_warps=4)
 
@@ -1505,7 +1536,10 @@ def kernel():
     bar = ttgl.allocate_shared_memory(ttgl.int64, [2, 1], mbarrier.MBarrierLayout())
     mbarrier.init(bar.index(0), count=2)
     mbarrier.init(bar.index(1), count=2)
-    ttgl.warp_specialize((bar, ), ws_default, (bar, ), [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (bar, )),
+        (ws_1, (bar, )),
+    ], [4], [32])
 
     kernel[(1, )](num_warps=4)
 
@@ -1541,7 +1575,10 @@ def kernel():
     bar = ttgl.allocate_shared_memory(ttgl.int64, [1, 1], mbarrier.MBarrierLayout())
     mbarrier.init(bar.index(0), count=1)
     mbarrier.arrive(bar.index(0), count=1)
-    ttgl.warp_specialize((bar, ), ws_default, (bar, ), [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (bar, )),
+        (ws_1, (bar, )),
+    ], [4], [32])
 
     kernel[(1, )](num_warps=4)
 
@@ -1582,7 +1619,10 @@ def kernel(input_desc):
     bar = ttgl.allocate_shared_memory(ttgl.int64, [2, 1], mbarrier.MBarrierLayout())
     mbarrier.init(bar.index(0), count=1)
     mbarrier.init(bar.index(1), count=1)
-    ttgl.warp_specialize((input_desc, smem, bar), ws_default, (input_desc, smem, bar), [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (input_desc, smem, bar)),
+        (ws_1, (input_desc, smem, bar)),
+    ], [4], [32])
 
     input = torch.randn((XBLOCK, XBLOCK), device=device, dtype=torch.float16)
     shared_layout = ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=16, rank=2)
@@ -1621,6 +1661,9 @@ def kernel():
     bar = ttgl.allocate_shared_memory(ttgl.int64, [2, 1], mbarrier.MBarrierLayout())
     mbarrier.init(bar.index(0), count=1)
     mbarrier.init(bar.index(1), count=1)
-    ttgl.warp_specialize((bar, ), ws_default, (bar, ), [ws_1], [4], [32])
+    ttgl.warp_specialize([
+        (ws_default, (bar, )),
+        (ws_1, (bar, )),
+    ], [4], [32])
 
     kernel[(1, )](num_warps=4)

python/test/gluon/test_frontend.py

Lines changed: 24 additions & 10 deletions
@@ -466,17 +466,17 @@ def test_warp_specialize():
     # CHECK-NEXT: [[A:%.*]] = tt.make_range {end = 1 : i32, start = 0 : i32}
     # CHECK-NEXT: [[B:%.*]] = tt.make_range {end = 2 : i32, start = 0 : i32}
     # CHECK-NEXT: [[C:%.*]] = tt.make_range {end = 4 : i32, start = 0 : i32}
-    # CHECK-NEXT: [[OUTS:%.*]]:3 = ttg.warp_specialize([[A]], [[B]], [[C]]) {{.*}}requestedRegisters = array<i32: 24, 48>
+    # CHECK-NEXT: [[OUTS:%.*]]:3 = ttg.warp_specialize([[A]], [[B]], [[C]], [[A]], [[B]], [[C]]) {{.*}}requestedRegisters = array<i32: 24, 48>
     # CHECK-NEXT: default {
     # CHECK-NEXT: [[RESULTS:%.*]]:3 = tt.call @{{.*}}warp_specialize_default{{.*}}cconstexpr_42{{.*}}([[A]], [[B]], [[C]])
     # CHECK-NEXT: warp_yield [[RESULTS]]#0, [[RESULTS]]#1, [[RESULTS]]#2
     # CHECK-NEXT: }
-    # CHECK-NEXT: partition0(%arg0: tensor<1xi32, [[BLOCKED]]>, %arg1: tensor<2xi32, [[BLOCKED]]>, %arg2: tensor<4xi32, [[BLOCKED]]>) num_warps(4) {
+    # CHECK-NEXT: partition0(%arg0: tensor<1xi32, [[BLOCKED]]>, %arg1: tensor<2xi32, [[BLOCKED]]>, %arg2: tensor<4xi32, [[BLOCKED]]>, %arg3: tensor<1xi32, [[BLOCKED]]>, %arg4: tensor<2xi32, [[BLOCKED]]>, %arg5: tensor<4xi32, [[BLOCKED]]>) num_warps(4) {
     # CHECK-NEXT: call @{{.*}}warp_specialize_worker0{{.*}}cconstexpr_42{{.*}}(%arg0, %arg1, %arg2)
     # CHECK-NEXT: warp_return
     # CHECK-NEXT: }
-    # CHECK-NEXT: partition1(%arg0: tensor<1xi32, [[BLOCKED]]>, %arg1: tensor<2xi32, [[BLOCKED]]>, %arg2: tensor<4xi32, [[BLOCKED]]>) num_warps(4) {
-    # CHECK-NEXT: call @{{.*}}warp_specialize_worker1{{.*}}cconstexpr_42{{.*}}(%arg0, %arg1, %arg2)
+    # CHECK-NEXT: partition1(%arg0: tensor<1xi32, [[BLOCKED]]>, %arg1: tensor<2xi32, [[BLOCKED]]>, %arg2: tensor<4xi32, [[BLOCKED]]>, %arg3: tensor<1xi32, [[BLOCKED]]>, %arg4: tensor<2xi32, [[BLOCKED]]>, %arg5: tensor<4xi32, [[BLOCKED]]>) num_warps(4) {
+    # CHECK-NEXT: call @{{.*}}warp_specialize_worker1{{.*}}cconstexpr_42{{.*}}(%arg3, %arg4, %arg5)
     # CHECK-NEXT: warp_return
     # CHECK-NEXT: }
     # CHECK-NEXT: call @{{.*}}anchor{{.*}}([[OUTS]]#0)
@@ -487,14 +487,20 @@ def test_warp_specialize():
     c = ttgl.arange(0, 4, layout=layout)
     pair = Pair(a, b)
     e: ttgl.constexpr = 42
-    a, b = ttgl.warp_specialize((pair, c, e), warp_specialize_default, (pair, c, e),
-                                [warp_specialize_worker0, warp_specialize_worker1], [4, 4], [24, 48])
+    a, b = ttgl.warp_specialize([
+        (warp_specialize_default, (pair, c, e)),
+        (warp_specialize_worker0, (pair, c, e)),
+        (warp_specialize_worker1, (pair, c, e)),
+    ], [4, 4], [24, 48])
     anchor(a)
     anchor(b)
 
     # CHECK: ttg.warp_specialize([[A]], [[B]], [[C]])
     # CHECK: (tensor<1xi32, [[BLOCKED]]>, tensor<2xi32, [[BLOCKED]]>, tensor<4xi32, [[BLOCKED]]>) -> ()
-    ttgl.warp_specialize((pair, c, e), warp_specialize_worker0, (pair, c, e), [warp_specialize_worker1], [4], [48])
+    ttgl.warp_specialize([
+        (warp_specialize_worker0, (pair, c, e)),
+        (warp_specialize_worker1, (pair, c, e)),
+    ], [4], [48])
 
 
 @gluon.jit
@@ -535,7 +541,11 @@ def test_num_warps_caller_context():
     # CHECK: func private @{{.*}}ws_test_worker1{{.*}}_NW1() attributes {noinline = false, "ttg.num-warps" = 1 : i32}
     # CHECK: func private @{{.*}}ws_body{{.*}}_NW1"() attributes {noinline = false, "ttg.num-warps" = 1 : i32}
     # CHECK: func private @{{.*}}anchor{{.*}}_NW1(%arg0: tensor<128xi32, [[BLOCKED_NW1]]>) attributes {noinline = false, "ttg.num-warps" = 1 : i32}
-    ttgl.warp_specialize((), ws_test_default, (), [ws_test_worker0, ws_test_worker1], [2, 1], [80, 80])
+    ttgl.warp_specialize([
+        (ws_test_default, ()),
+        (ws_test_worker0, ()),
+        (ws_test_worker1, ()),
+    ], [2, 1], [80, 80])
 
 
 @gluon.jit
@@ -2913,8 +2923,12 @@ def test_get_num_warps():
     # CHECK: tt.func private @{{.*}}print_num_warps{{.*}}NW8
     # CHECK-NEXT arith.constant 8 : i32
     print_num_warps()
-    ttgl.warp_specialize((), print_num_warps, (), [print_num_warps, print_num_warps, print_num_warps], [1, 2, 8],
-                         [24, 24, 24])
+    ttgl.warp_specialize([
+        (print_num_warps, ()),
+        (print_num_warps, ()),
+        (print_num_warps, ()),
+        (print_num_warps, ()),
+    ], [1, 2, 8], [24, 24, 24])
 
 
 def test_mismatch_shape_and_layout_rank():

python/triton/experimental/gluon/language/_core.py

Lines changed: 3 additions & 8 deletions
@@ -493,16 +493,12 @@ def set_auto_layout(value, layout, _semantic=None):
 
 
 @builtin
-def warp_specialize(default_args, default_partition, worker_args, worker_partitions, worker_num_warps, worker_num_regs,
-                    _semantic=None, _generator=None):
+def warp_specialize(functions_and_args, worker_num_warps, worker_num_regs, _semantic=None, _generator=None):
     """
     Create a warp-specialized execution region, partitioning work across warps.
 
     Args:
-        default_args (List[Any]): Arguments for the default region.
-        default_partition (callable): Function to build the default execution region.
-        worker_args (List[Any]): Arguments for each warp partition.
-        worker_partitions (List[callable]): Functions for each warp partition.
+        functions_and_args (List[Tuple[Callable, Any]]): List of functions and arguments for each partition.
         worker_num_warps (List[int]): Number of warps per partition.
         worker_num_regs (List[int]): Number of registers per partition.
@@ -511,8 +507,7 @@ def warp_specialize(default_args, default_partition, worker_args, worker_partiti
     """
    worker_num_warps = [_unwrap_if_constexpr(w) for w in worker_num_warps]
     worker_num_regs = [_unwrap_if_constexpr(r) for r in worker_num_regs]
-    return _semantic.warp_specialize(default_args, default_partition, worker_args, worker_partitions, worker_num_warps,
-                                     worker_num_regs, _generator)
+    return _semantic.warp_specialize(functions_and_args, worker_num_warps, worker_num_regs, _generator)
 
 
 @builtin

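A usage note on the updated signature above. Reading the new docstring together with the call sites in this diff, the first (function, args) entry takes the place of the old default partition, and worker_num_warps / worker_num_regs keep one value per remaining partition, so a list of N entries pairs with lists of length N - 1. The test_get_num_warps change, for example, now reads:

ttgl.warp_specialize([
    (print_num_warps, ()),   # default partition
    (print_num_warps, ()),   # worker 0: 1 warp, 24 registers
    (print_num_warps, ()),   # worker 1: 2 warps, 24 registers
    (print_num_warps, ()),   # worker 2: 8 warps, 24 registers
], [1, 2, 8], [24, 24, 24])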