[Gluon] Add ttgl.get_num_warps metafunction (#8133)

Mogball · web-flow · commit f90255886173 · 2025-09-10T10:23:36.000-07:00
This is a function which returns a constexpr of the current contextual number of warps. This returns the value passed as `num_warps=` when called from a regular function, but this also works inside warp specialized regions and returns the number of warps in the current partition. cc @aeng-openai
diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py
@@ -2375,3 +2375,29 @@ def test_layout_zeros():
     # CHECK: #blocked = #ttg.blocked
     # CHECK: arith.constant dense<0.000000e+00> : tensor<128xf32, #blocked>
     ttgl.zeros([128], ttgl.float32, layout=ttgl.BlockedLayout([1], [32], [4], [0]))
+
+
+@gluon.jit
+def print_num_warps():
+    num_warps: ttgl.constexpr = ttgl.num_warps()
+    print("num_warps", num_warps)
+
+
+@filecheck_test
+@gluon.jit
+def test_get_num_warps():
+    # CHECK-LABEL: test_get_num_warps
+    # CHECK: tt.func private @{{.*}}print_num_warps
+    # CHECK-NEXT arith.constant 4 : i32
+
+    # CHECK: tt.func private @{{.*}}print_num_warps{{.*}}NW1
+    # CHECK-NEXT arith.constant 1 : i32
+
+    # CHECK: tt.func private @{{.*}}print_num_warps{{.*}}NW2
+    # CHECK-NEXT arith.constant 2 : i32
+
+    # CHECK: tt.func private @{{.*}}print_num_warps{{.*}}NW8
+    # CHECK-NEXT arith.constant 8 : i32
+    print_num_warps()
+    ttgl.warp_specialize((), print_num_warps, (), [print_num_warps, print_num_warps, print_num_warps], [1, 2, 8],
+                         [24, 24, 24])
diff --git a/python/triton/experimental/gluon/language/__init__.py b/python/triton/experimental/gluon/language/__init__.py
@@ -47,6 +47,7 @@
     expand_dims,
     full,
     gather,
+    num_warps,
     histogram,
     inline_asm_elementwise,
     join,
diff --git a/python/triton/experimental/gluon/language/_core.py b/python/triton/experimental/gluon/language/_core.py
@@ -501,6 +501,14 @@ def warp_specialize(default_args, default_partition, worker_args, worker_partiti
                                      worker_num_regs, _generator)
 
 
+@builtin
+def num_warps(_semantic=None, _generator=None):
+    """
+    Returns the number of warps that execute the current context, including in warp-specialized regions.
+    """
+    return _semantic.num_warps(_generator)
+
+
 @builtin
 def thread_barrier(_semantic=None):
     """
diff --git a/python/triton/experimental/gluon/language/_semantic.py b/python/triton/experimental/gluon/language/_semantic.py
@@ -427,3 +427,9 @@ def warp_specialize(self, default_args, default_partition, worker_args, worker_p
         if default_results is None:
             return
         return tuple(unflatten_ir_values(mlir_results, [r.type for r in default_results]))
+
+    def num_warps(self, generator):
+        if generator.caller_context is not None:
+            assert isinstance(generator.caller_context, GluonCallerContext)
+            return ttgl.constexpr(generator.caller_context.num_warps)
+        return ttgl.constexpr(self.builder.options.num_warps)