[Frontend][Gluon] Pass contextual num_warps down call graph (#7529)

Mogball · web-flow · commit b37bd6b6ba9f · 2025-07-16T11:04:44.000-07:00
Warp-specialization changes the number of contextual warps for certain
sections of code. This information needs to be propagated down the
callgraph as well. This PR adds an optional `caller_context` that can be
set and propagated to callees. `ttgl.warp_specialize` uses this to pass
down the number of worker warps.
diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py
@@ -419,6 +419,47 @@ def test_warp_specialize():
     ttgl.warp_specialize((pair, c, e), warp_specialize_worker0, (pair, c, e), [warp_specialize_worker1], [4], [48])
 
 
+@gluon.jit
+def ws_body(num_warps: ttgl.constexpr):
+    anchor(ttgl.arange(0, 128, layout=ttgl.BlockedLayout([1], [32], [num_warps], [0])))
+
+
+@gluon.jit
+def ws_test_default():
+    ws_body(4)
+
+
+@gluon.jit
+def ws_test_worker0():
+    ws_body(2)
+
+
+@gluon.jit
+def ws_test_worker1():
+    ws_body(1)
+
+
+@filecheck_test
+@gluon.jit
+def test_num_warps_caller_context():
+    # CHECK-DAG: [[BLOCKED_NW4:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+    # CHECK-DAG: [[BLOCKED_NW2:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>
+    # CHECK-DAG: [[BLOCKED_NW1:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
+
+    # CHECK: func private @{{.*}}ws_test_default{{.*}}() attributes {noinline = false}
+    # CHECK: func private @{{.*}}ws_body{{.*}}() attributes {noinline = false}
+    # CHECK: func private @{{.*}}anchor{{.*}}(%arg0: tensor<128xi32, [[BLOCKED_NW4]]>) attributes {noinline = false}
+
+    # CHECK: func private @{{.*}}ws_test_worker0{{.*}}_NW2() attributes {noinline = false, "ttg.num-warps" = 2 : i32}
+    # CHECK: func private @{{.*}}ws_body{{.*}}_NW2"() attributes {noinline = false, "ttg.num-warps" = 2 : i32}
+    # CHECK: func private @{{.*}}anchor{{.*}}_NW2(%arg0: tensor<128xi32, [[BLOCKED_NW2]]>) attributes {noinline = false, "ttg.num-warps" = 2 : i32}
+
+    # CHECK: func private @{{.*}}ws_test_worker1{{.*}}_NW1() attributes {noinline = false, "ttg.num-warps" = 1 : i32}
+    # CHECK: func private @{{.*}}ws_body{{.*}}_NW1"() attributes {noinline = false, "ttg.num-warps" = 1 : i32}
+    # CHECK: func private @{{.*}}anchor{{.*}}_NW1(%arg0: tensor<128xi32, [[BLOCKED_NW1]]>) attributes {noinline = false, "ttg.num-warps" = 1 : i32}
+    ttgl.warp_specialize((), ws_test_default, (), [ws_test_worker0, ws_test_worker1], [2, 1], [80, 80])
+
+
 @gluon.jit
 def mbarrier_kernel():
     bar = ttgl.allocate_shared_memory(ttgl.int64, [1], mbarrier.MBarrierLayout())
diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py
@@ -28,7 +28,7 @@ def check_identifier_legality(name, type):
     return name
 
 
-def mangle_fn(name, arg_tys, constants):
+def mangle_fn(name, arg_tys, constants, caller_context):
     # doesn't mangle ret type, which must be a function of arg tys
     mangled_arg_names = '_'.join([ty.mangle() for ty in arg_tys])
     mangled_constants = '_'.join([f'{i}c{repr(constants[i])}' for i in sorted(constants)])
@@ -37,6 +37,8 @@ def mangle_fn(name, arg_tys, constants):
     # [ and ] are not allowed in LLVM identifiers
     mangled_constants = mangled_constants.replace('[', '_').replace(']', '_')
     ret = f'{name}__{mangled_arg_names}__{mangled_constants}'
+    if caller_context is not None:
+        ret += caller_context.mangle()
     return ret
 
 
@@ -293,7 +295,7 @@ class CodeGenerator(ast.NodeVisitor):
 
     def __init__(self, context, prototype, gscope, function_name, jit_fn: JITFunction, options, codegen_fns, module_map,
                  module=None, is_kernel=False, function_types: Optional[Dict] = None, noinline=False,
-                 file_name: Optional[str] = None, begin_line=0):
+                 caller_context=None, file_name: Optional[str] = None, begin_line=0):
         self.context = context
         if jit_fn.is_gluon():
             from triton.experimental.gluon.language._semantic import GluonSemantic
@@ -339,6 +341,7 @@ def __init__(self, context, prototype, gscope, function_name, jit_fn: JITFunctio
         self.is_kernel = is_kernel
         self.cur_node = None
         self.noinline = noinline
+        self.caller_context = caller_context
         self.scf_stack = []
         self.ret_type = None
         # SSA-construction
@@ -570,6 +573,8 @@ def visit_FunctionDef(self, node):
         self.module.push_back(self.fn)
         entry = self.fn.add_entry_block()
         arg_values = self.prototype.deserialize(self.fn)
+        if self.caller_context is not None:
+            self.caller_context.initialize_callee(self.fn, self.builder)
         # bind arguments to symbols
         for arg_name, arg_value in zip(arg_names, arg_values):
             self.set_value(arg_name, arg_value)
@@ -1190,7 +1195,7 @@ def visit_Assert(self, node) -> Any:
         msg = self.visit(node.msg) if node.msg is not None else ""
         return language.core.device_assert(test, msg, _semantic=self.semantic)
 
-    def call_JitFunction(self, fn: JITFunction, args, kwargs):
+    def call_JitFunction(self, fn: JITFunction, args, kwargs, caller_context=None):
         args = inspect.getcallargs(fn.fn, *args, **kwargs)
         args = [args[name] for name in fn.arg_names]
         for i, arg in enumerate(args):
@@ -1201,7 +1206,8 @@ def call_JitFunction(self, fn: JITFunction, args, kwargs):
         args_path = find_paths_if(args, lambda _, x: not _is_constexpr(x))
         args_val = [get_iterable_path(args, path) for path in args_path]
         # mangle
-        fn_name = mangle_fn(get_full_name(fn), [arg.type for arg in args_val], args_cst)
+        caller_context = caller_context or self.caller_context
+        fn_name = mangle_fn(get_full_name(fn), [arg.type for arg in args_val], args_cst, caller_context)
         # generate function def if necessary
         if not self.module.has_function(fn_name):
             # If the callee is not set, we use the same debug setting as the caller
@@ -1216,7 +1222,7 @@ def call_JitFunction(self, fn: JITFunction, args, kwargs):
                                       function_name=fn_name, function_types=self.function_ret_types,
                                       noinline=fn.noinline, file_name=file_name, begin_line=begin_line,
                                       options=self.builder.options, codegen_fns=self.builder.codegen_fns,
-                                      module_map=self.builder.module_map)
+                                      module_map=self.builder.module_map, caller_context=caller_context)
             try:
                 generator.visit(fn.parse())
             except Exception as e:
diff --git a/python/triton/experimental/gluon/language/_semantic.py b/python/triton/experimental/gluon/language/_semantic.py
@@ -14,6 +14,18 @@ def _check(cond: bool, msg_fn: Callable[[], str], category=ValueError):
         raise category(msg_fn())
 
 
+class GluonCallerContext:
+
+    def __init__(self, num_warps: int):
+        self.num_warps = num_warps
+
+    def mangle(self):
+        return f"_NW{self.num_warps}"
+
+    def initialize_callee(self, fn, builder):
+        fn.set_attr("ttg.num-warps", builder.get_int32_attr(self.num_warps))
+
+
 class GluonSemantic(TritonSemantic[TensorTy]):
     tensor = ttgl.tensor
     lang = ttgl
@@ -319,10 +331,11 @@ def warp_specialize(self, default_args, default_partition, worker_args, worker_p
         partitions_op = builder.create_warp_specialize_partitions(num_partitions)
         arg_types = [arg.get_type() for arg in mlir_args]
         for i in range(num_partitions):
+            caller_context = GluonCallerContext(num_warps=worker_num_warps[i])
             block = builder.create_block_with_parent(partitions_op.get_region(i), arg_types)
             block_args = [block.get_argument(j) for j in range(len(mlir_args))]
             block_args = unflatten_ir_values(block_args, [arg.type for arg in worker_args])
-            generator.call_JitFunction(worker_partitions[i], block_args, kwargs={})
+            generator.call_JitFunction(worker_partitions[i], block_args, kwargs={}, caller_context=caller_context)
             builder.create_warp_return()
 
         builder.set_insertion_point_after(ws_op.get_operation())