Commit d57cbee
[Gluon] Implement reductions (#7091)
1 parent 750cc53 commit d57cbee
8 files changed, +147 -6 lines changed


python/test/gluon/test_frontend.py

Lines changed: 76 additions & 0 deletions

@@ -1,4 +1,5 @@
 import expecttest
+from triton.runtime.jit import MockTensor
 import torch
 import pytest
 import re

@@ -705,3 +706,78 @@ def test_math(fresh_knobs):
 } loc(#loc)
 #loc = loc(unknown)
 """)
+
+
+@gluon.jit
+def pair_add(a0, a1, b0, b1):
+    return a0 + b0, a1 + b1
+
+
+@gluon.jit
+def reduce_kernel(out):
+    layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0])
+    a = ttgl.full([16, 16], 1, ttgl.float32, layout)
+    b = ttgl.full([16, 16], 2, ttgl.float32, layout)
+    s0 = ttgl.sum(a, 0)
+    ttgl.static_assert(s0.type.layout == ttgl.SliceLayout(0, layout))
+    s1 = ttgl.sum(a, 1)
+    ttgl.static_assert(s1.type.layout == ttgl.SliceLayout(1, layout))
+
+    scalar = ttgl.max(s0, 0)
+    ttgl.static_assert(scalar.type == ttgl.float32)
+
+    s1 = ttgl.convert_layout(s1, s0.type.layout)
+
+    pairs = ttgl.reduce((a, b), 0, pair_add)
+    ttgl.static_assert(pairs[0].type.layout == ttgl.SliceLayout(0, layout))
+    ttgl.static_assert(pairs[1].type.layout == ttgl.SliceLayout(0, layout))
+    result = scalar + s1 + pairs[0] + pairs[1]
+    tl.store(out + ttgl.arange(0, 16, s0.type.layout), result)
+
+
+def test_reduce(fresh_knobs):
+    knobs.compilation.disable_line_info = True
+
+    h = reduce_kernel.warmup(MockTensor(ttgl.float32), sanitize_overflow=False, grid=(1, ))
+    expecttest.assert_expected_inline(
+        anonymize_ir(h.asm["ttgir"]), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#loc = loc(unknown)
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @reduce_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false} {
+    %cst = arith.constant dense<2.000000e+00> : tensor<16x16xf32, #blocked> loc(#loc)
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<16x16xf32, #blocked> loc(#loc)
+    %0 = "tt.reduce"(%cst_0) <{axis = 0 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %12 = arith.addf %arg1, %arg2 : f32 loc(#loc)
+      tt.reduce.return %12 : f32 loc(#loc)
+    }) : (tensor<16x16xf32, #blocked>) -> tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %1 = "tt.reduce"(%cst_0) <{axis = 1 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %12 = arith.addf %arg1, %arg2 : f32 loc(#loc)
+      tt.reduce.return %12 : f32 loc(#loc)
+    }) : (tensor<16x16xf32, #blocked>) -> tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc)
+    %2 = "tt.reduce"(%0) <{axis = 0 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)):
+      %12 = arith.maxnumf %arg1, %arg2 : f32 loc(#loc)
+      tt.reduce.return %12 : f32 loc(#loc)
+    }) : (tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>>) -> f32 loc(#loc)
+    %3 = ttg.convert_layout %1 : tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %4:2 = "tt.reduce"(%cst_0, %cst) <{axis = 0 : i32}> ({
+    ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown), %arg3: f32 loc(unknown), %arg4: f32 loc(unknown)):
+      %12 = arith.addf %arg1, %arg3 : f32 loc(#loc)
+      %13 = arith.addf %arg2, %arg4 : f32 loc(#loc)
+      tt.reduce.return %12, %13 : f32, f32 loc(#loc)
+    }) : (tensor<16x16xf32, #blocked>, tensor<16x16xf32, #blocked>) -> (tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>>, tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>>) loc(#loc)
+    %5 = tt.splat %2 : f32 -> tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %6 = arith.addf %5, %3 : tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %7 = arith.addf %6, %4#0 : tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %8 = arith.addf %7, %4#1 : tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %9 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %10 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<16x!tt.ptr<f32>, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    %11 = tt.addptr %10, %9 : tensor<16x!tt.ptr<f32>, #ttg.slice<{dim = 0, parent = #blocked}>>, tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    tt.store %11, %8 : tensor<16x!tt.ptr<f32>, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
+    tt.return loc(#loc)
+  } loc(#loc)
+} loc(#loc)
+""")

python/triton/compiler/code_generator.py

Lines changed: 2 additions & 2 deletions

@@ -1464,7 +1464,7 @@ def ret(self, node: ast.Call):
     }


-def ast_to_ttir(fn, src, context, options, codegen_fns, module_map):
+def ast_to_ttir(fn, src, context, options, codegen_fns, module_map, module=None):
     arg_types = [None] * len(fn.arg_names)
     for k, v in src.signature.items():
         idx = fn.arg_names.index(k)

@@ -1479,7 +1479,7 @@ def ast_to_ttir(fn, src, context, options, codegen_fns, module_map):
     proxy = namedtuple("SpecializationProxy", ["constants", "signature"])(constants, signature)
     generator = CodeGenerator(context, prototype, gscope=fn.__globals__.copy(), function_name=fn.repr(proxy), jit_fn=fn,
                               is_kernel=True, file_name=file_name, begin_line=begin_line, options=options,
-                              codegen_fns=codegen_fns, module_map=module_map)
+                              codegen_fns=codegen_fns, module_map=module_map, module=module)
     generator.visit(fn.parse())
     ret = generator.module
     # module takes ownership of the context
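
The signature change lets a caller pass a pre-built module into ast_to_ttir instead of having the generator create one; CodeGenerator then populates that module. A minimal sketch of the pattern, with made-up names rather than Triton's API:

# Minimal sketch of the optional pre-built-module pattern (illustrative names only).
class Module:
    def __init__(self):
        self.attrs = {}

def lower(fn_name, module=None):
    if module is None:
        module = Module()  # default path: the lowering creates and owns a fresh module
    module.attrs.setdefault("functions", []).append(fn_name)
    return module

# Caller-supplied module: attributes set before lowering stay visible during it.
m = Module()
m.attrs["ttg.num-warps"] = 4
assert lower("reduce_kernel", module=m) is m and m.attrs["ttg.num-warps"] == 4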

python/triton/experimental/gluon/_runtime.py

Lines changed: 7 additions & 2 deletions

@@ -18,9 +18,11 @@ def __init__(self, fn, signature, constexprs=None, attrs=None) -> None:

     def make_ir(self, options, codegen_fns, module_map, context):
         from triton.compiler.compiler import make_backend
-        module = ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
-                             module_map=module_map)
+
         builder = ir.builder(context)
+        module = builder.create_module()
+
+        # Assign module attributes eagerly, as they are needed to verify layouts
         target = triton.runtime.driver.active.get_current_target()
         backend = make_backend(target)
         target = backend.get_target_name(options)

@@ -30,6 +32,9 @@ def make_ir(self, options, codegen_fns, module_map, context):
         module.set_attr("ttg.threads-per-warp", builder.get_int32_attr(32))
         if options.maxnreg is not None:
             module.set_attr("ttg.maxnreg", builder.get_int32_attr(options.maxnreg))
+
+        module = ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
+                             module_map=module_map, module=module)
         return module

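The comment explains the reordering: the ttg.* attributes must already be on the module when ast_to_ttir runs, because Gluon layouts are verified during code generation. A toy illustration of the kind of consistency check this enables (not Triton's actual verifier), using the values from the test:

# Toy stand-in for a layout/module consistency check (not Triton's real verifier).
def check_blocked_layout(threads_per_warp, warps_per_cta, module_attrs):
    threads = 1
    for t in threads_per_warp:
        threads *= t
    warps = 1
    for w in warps_per_cta:
        warps *= w
    assert threads == module_attrs["ttg.threads-per-warp"], "threads-per-warp mismatch"
    assert warps == module_attrs["ttg.num-warps"], "num-warps mismatch"

# Matches BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0]) in reduce_kernel and the
# attributes make_ir sets on the module before code generation runs.
check_blocked_layout([1, 32], [4, 1], {"ttg.num-warps": 4, "ttg.threads-per-warp": 32})
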
python/triton/experimental/gluon/language/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -4,12 +4,15 @@
 from ._layouts import __all__ as __layouts_all
 from ._math import *  # NOQA: F403
 from ._math import __all__ as __math_all
+from ._standard import *  # NOQA: F403
+from ._standard import __all__ as __standard_all

 from . import nvidia

 __all__ = [
     *__core_all,
     *__layouts_all,
     *__math_all,
+    *__standard_all,
     "nvidia",
 ]

python/triton/experimental/gluon/language/_core.py

Lines changed: 3 additions & 1 deletion

@@ -43,8 +43,10 @@

 _IMPORT_FROM_TRITON: List[str] = [
     "expand_dims",  # NOQA: F822
-    "program_id",  # NOQA: F822
     "load",  # NOQA: F822
+    "program_id",  # NOQA: F822
+    "reduce",  # NOQA: F822
+    "static_assert",  # NOQA: F822
     "store",  # NOQA: F822
     "to_tensor",  # NOQA: F822
 ]

python/triton/experimental/gluon/language/_semantic.py

Lines changed: 35 additions & 0 deletions

@@ -180,6 +180,41 @@ def memdesc_reinterpret(self, mem_desc, dtype, shape, layout):
         handle = self.builder.create_memdesc_reinterpret(ty.to_ir(self.builder), mem_desc.handle)
         return ttgl.shared_memory_descriptor(handle, **ty.__dict__)

+    def wrap_tensor(self, x, scalar_ty, ret_shape, layout):
+        if ret_shape:
+            res_ty = ttgl.distributed_type(scalar_ty, ret_shape, layout)
+        else:
+            res_ty = scalar_ty
+        return self.tensor(x, res_ty)
+
+    @staticmethod
+    def _check_same_layout(xs):
+        for x in xs:
+            _check(isinstance(x.type, ttgl.distributed_type), lambda: f"expected distributed_type but got: {x.type!r}")
+        layouts = [x.type.layout for x in xs]
+        l0 = layouts[0]
+        _check(all(l == l0 for l in layouts[1:]),
+               lambda: f"Expected inputs to have matching layouts, but got: {layouts}")
+
+    def reduction(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn) -> Tuple[TensorTy, ...]:
+        _check(axis is not None, lambda: "All-reduce is not yet implemented in gluon")
+        # get result shape
+        shape = inputs[0].type.shape
+        rank = len(shape)
+        _check(0 <= axis < rank, lambda: f"expected reduction axis to be in the range [0, {rank}) but got {axis}")
+        self._check_same_layout(inputs)
+        ret_shape = [s for i, s in enumerate(shape) if i != axis]
+        ret_layout = SliceLayout(axis, inputs[0].type.layout)
+        assert all(t.type.shape == shape for t in inputs), "all reduction inputs must have the same shape"
+
+        reduce_op = self.builder.create_reduce([t.handle for t in inputs], axis)
+        region_builder_fn(reduce_op)
+        assert reduce_op.verify()
+
+        return tuple(
+            self.wrap_tensor(reduce_op.get_result(i), inputs[i].type.scalar, ret_shape, ret_layout)
+            for i in range(len(inputs)))
+
     def warp_specialize(self, args, default_partition, worker_partitions, worker_num_warps: Sequence[int],
                         worker_num_regs: Sequence[int], generator):
         num_partitions = len(worker_partitions)
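
A small stand-alone check (hypothetical, mirroring the reduce_kernel test) of the shape bookkeeping in reduction(): reducing a [16, 16] input over axis 0 drops that axis, and reducing the 1-D result again leaves an empty ret_shape, which wrap_tensor turns into a plain scalar type.

# Hypothetical sanity check of the ret_shape computation used in reduction() above.
shape = [16, 16]
axis = 0
ret_shape = [s for i, s in enumerate(shape) if i != axis]
assert ret_shape == [16]  # the result carries SliceLayout(0, parent_layout)

# Reducing the 1-D result leaves an empty shape, so wrap_tensor returns a scalar,
# matching ttgl.static_assert(scalar.type == ttgl.float32) in reduce_kernel.
assert [s for i, s in enumerate(ret_shape) if i != 0] == []
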
python/triton/experimental/gluon/language/_standard.py

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+# flake8: noqa
+import triton
+import triton.language.standard as tl_standard
+from .._runtime import jit
+
+_IMPORT_FROM_TRITON = [
+    "sum",
+    "max",
+    "min",
+    "reduce_or",
+    "xor_sum",
+]
+
+__all__ = _IMPORT_FROM_TRITON
+
+for name in _IMPORT_FROM_TRITON:
+    # Convert JITFunction -> GluonJitFunction
+    fn = getattr(tl_standard, name)
+    assert isinstance(fn, triton.runtime.JITFunction)
+    globals()[name] = jit(fn.fn)
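
The loop re-wraps Triton's standard reductions (sum, max, min, reduce_or, xor_sum) under the Gluon jit decorator so they can be called from Gluon kernels as ttgl.sum, ttgl.max, and so on (the test above uses ttgl.sum and ttgl.max). A self-contained sketch of that unwrap-and-rewrap pattern, using stand-in decorators rather than Triton's real ones:

# Stand-in decorators illustrating the unwrap/re-wrap pattern above (not Triton's API).
class TritonJit:
    def __init__(self, fn):
        self.fn = fn  # like JITFunction, keep the raw Python function on .fn

class GluonJit(TritonJit):
    pass

def triton_jit(fn):
    return TritonJit(fn)

def gluon_jit(fn):
    return GluonJit(fn)

@triton_jit
def my_sum(x, axis):
    ...  # placeholder body

# Mirrors `globals()[name] = jit(fn.fn)` in _standard.py.
gluon_sum = gluon_jit(my_sum.fn)
assert isinstance(gluon_sum, GluonJit) and gluon_sum.fn is my_sum.fn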

python/triton/language/semantic.py

Lines changed: 1 addition & 1 deletion

@@ -1659,7 +1659,7 @@ def reduction(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn) ->

         reduce_op = self.builder.create_reduce([t.handle for t in inputs], axis)
         region_builder_fn(reduce_op)
-        reduce_op.verify()
+        assert reduce_op.verify()

         return tuple(
             self.wrap_tensor(reduce_op.get_result(i), inputs[i].type.scalar, ret_shape) for i in range(len(inputs)))
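
The one-line change checks verify()'s return value instead of discarding it, so an invalid reduce op now fails immediately rather than slipping through. A toy illustration with a stand-in op class (not Triton's ir bindings):

# Stand-in op showing the difference between ignoring and asserting on verify().
class FakeReduceOp:
    def verify(self):
        return False  # pretend the op fails verification

op = FakeReduceOp()
op.verify()                 # old behaviour: the failure is ignored
try:
    assert op.verify()      # new behaviour: the failure surfaces right away
except AssertionError:
    print("invalid reduce op caught")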

0 commit comments
