
Commit 2892882

anmyachev, yongjik, pawelszczerbuk, njriasan, and yiqian1 authored
Merge OpenAI Triton commit 272188c (#4612)
This PR changes the Triton base from e21efcb to 272188c (Jun 26). Pass rate: 97.14%

Please do not squash and merge this PR.

---------

Co-authored-by: Yongjik Kim <[email protected]>
Co-authored-by: pawelszczerbuk <[email protected]>
Co-authored-by: Nick Riasanovsky <[email protected]>
Co-authored-by: Yi Qian <[email protected]>
Co-authored-by: Jeff Niu <[email protected]>
2 parents c017cf7 + 46b6ede commit 2892882

File tree

18 files changed: +262, -30 lines changed

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ namespace mlir::triton {
 inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     // clang-format off
     "AMDGCN_ENABLE_DUMP",
+    "AMDGCN_USE_BUFFER_ATOMICS",
     "AMDGCN_USE_BUFFER_OPS",
     "DISABLE_LLVM_OPT",
     "DISABLE_MMA_V3",

python/src/gluon_ir.cc

Lines changed: 4 additions & 1 deletion
@@ -391,7 +391,10 @@ void init_gluon_ir(py::module &&m) {
             return self.create<ttng::WarpGroupDotOp>(
                 a, b, acc, useAcc, precision, maxNumImpreciseAcc, isAsync);
           })
-
+      .def("create_warpgroup_mma_wait",
+           [](GluonOpBuilder &self, std::vector<Value> &deps, int pendings) {
+             self.create<ttng::WarpGroupDotWaitOp>(deps, pendings);
+           })
       .def("create_tmem_alloc",
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttng::TMEMAllocOp>(resultTy, value);

python/test/gluon/test_core.py

Lines changed: 8 additions & 4 deletions
@@ -100,7 +100,7 @@ def test_async_copy_mbarrier():


 @gluon.jit
-def warpgroup_mma_kernel(a, b, out, M: ttgl.constexpr, N: ttgl.constexpr, K: ttgl.constexpr):
+def warpgroup_mma_kernel(a, b, out, M: ttgl.constexpr, N: ttgl.constexpr, K: ttgl.constexpr, ASYNC: ttgl.constexpr):
     block_layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0])
     mma_layout: ttgl.constexpr = ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[4, 1],
                                                              instr_shape=[16, 32, 16])
@@ -121,19 +121,23 @@ def warpgroup_mma_kernel(a, b, out, M: ttgl.constexpr, N: ttgl.constexpr, K: ttg
     a_shmem = ttgl.allocate_shared_memory(ttgl.float16, [M, K], nvmma_layout, A)
     b_shmem = ttgl.allocate_shared_memory(ttgl.float16, [K, N], nvmma_layout, B)

-    acc = hopper.warpgroup_mma(a_shmem, b_shmem, acc)
+    acc = hopper.warpgroup_mma(a_shmem, b_shmem, acc, is_async=ASYNC)
+
+    if ASYNC:
+        hopper.warpgroup_mma_wait(num_outstanding=1, deps=[acc])

     ttgl.store(out + out_offs_m * N + out_offs_n, acc)


 @pytest.mark.skipif(not is_hopper(), reason="Requires Hopper")
-def test_warpgroup_mma():
+@pytest.mark.parametrize("ASYNC", [True, False])
+def test_warpgroup_mma(ASYNC):
     torch.manual_seed(0)
     M, N, K = 64, 32, 32
     a = torch.randn((M, K), device="cuda", dtype=torch.float16)
     b = torch.randn((K, N), device="cuda", dtype=torch.float16)
     out = torch.zeros((M, N), device="cuda", dtype=torch.float16)
-    warpgroup_mma_kernel[(1, )](a, b, out, M, N, K)
+    warpgroup_mma_kernel[(1, )](a, b, out, M, N, K, ASYNC)

     ref = torch.matmul(a, b)

python/test/gluon/test_frontend.py

Lines changed: 27 additions & 0 deletions
@@ -482,6 +482,33 @@ def test_warpgroup_mma(fresh_knobs):
 """)


+@gluon.jit
+def warpgroup_mma_wait_kernel():
+    layout: ttgl.constexpr = ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[4, 1], instr_shape=[16, 32, 16])
+    acc = ttgl.full([128, 128], 0, dtype=ttgl.float16, layout=layout)
+    hopper.warpgroup_mma_wait(num_outstanding=1, deps=[acc])
+
+
+@pytest.mark.skipif(not is_hopper(), reason="Requires Hopper WGMMA")
+def test_warpgroup_mma_wait(fresh_knobs):
+    knobs.compilation.disable_line_info = True
+
+    h = warpgroup_mma_wait_kernel.warmup(grid=(1, ))
+    expecttest.assert_expected_inline(
+        anonymize_ir(h.asm["source"]), """\
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 32, 16]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @warpgroup_mma_wait_kernel() attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f16 loc(#loc)
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #mma> loc(#loc)
+    %0 = ttng.warp_group_dot_wait %cst_0 {pendings = 1 : i32} : tensor<128x128xf16, #mma> loc(#loc)
+    tt.return loc(#loc)
+  } loc(#loc)
+} loc(#loc)
+#loc = loc(unknown)
+""")
+
+
 @gluon.jit
 def async_tma_kernel(input_desc, XBLOCK: ttgl.constexpr):
     smem = ttgl.allocate_shared_memory(ttgl.float16, [XBLOCK, XBLOCK], input_desc.layout)

python/test/unit/language/test_frontend.py

Lines changed: 21 additions & 0 deletions
@@ -374,6 +374,7 @@ def test_constexpr_generator():
     generator(lhs)


+@tl.constexpr_function
 def Box(T):

     @tl.core._aggregate
@@ -401,3 +402,23 @@ def kernel():
         anchor(value)

     run_filecheck_test(kernel)
+
+
+@filecheck_test
+@triton.jit
+def test_modify_if_livein():
+    # CHECK-LABEL: test_modify_if_livein
+    none_livein = None  # noqa: F841
+
+    # CHECK: [[LOOP_OUT:%.*]] = scf.for {{.*}} iter_args([[BOX:%.*]] = %true)
+    # CHECK: [[LIVEOUT:%.*]] = scf.if [[BOX]]
+    # CHECK: yield %false
+    # CHECK: else
+    # CHECK: yield [[BOX]]
+    # CHECK: yield [[LIVEOUT]]
+    # CHECK: call @{{.*}}anchor{{.*}}([[LOOP_OUT]])
+    box = Box(tl.tensor)(tl.core.to_tensor(True))
+    for i in range(10):
+        if box.value:
+            box.value = False
+    anchor(box.value)

python/triton/compiler/code_generator.py

Lines changed: 19 additions & 14 deletions
@@ -719,35 +719,40 @@ def visit_then_else_blocks(self, node, liveins, then_block, else_block):
         self.visit_compound_statement(node.body)
         then_block = self.builder.get_insertion_block()
         then_defs = self.local_defs.copy()
+        then_vals = self.lscope.copy()
         # else block
         else_defs = {}
+        else_vals = liveins.copy()
         if node.orelse:
             self.builder.set_insertion_point_to_start(else_block)
             self.lscope = liveins.copy()
             self.local_defs = {}
             self.visit_compound_statement(node.orelse)
             else_defs = self.local_defs.copy()
             else_block = self.builder.get_insertion_block()
+            else_vals = self.lscope.copy()

         # update block arguments
         names = []
         # variables in livein whose value is updated in `if`
-        for name in liveins:
+        for name, value in liveins.items():
+            # livein variable changed value in either then or else
+            if not _is_triton_value(value):
+                continue
+            then_handles = flatten_values_to_ir([then_vals[name]])
+            else_handles = flatten_values_to_ir([else_vals[name]])
+            if then_handles == else_handles:
+                continue
+            names.append(name)
+            then_defs[name] = then_vals[name]
+            else_defs[name] = else_vals[name]
             # check type
             for defs, block_name in [(then_defs, 'then'), (else_defs, 'else')]:
-                if name in defs:
-                    type_equal = type(defs[name]) == type(liveins[name])  # noqa: E721
-                    assert type_equal and defs[name].type == liveins[name].type, \
-                        f'initial value for `{name}` is of type {liveins[name]}, '\
-                        f'but the {block_name} block redefines it as {defs[name]}'
-            if name in then_defs or name in else_defs:
-                names.append(name)
-            # variable defined in then but not in else
-            if name in then_defs and name not in else_defs:
-                else_defs[name] = liveins[name]
-            # variable defined in else but not in then
-            if name in else_defs and name not in then_defs:
-                then_defs[name] = liveins[name]
+                type_equal = type(defs[name]) == type(value)  # noqa: E721
+                assert type_equal and defs[name].type == value.type, \
+                    f'initial value for `{name}` is of type {value}, '\
+                    f'but the {block_name} block redefines it as {defs[name]}'
+
         # variables that are both in then and else but not in liveins
         # TODO: could probably be cleaned up
         for name in sorted(then_defs.keys() & else_defs.keys()):
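
The key behavioral change: a live-in is now promoted to an `scf.if` result whenever its underlying IR handles differ between the then and else branches, which also catches values mutated through an aggregate field (as exercised by `test_modify_if_livein` above), rather than only names that were explicitly reassigned. Below is a hypothetical, stripped-down model of that selection rule, not the real CodeGenerator API; `flatten` stands in for `flatten_values_to_ir` and the type check is omitted.

# Hypothetical, simplified model of the new promotion rule in
# visit_then_else_blocks (not the real CodeGenerator API).
def liveins_to_promote(liveins, then_vals, else_vals, flatten):
    names = []
    for name, value in liveins.items():
        # Promote iff the branch bodies left different IR handles behind,
        # regardless of whether `name` itself was reassigned.
        if flatten(then_vals[name]) != flatten(else_vals[name]):
            names.append(name)
    return names

# Toy usage: `box` was mutated in the then-branch, `x` was left untouched.
print(liveins_to_promote(
    liveins={"box": "h0", "x": "h1"},
    then_vals={"box": "h2", "x": "h1"},
    else_vals={"box": "h0", "x": "h1"},
    flatten=lambda v: [v],
))  # -> ['box']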

python/triton/experimental/gluon/language/nvidia/hopper/__init__.py

Lines changed: 8 additions & 1 deletion
@@ -2,7 +2,7 @@
 from . import mbarrier, tma
 from ... import _core

-__all__ = ["async_copy", "fence_async_shared", "mbarrier", "tma", "warpgroup_mma"]
+__all__ = ["async_copy", "fence_async_shared", "mbarrier", "tma", "warpgroup_mma", "warpgroup_mma_wait"]


 @_core.builtin
@@ -25,3 +25,10 @@ def warpgroup_mma(a, b, acc, *, use_acc=True, precision=None, max_num_imprecise_
     handle = _semantic.builder.create_warpgroup_mma(a.handle, b.handle, acc.handle, use_acc.handle, precision,
                                                     max_num_imprecise_acc, is_async)
     return _core.tensor(handle, acc.type)
+
+
+@_core.builtin
+def warpgroup_mma_wait(num_outstanding=0, deps=None, _semantic=None):
+    deps = [x.handle for x in deps] if deps is not None else []
+    num_outstanding = _core._unwrap_if_constexpr(num_outstanding)
+    _semantic.builder.create_warpgroup_mma_wait(deps, num_outstanding)
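
Together with the `is_async=` flag on `warpgroup_mma` (see the test_core.py change above), this gives Gluon kernels an explicit issue/wait split: launch the WGMMA asynchronously, then call `warpgroup_mma_wait` with the number of MMAs allowed to remain in flight, which lowers to the `pendings` attribute of `ttng.warp_group_dot_wait`. A minimal compile-only sketch follows; the import paths are assumed from this PR's file layout, and it mirrors the frontend test above rather than running a real matmul.

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl
from triton.experimental.gluon.language.nvidia import hopper


@gluon.jit
def wait_two_outstanding_kernel():
    layout: ttgl.constexpr = ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[4, 1],
                                                         instr_shape=[16, 32, 16])
    acc = ttgl.full([128, 128], 0, dtype=ttgl.float16, layout=layout)
    # Block until at most 2 asynchronous WGMMAs producing `acc` are still in flight.
    hopper.warpgroup_mma_wait(num_outstanding=2, deps=[acc])


# Compile-only check on a Hopper target; the wait shows up as
# `ttng.warp_group_dot_wait ... {pendings = 2 : i32}` in the generated IR.
h = wait_two_outstanding_kernel.warmup(grid=(1, ))
print(h.asm["source"])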

python/triton/knobs.py

Lines changed: 2 additions & 0 deletions
@@ -510,6 +510,8 @@ class intel_knobs(base_knobs):

 class amd_knobs(base_knobs):
     use_buffer_ops: env_bool = env_bool("AMDGCN_USE_BUFFER_OPS", True)
+    # Note: This requires use_buffer_ops be true to have any effect
+    use_buffer_atomics: env_bool = env_bool("AMDGCN_USE_BUFFER_ATOMICS", True)
     dump_amdgcn: env_bool = env_bool("AMDGCN_ENABLE_DUMP")
     libhip_path: env_opt_str = env_opt_str("TRITON_LIBHIP_PATH")
     lld_path: env_opt_str = env_opt_str("TRITON_HIP_LLD_PATH")
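
For reference, a small sketch of toggling the new knob; the `triton.knobs.amd` accessor is an assumption based on the `amd_knobs` class above, and per the in-source note the setting only matters while `AMDGCN_USE_BUFFER_OPS` is enabled.

import os

# Disable buffer atomics before Triton reads the knob (environment route).
os.environ["AMDGCN_USE_BUFFER_ATOMICS"] = "0"

import triton

# The knob can also be inspected (and, like other knobs, overridden) in code;
# `triton.knobs.amd` is assumed from the class name in the diff above.
print(triton.knobs.amd.use_buffer_atomics)  # expected: False
triton.knobs.amd.use_buffer_atomics = True  # opt back in programmatically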

python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py

Lines changed: 28 additions & 4 deletions
@@ -78,17 +78,37 @@ def convert_dtype(dtype):


 def matmul_launch_metadata(grid, kernel, args):
+    from ..proton_opts import launch_metadata_allow_sync
+
     ret = dict()
     M, N, K = args["M"], args["N"], args["K"]
     Y, X, W = [t.base if isinstance(t, TensorDescriptor) else t for t in [args["Y"], args["X"], args["W"]]]
+    tokens_per_expt = args.get("TOKENS_PER_EXPT_FOR_ANNOTATION")
     hist = args["ExptHist"]
     if hist is not None:
-        n_tokens = float(hist.sum())
-        n_w_bytes = (W.numel() * W.element_size() // hist.numel()) * (hist > 0).sum()
+        # If annotation is given, use that to generate name for profiling.
+        if tokens_per_expt is not None:
+            n_rows = f"{tokens_per_expt}*"
+        elif launch_metadata_allow_sync():
+            n_rows = int(hist.float().mean())
+        else:
+            n_rows = "unknown"
+
+        if launch_metadata_allow_sync():
+            n_tokens = float(hist.sum())
+            n_w_bytes = (W.numel() * W.element_size() // hist.numel()) * (hist > 0).sum()
+        elif tokens_per_expt is not None:
+            n_tokens = tokens_per_expt * args["N_EXPTS_TOT"]
+            # This may not be totally correct (e.g., we might not be using all experts)
+            # but it's better than nothing.
+            n_w_bytes = W.numel() * W.element_size()
+        else:
+            n_tokens = None
+            n_w_bytes = 0

         # If annotation is given, use that to generate name for profiling.
         tokens_per_expt = args.get("TOKENS_PER_EXPT_FOR_ANNOTATION")
-        n_rows = f"{tokens_per_expt}*" if tokens_per_expt is not None else int(hist.float().mean())
+        n_rows = f"{tokens_per_expt}*" if tokens_per_expt is not None else n_rows
     else:
         n_tokens = None
         n_w_bytes = W.numel() * W.element_size()
@@ -101,6 +121,10 @@ def matmul_launch_metadata(grid, kernel, args):
     ep_subtile = args["EPILOGUE_SUBTILE"]
     if ep_subtile is not None and ep_subtile > 1:
         ret["name"] += f" ep/{ep_subtile}"
+
+    if hist is not None and n_tokens is None:
+        return ret  # Don't fill metadata because we can't compute them properly.
+
     fM = M if M is not None else n_tokens
     fK = K if K is not None else n_tokens
     ret[f"flops{nbits}"] = 2.0 * fM * N * fK
@@ -115,7 +139,7 @@ def matmul_launch_metadata(grid, kernel, args):
     assert n_tokens is not None
     n_expts_act = args["N_EXPTS_ACT"]

-    if gindx is not None:
+    if (gindx is not None) and launch_metadata_allow_sync():
         # recreate inverse GatherIndx.
         dst = torch.full_like(gindx, -1)
         idx = torch.arange(len(gindx), device=gindx.device, dtype=torch.int32)
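
The gist of this change: when Proton disallows synchronizing on the device-side expert histogram, the token count is estimated from the `TOKENS_PER_EXPT_FOR_ANNOTATION` hint instead of `hist.sum()`, and when neither source is available the function returns early with only the kernel name filled in. A hypothetical, standalone model of that fallback arithmetic (not the real `matmul_launch_metadata` signature):

# Simplified, self-contained model of the new n_tokens selection.
def estimate_tokens(allow_sync, hist_sum, tokens_per_expt, n_expts_tot):
    if allow_sync:
        return float(hist_sum)                 # exact, but synchronizes with the GPU
    if tokens_per_expt is not None:
        return tokens_per_expt * n_expts_tot   # sync-free estimate from the annotation
    return None                                # caller skips flops/bytes metadata


print(estimate_tokens(False, hist_sum=4096, tokens_per_expt=128, n_expts_tot=8))  # -> 1024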

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,8 @@ def _zero_masked_rows(


 _matmul_ogs_repr = make_matmul_repr("_matmul_ogs", [0, 1, 2])
-@triton.jit(repr=_matmul_ogs_repr, launch_metadata=matmul_launch_metadata)
+@triton.jit(do_not_specialize=["TOKENS_PER_EXPT_FOR_ANNOTATION"],
+            repr=_matmul_ogs_repr, launch_metadata=matmul_launch_metadata)
 def _matmul_ogs(
              Y, Out, stride_y_k, stride_y_z, stride_y_m, stride_y_n,
              YExpectedScale, YActualScale, YChecksumScale,
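
Marking the annotation argument `do_not_specialize` keeps its runtime value out of Triton's value-specialization (e.g. equal-to-1 or divisibility hints), so changing the hint does not force a recompile of `_matmul_ogs`. A hypothetical minimal example of the same pattern; the kernel and argument names here are made up.

import torch
import triton
import triton.language as tl


@triton.jit(do_not_specialize=["annotation_hint"])
def _annotated_copy(x_ptr, y_ptr, n, annotation_hint, BLOCK: tl.constexpr):
    # `annotation_hint` is only meant for launch-metadata/repr hooks, so its
    # value should not influence code generation.
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(y_ptr + offs, tl.load(x_ptr + offs, mask=mask), mask=mask)


x = torch.randn(1024, device="cuda")
y = torch.empty_like(x)
# Different hint values (e.g. 1 vs. 128) reuse the same compiled binary
# because the argument is excluded from value specialization.
_annotated_copy[(4, )](x, y, x.numel(), 1, BLOCK=256)
_annotated_copy[(4, )](x, y, x.numel(), 128, BLOCK=256)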
