
Commit 565808e

Merge OpenAI Triton commit 717997b (#4497)
This PR changes the Triton base from 88a2851 to 717997b (Jun 11). Pass rate: 97.11%.
2 parents c7b3773 + 7efdd03 commit 565808e


13 files changed, +120 −21 lines changed


include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 5 additions & 3 deletions
@@ -35,9 +35,11 @@ SmallVector<unsigned, 3> mmaVersionToInstrShape(int version,
 // Return true if the Load uses block pointer.
 bool isLoadFromTensorPtr(triton::LoadOp op);
 
-// Return an array of indices enumerating the elements of 'arr' in descending
-// order (so that result[i] is the index of the i-th largest element of 'arr')
-SmallVector<unsigned, 4> argSort(const SmallVector<int64_t> &arr);
+// Gets the order of a tensor from its contiguity. Places the dimensions with
+// the largest contiguity as the inner most dimension. If the contiguity is
+// all ones, returns the order {dim - 1, dim - 2, ..., 0}
+SmallVector<unsigned, 4>
+getOrderFromContiguity(const SmallVector<int64_t> &contiguity);
 
 // Return the operand used to access the memory in the operation
 Value getMemAccessPtr(Operation *op);

lib/Dialect/TritonGPU/Transforms/Coalesce.cpp

Lines changed: 3 additions & 3 deletions
@@ -38,7 +38,7 @@ struct CoalescePass : public impl::TritonGPUCoalesceBase<CoalescePass> {
     });
 
     auto contiguity = axisInfoAnalysis.getAxisInfo(ptr)->getContiguity();
-    SmallVector<unsigned> order = argSort(contiguity);
+    SmallVector<unsigned> order = getOrderFromContiguity(contiguity);
     LDBG("order=[" << triton::join(order, ", ") << "]");
 
     auto matchesShape = [&refTensorType](const Value &val) {
@@ -55,8 +55,8 @@ struct CoalescePass : public impl::TritonGPUCoalesceBase<CoalescePass> {
       Value val = getMemAccessPtr(use);
       if (!val || !matchesShape(val) || memAccessesSameOrder.contains(use))
         continue;
-      auto currOrder =
-          argSort(axisInfoAnalysis.getAxisInfo(val)->getContiguity());
+      auto currOrder = getOrderFromContiguity(
+          axisInfoAnalysis.getAxisInfo(val)->getContiguity());
       if (order == currOrder) {
         LDBG("multi-root-slice: insert to memAccessesSameOrder " << *use);
         memAccessesSameOrder.insert(use);

lib/Dialect/TritonGPU/Transforms/Utility.cpp

Lines changed: 3 additions & 1 deletion
@@ -92,9 +92,11 @@ bool isLoadFromTensorPtr(triton::LoadOp op) {
   return mlir::triton::isTensorPointerType(op.getPtr().getType());
 }
 
-SmallVector<unsigned, 4> argSort(const SmallVector<int64_t> &arr) {
+SmallVector<unsigned, 4>
+getOrderFromContiguity(const SmallVector<int64_t> &arr) {
   SmallVector<unsigned, 4> ret(arr.size());
   std::iota(ret.begin(), ret.end(), 0);
+  std::reverse(ret.begin(), ret.end());
   std::stable_sort(ret.begin(), ret.end(),
                    [&](unsigned x, unsigned y) { return arr[x] > arr[y]; });
   return ret;
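
The behavioral change here is the tie-breaking: reversing the index vector before the stable descending sort makes dimensions with equal contiguity come out as {dim - 1, ..., 0} rather than {0, ..., dim - 1}. A minimal Python re-statement of the logic (a sketch, not the C++ source; the inputs are illustrative):

    def get_order_from_contiguity(contiguity):
        # Start from the reversed identity order so that ties resolve to
        # {dim - 1, ..., 0} under the stable descending sort.
        order = list(reversed(range(len(contiguity))))
        order.sort(key=lambda i: contiguity[i], reverse=True)  # list.sort is stable
        return order

    print(get_order_from_contiguity([1, 1, 1]))   # [2, 1, 0]; the old argSort returned [0, 1, 2]
    print(get_order_from_contiguity([4, 1, 16]))  # [2, 0, 1]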

python/test/gluon/test_frontend.py

Lines changed: 44 additions & 0 deletions
@@ -881,3 +881,47 @@ def test_tensor_reshape():
     expect_layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1, 2], [2, 4, 4], [4, 1, 1], [2, 1, 0], [1, 1, 1], [1, 1, 1],
                                                        [2, 1, 0])
     ttgl.static_assert(v.type.layout == expect_layout)
+
+
+@filecheck_test
+@gluon.jit
+def test_zeros():
+    # CHECK: [[BLOCKED:#.*]] = #ttg.blocked<{sizePerThread = [2]
+    # CHECK: [[BLOCKED2D:#.*]] = #ttg.blocked<{sizePerThread = [1, 2]
+    layout: ttgl.constexpr = ttgl.BlockedLayout([2], [32], [4], [0])
+    layout_2d: ttgl.constexpr = ttgl.BlockedLayout([1, 2], [4, 8], [4, 1], [1, 0])
+
+    # CHECK: arith.constant dense<0.000000e+00> : tensor<32xf32, [[BLOCKED]]>
+    a = ttgl.zeros([32], ttgl.float32, layout)
+
+    # CHECK: arith.constant dense<7.000000e+00> : tensor<32xf32, [[BLOCKED]]>
+    ttgl.full_like(a, 7)
+
+    # CHECK: arith.constant dense<0.000000e+00> : tensor<32xf32, [[BLOCKED]]>
+    ttgl.zeros_like(a)
+
+    # CHECK: arith.constant dense<0.000000e+00> : tensor<64xf32, [[BLOCKED]]>
+    ttgl.zeros_like(a, shape=[64])
+
+    # CHECK: arith.constant dense<0> : tensor<16x16xi8, [[BLOCKED2D]]>
+    ttgl.zeros_like(a, shape=[16, 16], dtype=ttgl.int8, layout=layout_2d)
+
+    # CHECK: arith.constant dense<7> : tensor<8x8xi16, [[BLOCKED2D]]>
+    ttgl.full_like(a, 7, shape=[8, 8], dtype=ttgl.int16, layout=layout_2d)
+
+
+@filecheck_test
+@gluon.jit
+def test_barrier():
+    # CHECK: gpu.barrier
+    ttgl.thread_barrier()
+
+
+@filecheck_test
+@gluon.jit
+def test_fence_async_shared():
+    # CHECK: ttng.fence_async_shared {bCluster = false}
+    blackwell.fence_async_shared()
+
+    # CHECK-NEXT: ttng.fence_async_shared {bCluster = true}
+    blackwell.fence_async_shared(cluster=True)

python/triton/experimental/gluon/language/_core.py

Lines changed: 6 additions & 0 deletions
@@ -91,6 +91,7 @@
     "tensor",
     "tuple",
     "tuple_type",
+    "thread_barrier",
     "arange",
     "full",
     "convert_layout",
@@ -313,3 +314,8 @@ def warp_specialize(args, default_partition, worker_partitions, worker_num_warps
     worker_num_regs = [_unwrap_if_constexpr(r) for r in worker_num_regs]
     return _semantic.warp_specialize(args, default_partition, worker_partitions, worker_num_warps, #
                                      worker_num_regs, _generator)
+
+
+@builtin
+def thread_barrier(_semantic=None):
+    return _semantic.debug_barrier()

python/triton/experimental/gluon/language/_standard.py

Lines changed: 27 additions & 1 deletion
@@ -3,6 +3,7 @@
 import triton.language.standard as tl_standard
 from .._runtime import jit
 from triton import knobs
+from . import _core as ttgl
 
 _IMPORT_FROM_TRITON = [
     "sum",
@@ -12,10 +13,35 @@
     "xor_sum",
 ]
 
-__all__ = _IMPORT_FROM_TRITON
+__all__ = [
+    "full_like",
+    "zeros",
+    "zeros_like",
+    *_IMPORT_FROM_TRITON,
+]
 
 for name in _IMPORT_FROM_TRITON:
     # Convert JITFunction -> GluonJitFunction
     fn = getattr(tl_standard, name)
     assert knobs.runtime.interpret or isinstance(fn, triton.runtime.JITFunction)
     globals()[name] = jit(fn.fn)
+
+
+@jit
+def zeros(shape, dtype, layout):
+    return ttgl.full(shape, 0, dtype, layout)
+
+
+@jit
+def full_like(input, value, shape=None, dtype=None, layout=None):
+    return ttgl.full(
+        input.shape if shape is None else shape,
+        value,
+        input.dtype if dtype is None else dtype,
+        input.type.layout if layout is None else layout,
+    )
+
+
+@jit
+def zeros_like(input, shape=None, dtype=None, layout=None):
+    return full_like(input, 0, shape=shape, dtype=dtype, layout=layout)
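
A hedged usage sketch of the new helpers inside a Gluon-jitted function; the kernel name, shape, and layout below are illustrative and not taken from the patch:

    @gluon.jit
    def example_kernel():
        layout: ttgl.constexpr = ttgl.BlockedLayout([1], [32], [4], [0])
        a = ttgl.zeros([128], ttgl.float32, layout)           # zero-filled tensor with an explicit layout
        b = ttgl.full_like(a, 3)                              # keep shape/dtype/layout, fill with 3
        c = ttgl.zeros_like(a, shape=[64], dtype=ttgl.int8)   # override shape and dtype, keep the layout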

python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py

Lines changed: 4 additions & 3 deletions
@@ -6,18 +6,19 @@
 from triton.experimental.gluon.language._core import builtin, base_type, base_value, _unwrap_if_constexpr
 
 from . import tma
-from ..hopper import mbarrier
+from ..hopper import mbarrier, fence_async_shared
 
 if TYPE_CHECKING:
     from triton._C.libtriton.gluon_ir import GluonOpBuilder
     from triton._C.libtriton import gluon_ir as ir
     from ..._semantic import GluonSemantic
 
 __all__ = [
-    "TensorMemoryLayout",
-    "tensor_memory_descriptor",
     "allocate_tensor_memory",
+    "fence_async_shared",
     "mbarrier",
+    "tensor_memory_descriptor",
+    "TensorMemoryLayout",
     "tma",
 ]
 

Lines changed: 8 additions & 1 deletion
@@ -1,4 +1,11 @@
 from . import mbarrier
 from . import tma
+from ... import _core
 
-__all__ = ["mbarrier", "tma"]
+__all__ = ["fence_async_shared", "mbarrier", "tma"]
+
+
+@_core.builtin
+def fence_async_shared(cluster=False, _semantic=None):
+    cluster = _core._unwrap_if_constexpr(cluster)
+    _semantic.builder.create_fence_async_shared(cluster)

python/triton_kernels/triton_kernels/matmul_ogs_details/_finalize_matmul.py

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ def _finalize_matmul(
         if src_idx != -1:
             As = A + src_idx.to(tl.int64) * stride_a_m + offs_n
             for ki in tl.static_range(K):
-                acc += tl.load(As, mask=n_mask, other=0.0)
+                acc += tl.load(As, mask=(src_idxs != -1)[:, None] & n_mask[None, :], other=0.0)
                 As += stride_a_k
     else:
         As = A + src_idxs.to(tl.int64)[:, None] * stride_a_m + offs_n[None, :]
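
The fix masks out rows whose source index is -1 so their load contributes 0.0 instead of reading through an invalid pointer. A rough NumPy analogue of the masking (shapes and values are made up for illustration; this is not the Triton kernel):

    import numpy as np

    src_idxs = np.array([3, -1, 0])          # hypothetical source-row indices; -1 means "no source"
    n_mask = np.array([True, True, False])   # hypothetical per-column bounds mask
    A = np.arange(12, dtype=np.float32).reshape(4, 3)

    mask = (src_idxs != -1)[:, None] & n_mask[None, :]
    rows = A[np.where(src_idxs != -1, src_idxs, 0)]  # clamp -1 to a valid row before gathering
    acc = np.where(mask, rows, 0.0)                  # masked lanes behave like other=0.0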

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 1 addition & 1 deletion
@@ -387,7 +387,7 @@ def _compute_writeback_idx(
     is_src_active = (src_idxs != -1).to(tl.int32)
     num_src_active = tl.sum(is_src_active, axis=1)
 
-    need_finalize_scatter = mask_m & (num_src_active > 1)
+    need_finalize_scatter = mask_m & (num_src_active != 1)
     finalize_scatter_count = tl.sum(need_finalize_scatter.to(tl.int32))
     if finalize_scatter_count == 0:
         return
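
Switching the predicate from num_src_active > 1 to num_src_active != 1 means rows with zero active sources now take the finalize-scatter path as well, not only rows with multiple sources. A plain-Python illustration of the predicate (values made up):

    num_src_active = [0, 1, 2, 3]
    need_finalize_scatter = [n != 1 for n in num_src_active]  # [True, False, True, True]
    # the previous (n > 1) predicate would have skipped rows with 0 active sources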
