
Commit e14f5b9

Merge OpenAI Triton commit 9410804 (#5132)
This PR changes the Triton base from 625c8cb to 9410804 (Sep 10). Pass rate: 98.8%. Please do not squash and merge this PR.
2 parents f631f67 + 10dfb5f commit e14f5b9

13 files changed: +178 -42 lines changed

13 files changed

+178
-42
lines changed

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 12 additions & 4 deletions
@@ -1158,6 +1158,14 @@ SharedMemoryObject::getMaskSpanOffsets(triton::gpu::MemDescType srcTy) {
   if (allocShape == shape) {
     return 0;
   }
+  if (auto paddedEncoding = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
+          srcTy.getEncoding())) {
+    // Mask is used in fusion of constant part of memory operation address as
+    // immediate operand. Padded layout has additional address computations
+    // between main offset computation and actual memory access, which breaks
+    // constant fusing. Full mask disables this optimization.
+    return ~uint64_t(0);
+  }
   auto totalLl = triton::gpu::toLinearLayout(allocShape, srcTy.getEncoding());
   auto dimNames = standardOutDimNames(ctx, shape.size());
   // Remove the kBlock dimension
@@ -1194,14 +1202,15 @@ Value SharedMemoryObject::getShmemOffset(Location loc, RewriterBase &rewriter,
     return b.i32_val(0);
   }

+  LinearLayout ll;
   // We return the offset without the padding. The padding will be added in the
   // lowering
   if (auto paddedSharedEncoding =
           dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
               srcTy.getEncoding())) {
-    auto allocShape64 = srcTy.getAllocShape();
-    SmallVector<unsigned> allocShape(allocShape64.begin(), allocShape64.end());
-    return LLVM::linearize(rewriter, loc, offsets, allocShape);
+    ll = paddedSharedEncoding.getLinearComponent();
+  } else {
+    ll = triton::gpu::toLinearLayout(srcTy);
   }

   auto dimNames = standardOutDimNames(ctx, offsets.size());

@@ -1210,7 +1219,6 @@ Value SharedMemoryObject::getShmemOffset(Location loc, RewriterBase &rewriter,
     logicalOffsets.push_back({dim, offset});
   }

-  LinearLayout ll = triton::gpu::toLinearLayout(srcTy);
   ll = ll.sublayout({str_attr("offset")}, dimNames);
   auto offset =
       applyLinearLayout(loc, rewriter, ll.invert(), logicalOffsets)[0].second;
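For readers outside the lowering code, the intent of the new early return can be sketched in plain Python. This is illustrative only; the names below are assumptions and do not exist in Triton.

# Illustrative sketch (not Triton API): the mask returned by getMaskSpanOffsets
# marks which offset bits may vary, and the lowering folds the provably-constant
# part of an address into an immediate operand. For padded shared layouts the
# extra padding arithmetic sits between the offset computation and the memory
# access, so the change conservatively reports "every bit may vary" to disable
# that fold.
FULL_MASK = (1 << 64) - 1  # Python analogue of ~uint64_t(0)

def mask_span_offsets(is_padded_layout: bool, computed_mask: int) -> int:
    return FULL_MASK if is_padded_layout else computed_mask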

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 6 additions & 0 deletions
@@ -1815,6 +1815,12 @@ Attribute PaddedSharedEncodingAttr::parse(AsmParser &parser, Type type) {
     }
   }

+  if (order.size() != shape.size()) {
+    parser.emitError(parser.getCurrentLocation(),
+                     "Mismatch of shape and order ranks in padded layout");
+    return {};
+  }
+
   // Create identity mapping based on shape and order
   auto kOffset = StringAttr::get(parser.getContext(), "offset");
   maybeLL = identityStandardND(kOffset, shape, order);
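At the Gluon level, the rank rule this parser check enforces looks roughly like the following. This is a minimal sketch assuming the ttgl.PaddedSharedLayout API used in the new test further down, not code from this PR.

from triton.experimental.gluon import language as ttgl

# `order` must name every dimension of `shape` exactly once, so its rank must
# match the shape's rank. This 2-D layout round-trips through the attribute
# parser; an order such as [0] against a 2-D shape would now be rejected with
# "Mismatch of shape and order ranks in padded layout".
layout = ttgl.PaddedSharedLayout.with_identity_for([[32, 4]], [64, 64], [1, 0])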

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 10 additions & 1 deletion
@@ -817,7 +817,16 @@ LogicalResult MemDescSubsliceOp::verify() {
   }

   auto ctx = getContext();
-  auto ll = triton::gpu::toLinearLayout(srcTy);
+  LinearLayout ll;
+  if (auto paddedEncoding = dyn_cast<PaddedSharedEncodingAttr>(srcEnc)) {
+    if (paddedEncoding.getRank() < srcTy.getRank()) {
+      return emitError("SubSlice of low rank PaddedSharedEncoding from higher "
+                       "rank tensors is not supported yet");
+    }
+    ll = paddedEncoding.getLinearComponent();
+  } else {
+    ll = triton::gpu::toLinearLayout(srcTy);
+  }
   // NYI: We don't support non-trivial block dimension for now.
   auto kBlock = mlir::StringAttr::get(getContext(), "block");
   if (ll.getInDimSize(kBlock) != 1) {
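The new test in python/test/gluon/test_core.py exercises exactly this path; a trimmed sketch of that usage is shown below. Import paths and the host-side wrapper are assumptions based on the test, and the kernel body is illustrative only.

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl

def build_subslice_kernel():
    # Layouts are constexpr values; the test builds them on the host and lets
    # the nested kernel capture them, which this sketch mirrors.
    smem_layout = ttgl.constexpr(ttgl.PaddedSharedLayout.with_identity_for([[32, 4]], [64, 64], [1, 0]))

    @gluon.jit
    def kernel(SLICE_M_OFFSET: ttgl.constexpr, SLICE_M: ttgl.constexpr,
               SLICE_N_OFFSET: ttgl.constexpr, SLICE_N: ttgl.constexpr):
        smem = ttgl.allocate_shared_memory(ttgl.int32, [64, 64], smem_layout)
        # Each subslice keeps the full-rank padded encoding, which the relaxed
        # verifier accepts; subslicing a lower-rank padded encoding out of a
        # higher-rank tensor is still rejected with the new error message.
        smem_slice0 = smem.slice(SLICE_M_OFFSET, SLICE_M, dim=0)
        smem_slice1 = smem_slice0.slice(SLICE_N_OFFSET, SLICE_N, dim=1)

    return kernel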

lib/Dialect/TritonGPU/IR/Types.cpp

Lines changed: 4 additions & 14 deletions
@@ -177,29 +177,19 @@ LogicalResult MemDescType::verify(function_ref<InFlightDiagnostic()> emitError,
                          << "the shape size when pipelining.";
     }

-    // Subslices are not yet implemented
-    auto subsliceAllocSize =
-        allocShape.drop_front(allocShape.size() - shape.size());
-    for (auto [allocDim, shapeDim] : llvm::zip(shape, subsliceAllocSize)) {
-      if (allocDim != shapeDim) {
-        return emitError() << "Subslices with padded encodings are not yet "
-                           << "implemented.";
-      }
-    }
-
     // Ensure linear component's outDims match the alloc size ignoring
     // pipelining dimension
     auto outDims = standardOutDimNames(ctx, rank);
     const auto &ll = enc.getLinearComponent();
-    auto expectedShape = shape;
-    if (shape.size() == allocShape.size() && shape.size() == rank + 1)
+    auto expectedShape = allocShape;
+    if (rank == allocShape.size() - 1)
      expectedShape = expectedShape.drop_front(1);

     for (auto d = 0; d < rank; d++) {
       if (ll.getOutDimSize(outDims[d]) != expectedShape[d]) {
         return emitError() << "Mismatch in expected shape for dimension " << d
-                           << ". Expected: " << ll.getOutDimSize(outDims[d])
-                           << ", got: " << expectedShape[d];
+                           << ". Expected: " << expectedShape[d]
+                           << ", got: " << ll.getOutDimSize(outDims[d]);
       }
     }
   }

python/test/gluon/test_core.py

Lines changed: 55 additions & 0 deletions
@@ -897,3 +897,58 @@ def kernel(x, y):

     compiled_kernel = kernel.warmup(input, output, grid=(1, ))
     assert compiled_kernel.asm["ttgir"].count("tt.func private") == 0
+
+
+@pytest.mark.parametrize("interval_pairs", [[[32, 4]], [[16, 4]], [[16, 4], [64, 8]]])
+@pytest.mark.parametrize(
+    "shared_layout",
+    [{"order": [0, 1]}, {"order": [1, 0]},
+     {"offsets": [[0, 1], [0, 2], [0, 8], [0, 4], [0, 16], [0, 32], [2, 0], [1, 0], [4, 0], [8, 0], [16, 0], [32, 0]]}])
+@pytest.mark.parametrize("slice_m_offset, slice_n_offset, slice_m, slice_n", [(48, 16, 16, 16), (32, 48, 32, 16),
+                                                                              (48, 32, 16, 32)])
+def test_padded_shared_layout_subslice(interval_pairs, shared_layout, slice_m_offset, slice_n_offset, slice_m, slice_n):
+    m = 64
+    n = 64
+    num_warps = 1
+    num_warps_cst = ttgl.constexpr(num_warps)
+    warp_size_cst = ttgl.constexpr(THREADS_PER_WARP)
+
+    shape = [m, n]
+    if "order" in shared_layout:
+        order = shared_layout["order"]
+        smem_layout = ttgl.constexpr(ttgl.PaddedSharedLayout.with_identity_for(interval_pairs, shape, order))
+    elif "offsets" in shared_layout:
+        offsets = shared_layout["offsets"]
+        blocks = []
+        smem_layout = ttgl.constexpr(ttgl.PaddedSharedLayout(interval_pairs, offsets, blocks, shape))
+
+    @gluon.jit
+    def kernel(in_ptr, out_ptr, M: ttgl.constexpr, N: ttgl.constexpr, SLICE_M_OFFSET: ttgl.constexpr,
+               SLICE_N_OFFSET: ttgl.constexpr, SLICE_M: ttgl.constexpr, SLICE_N: ttgl.constexpr):
+        blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [warp_size_cst, 1], [1, num_warps_cst], [1, 0])
+        offs_m_load = ttgl.arange(0, M, ttgl.SliceLayout(1, blocked))
+        offs_n_load = ttgl.arange(0, N, ttgl.SliceLayout(0, blocked))
+        in_offs = offs_m_load[:, None] * N + offs_n_load[None, :]
+
+        in_data = ttgl.load(in_ptr + in_offs)
+
+        smem = ttgl.allocate_shared_memory(ttgl.int32, [M, N], smem_layout)
+        smem_slice0 = smem.slice(SLICE_M_OFFSET, SLICE_M, dim=0)
+        smem_slice1 = smem_slice0.slice(SLICE_N_OFFSET, SLICE_N, dim=1)
+
+        smem.store(in_data)
+
+        out_data = smem_slice1.load(blocked)
+
+        offs_m_store = ttgl.arange(0, SLICE_M, ttgl.SliceLayout(1, blocked))
+        offs_n_store = ttgl.arange(0, SLICE_N, ttgl.SliceLayout(0, blocked))
+        out_offs = offs_m_store[:, None] * SLICE_N + offs_n_store[None, :]
+        ttgl.store(out_ptr + out_offs, out_data)
+
+    input = torch.arange(m * n, device="cuda").reshape(m, n).to(torch.int32)
+    output = torch.zeros((slice_m, slice_n), dtype=torch.int32, device="cuda")
+    ref_output = input[slice_m_offset:slice_m_offset + slice_m, slice_n_offset:slice_n_offset + slice_n]
+
+    kernel[(1, )](input, output, m, n, slice_m_offset, slice_n_offset, slice_m, slice_n, num_warps=num_warps)
+
+    assert (output == ref_output).all()

python/test/gluon/test_frontend.py

Lines changed: 26 additions & 0 deletions
@@ -2375,3 +2375,29 @@ def test_layout_zeros():
     # CHECK: #blocked = #ttg.blocked
     # CHECK: arith.constant dense<0.000000e+00> : tensor<128xf32, #blocked>
     ttgl.zeros([128], ttgl.float32, layout=ttgl.BlockedLayout([1], [32], [4], [0]))
+
+
+@gluon.jit
+def print_num_warps():
+    num_warps: ttgl.constexpr = ttgl.num_warps()
+    print("num_warps", num_warps)
+
+
+@filecheck_test
+@gluon.jit
+def test_get_num_warps():
+    # CHECK-LABEL: test_get_num_warps
+    # CHECK: tt.func private @{{.*}}print_num_warps
+    # CHECK-NEXT arith.constant 4 : i32
+
+    # CHECK: tt.func private @{{.*}}print_num_warps{{.*}}NW1
+    # CHECK-NEXT arith.constant 1 : i32
+
+    # CHECK: tt.func private @{{.*}}print_num_warps{{.*}}NW2
+    # CHECK-NEXT arith.constant 2 : i32
+
+    # CHECK: tt.func private @{{.*}}print_num_warps{{.*}}NW8
+    # CHECK-NEXT arith.constant 8 : i32
+    print_num_warps()
+    ttgl.warp_specialize((), print_num_warps, (), [print_num_warps, print_num_warps, print_num_warps], [1, 2, 8],
+                         [24, 24, 24])

python/triton/experimental/gluon/language/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@
     expand_dims,
     full,
     gather,
+    num_warps,
    histogram,
    inline_asm_elementwise,
    join,

python/triton/experimental/gluon/language/_core.py

Lines changed: 8 additions & 0 deletions
@@ -501,6 +501,14 @@ def warp_specialize(default_args, default_partition, worker_args, worker_partiti
                                   worker_num_regs, _generator)


+@builtin
+def num_warps(_semantic=None, _generator=None):
+    """
+    Returns the number of warps that execute the current context, including in warp-specialized regions.
+    """
+    return _semantic.num_warps(_generator)
+
+
 @builtin
 def thread_barrier(_semantic=None):
     """

python/triton/experimental/gluon/language/_semantic.py

Lines changed: 6 additions & 0 deletions
@@ -427,3 +427,9 @@ def warp_specialize(self, default_args, default_partition, worker_args, worker_p
         if default_results is None:
             return
         return tuple(unflatten_ir_values(mlir_results, [r.type for r in default_results]))
+
+    def num_warps(self, generator):
+        if generator.caller_context is not None:
+            assert isinstance(generator.caller_context, GluonCallerContext)
+            return ttgl.constexpr(generator.caller_context.num_warps)
+        return ttgl.constexpr(self.builder.options.num_warps)

test/Conversion/amd/tritongpu_to_llvm.mlir

Lines changed: 24 additions & 0 deletions
@@ -510,6 +510,30 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n

 // -----

+// CHECK-LABEL: padded_shared_layout_subslice_load_store
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 4], warpsPerCTA = [2, 1], order = [1, 0]}>
+#shared = #ttg.padded_shared<[32:+4] {order = [1, 0], shape = [32, 32]}>
+#smem = #ttg.shared_memory
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [2, 1], instrShape = [16, 16], isTransposed = true}>
+module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @padded_shared_layout_subslice_load_store(%arg0: tensor<32x32xf16, #blocked>) {
+    // CHECK: llvm.store {{.*}} : vector<8xf16>, !llvm.ptr<3>
+    // CHECK-NOT: llvm.store
+    %0 = ttg.local_alloc %arg0 : (tensor<32x32xf16, #blocked>) -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable>
+    %1 = ttg.memdesc_subslice %0 [16, 0] : !ttg.memdesc<32x32xf16, #shared, #smem, mutable> -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable, 32x32>
+    // CHECK-COUNT-2: llvm.load {{.*}} : !llvm.ptr<3> -> vector<4xf16>
+    // CHECK-NOT: llvm.load
+    %2 = ttg.local_load %1: !ttg.memdesc<16x32xf16, #shared, #smem, mutable, 32x32> -> tensor<16x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
+    // CHECK-COUNT-2: llvm.store {{.*}} : vector<4xf16>, !llvm.ptr<3>
+    // CHECK-NOT: llvm.store
+    ttg.local_store %2, %1 : tensor<16x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable, 32x32>
+    tt.return
+  }
+}
+
+// -----
+
 // GFX950-LABEL: reduce_32x32
 // GFX950: llvm.call_intrinsic "llvm.amdgcn.permlane32.swap"
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
