Skip to content

Commit b9eda84

Browse files
authored
[BACKEND] Limit vector size to scratch size for convert_layout (#5746)
Without this, we can get into a situation where the vector loads/stores would exceed the size of the scratch buffer (and trigger an assertion). Fixes #5745.
1 parent 0753712 commit b9eda84

File tree

2 files changed

+26
-0
lines changed

2 files changed

+26
-0
lines changed

lib/Analysis/Allocation.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,12 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
123123

124124
std::tie(scratchConfig.inVec, scratchConfig.outVec) =
125125
getScratchCvtInOutVecLengths(srcTy, dstTy);
126+
// We can't write a longer vector than the shape of shared memory.
127+
// This shape might be smaller than the tensor shape in case we decided to
128+
// do the conversion in multiple iterations.
129+
unsigned contiguousShapeDim = scratchConfig.repShape[scratchConfig.order[0]];
130+
scratchConfig.inVec = std::min(scratchConfig.inVec, contiguousShapeDim);
131+
scratchConfig.outVec = std::min(scratchConfig.outVec, contiguousShapeDim);
126132

127133
// No padding is required if the tensor is 1-D, or if all dimensions except
128134
// the first accessed dimension have a size of 1.

test/Conversion/tritongpu_to_llvm.mlir

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1266,6 +1266,26 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
12661266

12671267
// -----
12681268

1269+
// Regression test for https://github.com/triton-lang/triton/issues/5745
1270+
#linear = #ttg.linear<{register = [[0, 1], [0, 2]], lane = [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]], warp = [[1, 0], [2, 0], [4, 0]], block = []}>
1271+
#linear1 = #ttg.linear<{register = [[0, 2]], lane = [[0, 0], [0, 0], [0, 0], [0, 0], [1, 0]], warp = [[2, 0], [4, 0], [0, 1]], block = []}>
1272+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
1273+
// CHECK-LABEL: linear_layout_with_multiple_iterations
1274+
tt.func @linear_layout_with_multiple_iterations(%src: tensor<8x4xbf16, #linear>) {
1275+
%cvt = ttg.convert_layout %src : tensor<8x4xbf16, #linear> -> tensor<8x4xbf16, #linear1>
1276+
// CHECK: inline_asm{{.*}}st.shared.v2
1277+
// CHECK: nvvm.barrier0
1278+
// CHECK: llvm.load
1279+
// CHECK: nvvm.barrier0
1280+
// CHECK: inline_asm{{.*}}st.shared.v2
1281+
// CHECK: nvvm.barrier0
1282+
// CHECK: llvm.load
1283+
tt.return
1284+
}
1285+
}
1286+
1287+
// -----
1288+
12691289
#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
12701290
#shared = #ttg.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
12711291
#mma = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [2, 2], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [16, 8]}>

0 commit comments

Comments (0)