Skip to content

Commit 2f32ce4

Browse files
[LoadOpToBlockIOConversion] Generalize the fix of baseHeight < tileHeight (#5267)
This PR generalizes the fix for the `baseHeight < tileHeight` case in `LoadOpToBlockIOConversion`: the `umin` operations that clamped values to `baseHeight - 1` are replaced with `urem` (unsigned remainder) by `baseHeight`, so both the Y offset and the extracted-element index wrap around modulo `baseHeight` instead of being clamped.
Signed-off-by: Whitney Tsang <[email protected]>
1 parent 5e96f82 commit 2f32ce4

File tree

1 file changed

+4
-5
lines changed

1 file changed

+4
-5
lines changed

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1158,9 +1158,9 @@ struct LoadOpToBlockIOConversion
11581158
for (int repM = 0; repM < repCluster[0]; ++repM) {
11591159

11601160
Value offsetY =
1161-
b.umin(b.sub(baseHeight, b.i32_val(1)),
1162-
b.add(warpId0Offset, b.i32_val(m * replicaStride[0] +
1163-
repM * tileHeight)));
1161+
b.urem(b.add(warpId0Offset, b.i32_val(m * replicaStride[0] +
1162+
repM * tileHeight)),
1163+
baseHeight);
11641164
for (int repN = 0; repN < repCluster[1]; ++repN) {
11651165
Value offsetX =
11661166
b.add(warpId1Offset,
@@ -1193,8 +1193,7 @@ struct LoadOpToBlockIOConversion
11931193

11941194
for (size_t i = 0; i < elemsPerLane; ++i) {
11951195
Value loaded = b.extract_element(
1196-
eltTy, ret,
1197-
b.umin(b.sub(baseHeight, b.i32_val(1)), b.i32_val(i)));
1196+
eltTy, ret, b.urem(b.i32_val(i), baseHeight));
11981197
unpackedLoadedVals.push_back(loaded);
11991198
}
12001199
}

0 commit comments

Comments
 (0)