Skip to content

Commit 24d1821

Browse files
lialanYour Name
authored andcommitted
Address some comments.
1 parent faf553a commit 24d1821

File tree

3 files changed

+122
-3
lines changed

3 files changed

+122
-3
lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPULowerCoalescedDMAToGatherLDS.cpp

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,8 @@ struct LowerCoalescedGatherDMAPattern final
304304
}
305305

306306
emitTransfers(rewriter, loc, source, dest, destShape, numLinearDims,
307-
elementType, indices, segments, segmentLaneOffsets);
307+
elementType, indices, segments, segmentLaneOffsets,
308+
dmaOp.getInBounds());
308309

309310
rewriter.eraseOp(dmaOp);
310311
return success();
@@ -337,7 +338,8 @@ struct LowerCoalescedGatherDMAPattern final
337338
Value dest, ArrayRef<int64_t> destShape,
338339
int64_t numLinearDims, Type elementType,
339340
OperandRange indices, ArrayRef<TransferSegment> segments,
340-
ArrayRef<Value> segmentLaneOffsets) const {
341+
ArrayRef<Value> segmentLaneOffsets,
342+
std::optional<ArrayAttr> inBoundsAttr) const {
341343
int64_t destRank = destShape.size();
342344
int64_t numOuterDims = destRank - numLinearDims;
343345
LDBG() << "Emitting transfers: " << numOuterDims << " outer dims, "
@@ -400,6 +402,61 @@ struct LowerCoalescedGatherDMAPattern final
400402
auto [srcIndices, dstIndices] = generateGatherIndices(
401403
rewriter, loc, srcDimOffsets, dstDimOffsets, indices);
402404

405+
// Raw buffer OOB clamping is 1D (linear): it returns 0 only when the
406+
// byte offset >= total buffer size. For non-outermost dimensions,
407+
// an OOB index wraps into the next row instead of returning 0.
408+
// Fix: when any non-outermost source index exceeds its dimension,
409+
// replace the outermost index with sourceShape[0] to force the
410+
// linearized offset past the buffer end → hardware returns 0.
411+
if (inBoundsAttr) {
412+
auto sourceType = cast<MemRefType>(source.getType());
413+
ArrayRef<int64_t> sourceShape = sourceType.getShape();
414+
Value anyNonOutermostOOB;
415+
416+
for (int64_t dim = 1; dim < sourceType.getRank(); ++dim) {
417+
if (dim >= static_cast<int64_t>(inBoundsAttr->size())) {
418+
break;
419+
}
420+
bool dimInBounds =
421+
cast<BoolAttr>((*inBoundsAttr)[dim]).getValue();
422+
if (dimInBounds) {
423+
continue;
424+
}
425+
426+
Value dimSize;
427+
if (ShapedType::isDynamic(sourceShape[dim])) {
428+
dimSize = memref::DimOp::create(rewriter, loc, source, dim);
429+
} else {
430+
dimSize = arith::ConstantIndexOp::create(rewriter, loc,
431+
sourceShape[dim]);
432+
}
433+
434+
Value isOOB = arith::CmpIOp::create(rewriter, loc,
435+
arith::CmpIPredicate::uge,
436+
srcIndices[dim], dimSize);
437+
438+
if (anyNonOutermostOOB) {
439+
anyNonOutermostOOB = arith::OrIOp::create(
440+
rewriter, loc, anyNonOutermostOOB, isOOB);
441+
} else {
442+
anyNonOutermostOOB = isOOB;
443+
}
444+
}
445+
446+
if (anyNonOutermostOOB) {
447+
Value oobOuterIdx;
448+
if (ShapedType::isDynamic(sourceShape[0])) {
449+
oobOuterIdx = memref::DimOp::create(rewriter, loc, source, 0);
450+
} else {
451+
oobOuterIdx = arith::ConstantIndexOp::create(rewriter, loc,
452+
sourceShape[0]);
453+
}
454+
srcIndices[0] =
455+
arith::SelectOp::create(rewriter, loc, anyNonOutermostOOB,
456+
oobOuterIdx, srcIndices[0]);
457+
}
458+
}
459+
403460
amdgpu::GatherToLDSOp::create(rewriter, loc, source, srcIndices, dest,
404461
dstIndices,
405462
TypeAttr::get(transferType));

compiler/src/iree/compiler/Codegen/Common/GPU/test/amdgpu_lower_coalesced_dma_to_gather_lds.mlir

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,3 +1164,63 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
11641164
} {mapping = [#gpu.thread<linear_dim_0>]}
11651165
return
11661166
}
1167+
1168+
// -----
1169+
1170+
// Test: Non-outermost dimension padding with in_bounds = [false, false].
1171+
// Source: 4x6, dest: 4x8. Dim 1 has padding (6 → 8).
1172+
// Raw buffer OOB is linear/1D, so for non-outermost dim OOB, we must
1173+
// replace the outermost index with sourceShape[0] to force hardware OOB.
1174+
//
1175+
// Without the fix: reading at [0, 6] computes a byte offset within the
1176+
// buffer and wraps to [1, 0] instead of returning 0.
1177+
// With the fix: when srcIndices[1] >= 6, srcIndices[0] is replaced with 4
1178+
// (source dim 0 size), guaranteeing linear offset >= buffer size → returns 0.
1179+
1180+
#executable_target_rocm_hsaco_fb_pad = #hal.executable.target<"rocm",
1181+
"rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
1182+
arch = "gfx950", features = "", wgp = <
1183+
compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [32, 32],
1184+
max_workgroup_sizes = [1024, 1024, 1024],
1185+
max_thread_count_per_workgroup = 1024,
1186+
max_workgroup_memory_bytes = 65536,
1187+
max_workgroup_counts = [2147483647, 2147483647, 2147483647],
1188+
max_load_instruction_bits = 128, simds_per_wgp = 4,
1189+
vgpr_space_bits = 8192, dma_sizes = [32]>>}>
1190+
1191+
#translation_32_pad = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32>
1192+
1193+
// CHECK-LABEL: func.func @gather_dma_non_outermost_oob_check
1194+
// CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]: memref<4x6xf32, #amdgpu.address_space<fat_raw_buffer>>
1195+
// CHECK-SAME: %[[DST:[a-zA-Z0-9]+]]: memref<4x8xf32, #gpu.address_space<workgroup>>
1196+
func.func @gather_dma_non_outermost_oob_check(
1197+
%source: memref<4x6xf32, #amdgpu.address_space<fat_raw_buffer>>,
1198+
%dest: memref<4x8xf32, #gpu.address_space<workgroup>>)
1199+
attributes {
1200+
hal.executable.target = #executable_target_rocm_hsaco_fb_pad,
1201+
translation_info = #translation_32_pad} {
1202+
// CHECK: scf.forall (%[[LANE_ID:[a-zA-Z0-9]+]]) in (32)
1203+
scf.forall (%arg6) in (32) {
1204+
// CHECK: %[[C1:[a-zA-Z0-9_]+]] = arith.constant 1
1205+
// CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C1]]
1206+
//
1207+
// Transfer 1: linearOffset = 0
1208+
// CHECK: %[[C0:.+]] = arith.constant 0 : index
1209+
// CHECK: %[[SRC_LIN0:.+]] = arith.addi %[[C0]], %[[LANE_OFFSET]]
1210+
// CHECK: %[[SRC_DELIN0:.+]]:2 = affine.delinearize_index %[[SRC_LIN0]] into (4, 8)
1211+
// CHECK: %[[DST_DELIN0:.+]]:2 = affine.delinearize_index %[[C0]] into (4, 8)
1212+
//
1213+
// Bounds check: compare srcIndices[1] >= 6 (source dim 1 size)
1214+
// CHECK: %[[C6:.+]] = arith.constant 6 : index
1215+
// CHECK: %[[OOB:.+]] = arith.cmpi uge, %[[SRC_DELIN0]]#1, %[[C6]] : index
1216+
// Replace outermost index with 4 (source dim 0 size) to force hardware OOB
1217+
// CHECK: %[[C4_OOB:.+]] = arith.constant 4 : index
1218+
// CHECK: %[[FIXED_IDX:.+]] = arith.select %[[OOB]], %[[C4_OOB]], %[[SRC_DELIN0]]#0 : index
1219+
// CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED_IDX]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<1xf32>
1220+
// CHECK-NOT: iree_gpu.coalesced_gather_dma
1221+
iree_gpu.coalesced_gather_dma %source into %dest lane(%arg6) in_bounds [false, false] :
1222+
memref<4x6xf32, #amdgpu.address_space<fat_raw_buffer>>,
1223+
memref<4x8xf32, #gpu.address_space<workgroup>>, index
1224+
} {mapping = [#gpu.thread<linear_dim_0>]}
1225+
return
1226+
}

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,9 @@ LogicalResult CoalescedGatherDMAOp::verify() {
339339
}
340340

341341
// If in_bounds is present and this dimension allows OOB (in_bounds=false),
342-
// skip the size matching check - hardware OOB returns 0 for padding.
342+
// skip the size matching check. For non-outermost dimensions, the lowering
343+
// adds explicit bounds checks since raw buffer OOB only provides 1D
344+
// (linear) clamping, not per-dimension clamping.
343345
if (inBoundsAttr) {
344346
auto inBoundsArray = *inBoundsAttr;
345347
if (dim < inBoundsArray.size()) {

0 commit comments

Comments
 (0)