Skip to content

Commit f563608

Browse files
committed
[GPU] Reject DMA lowering for OOB padding without fat_raw_buffer source.
* Emit an error when in_bounds has OOB dimensions but the source memref lacks fat_raw_buffer address space, since hardware OOB clamping is unavailable without it. * Add lit test for the rejection case.
1 parent 1a0d0b7 commit f563608

File tree

2 files changed

+47
-0
lines changed

2 files changed

+47
-0
lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPULowerCoalescedDMAToGatherLDS.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,20 @@ struct LowerCoalescedGatherDMAPattern final
288288
}
289289
SmallVector<TransferSegment> segments = std::move(*segmentsOrFailure);
290290

291+
// OOB padding requires fat_raw_buffer for hardware OOB clamping.
292+
if (std::optional<ArrayAttr> inBounds = dmaOp.getInBounds()) {
293+
auto srcType = cast<MemRefType>(source.getType());
294+
if (!hasAMDGPUFatRawBufferAddressSpace(srcType)) {
295+
for (Attribute attr : *inBounds) {
296+
if (!cast<BoolAttr>(attr).getValue()) {
297+
dmaOp.emitOpError("in_bounds with OOB dimensions requires "
298+
"fat_raw_buffer address space on source");
299+
return failure();
300+
}
301+
}
302+
}
303+
}
304+
291305
// Set up for code generation.
292306
rewriter.setInsertionPoint(dmaOp);
293307
TypedValue<IndexType> laneId = dmaOp.getLane();

compiler/src/iree/compiler/Codegen/Common/GPU/test/amdgpu_lower_coalesced_dma_to_gather_lds.mlir

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1266,3 +1266,36 @@ func.func @gather_dma_inner_dim_oob_64x62(
12661266
} {mapping = [#gpu.thread<linear_dim_0>]}
12671267
return
12681268
}
1269+
1270+
// -----
1271+
1272+
// Test: in_bounds with OOB dimensions on non-fat_raw_buffer source should
1273+
// not be lowered (pattern fails because hardware OOB clamping is unavailable).
1274+
1275+
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm",
1276+
"rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
1277+
arch = "gfx950", features = "", wgp = <
1278+
compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [32, 32],
1279+
max_workgroup_sizes = [1024, 1024, 1024],
1280+
max_thread_count_per_workgroup = 1024,
1281+
max_workgroup_memory_bytes = 65536,
1282+
max_workgroup_counts = [2147483647, 2147483647, 2147483647],
1283+
max_load_instruction_bits = 128, simds_per_wgp = 4,
1284+
vgpr_space_bits = 8192, dma_sizes = [32, 128]>>}>
1285+
1286+
#translation_64 = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 32>
1287+
1288+
func.func @no_lower_oob_without_fat_raw_buffer(
1289+
%source: memref<2x128xf32>,
1290+
%dest: memref<4x128xf32, #gpu.address_space<workgroup>>)
1291+
attributes {hal.executable.target = #executable_target_rocm_hsaco_fb,
1292+
translation_info = #translation_64} {
1293+
scf.forall (%arg6) in (64) {
1294+
// expected-error @+2 {{in_bounds with OOB dimensions requires fat_raw_buffer address space on source}}
1295+
// expected-error @+1 {{failed to lower coalesced_gather_dma op}}
1296+
iree_gpu.coalesced_gather_dma %source into %dest lane(%arg6) in_bounds [false, true] :
1297+
memref<2x128xf32>,
1298+
memref<4x128xf32, #gpu.address_space<workgroup>>, index
1299+
} {mapping = [#gpu.thread<linear_dim_0>]}
1300+
return
1301+
}

0 commit comments

Comments
 (0)