
Commit 6beab38

Adding DWORD range check for bounds.
1 parent efd0e54 commit 6beab38

File tree

2 files changed: +70 -0 lines changed


compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp

Lines changed: 20 additions & 0 deletions
@@ -340,6 +340,26 @@ static LogicalResult createDMAInForall(scf::ForallOp threadForallOp,
     // This is the tensor.extract_slice result (e.g., tensor<?x64xf32>).
     source = pad.getSource();
 
+    // Check if source tensor's innermost row size is DWORD (4-byte)
+    // aligned. On AMD CDNA, per-component range checking is performed for
+    // each DWORD. If a DWORD is partially out-of-bounds, the entire DWORD
+    // returns zero, causing incorrect results. Additionally, partial OOB
+    // triggers the slow path with multi-cycling and instruction issue
+    // penalties.
+    auto sourceType = cast<RankedTensorType>(source.getType());
+    int64_t innermostDim = sourceType.getShape().back();
+    if (!ShapedType::isDynamic(innermostDim)) {
+      Type elemType = sourceType.getElementType();
+      int64_t elemBytes = elemType.getIntOrFloatBitWidth() / 8;
+      int64_t rowBytes = innermostDim * elemBytes;
+      if (rowBytes % 4 != 0) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Skipping DMA: row size " << rowBytes
+                   << " bytes not DWORD-aligned (slow path)\n");
+        return failure();
+      }
+    }
+
     // Compute in_bounds based on whether padding was added per dimension.
     for (auto [low, high] :
          llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
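
For illustration only, not part of the commit: a minimal standalone C++ sketch of the row-size arithmetic behind the check above. The helper name isRowDwordAligned is hypothetical; it only reproduces the rowBytes % 4 test and shows why a 121-element f16 row (as in the new test below) is rejected while, say, a 64-element f32 row passes.

// Hypothetical standalone sketch (not IREE code): reproduces the
// rowBytes % 4 check from the hunk above for two element types.
#include <cstdint>
#include <cstdio>

// A row of `innermostDim` elements of an `elemBits`-bit type is DWORD
// (4-byte) aligned when its byte size is a multiple of 4.
static bool isRowDwordAligned(int64_t innermostDim, int64_t elemBits) {
  int64_t rowBytes = innermostDim * (elemBits / 8);
  return rowBytes % 4 == 0;
}

int main() {
  // tensor<?x121xf16>: 121 * 2 = 242 bytes -> unaligned, DMA is skipped.
  std::printf("121 x f16: %s\n",
              isRowDwordAligned(121, 16) ? "aligned" : "unaligned");
  // tensor<?x64xf32>: 64 * 4 = 256 bytes -> aligned, DMA can proceed.
  std::printf("64 x f32:  %s\n",
              isRowDwordAligned(64, 32) ? "aligned" : "unaligned");
  return 0;
}

Because the check returns failure(), the rewrite pattern simply does not apply in the unaligned case, leaving the original tensor.pad + linalg.copy untouched, which is what the new test below verifies.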

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir

Lines changed: 50 additions & 0 deletions
@@ -526,3 +526,53 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
 
   return %result : tensor<4x64xf32>
 }
+
+// -----
+
+// Test: tensor.pad fusion bails out when source row size is not DWORD-aligned.
+// On AMD CDNA, per-component range checking is performed for each DWORD.
+// If a DWORD is partially out-of-bounds, the entire DWORD returns zero,
+// causing incorrect results. We bail out to avoid the slow path.
+
+#gpu_target_pad_unaligned = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = shuffle,
+  max_load_instruction_bits = 128, subgroup_size_choices = [64],
+  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+  dma_sizes = [32, 128]
+>>
+
+#exec_target_pad_unaligned = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_pad_unaligned}>
+#translation_pad_unaligned = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>
+
+// CHECK-LABEL: func.func @copy_with_tensor_pad_unaligned_row
+//  CHECK-SAME:   %[[SRC:[a-zA-Z0-9]+]]: tensor<65x121xf16>
+//  CHECK-SAME:   %[[INIT:[a-zA-Z0-9]+]]: tensor<4x124xf16>
+func.func @copy_with_tensor_pad_unaligned_row(%source: tensor<65x121xf16>, %init: tensor<4x124xf16>, %off: index, %sz: index, %high_m: index) -> tensor<4x124xf16>
+    attributes {hal.executable.target = #exec_target_pad_unaligned, translation_info = #translation_pad_unaligned} {
+  // Extract a dynamic slice: tensor<?x121xf16>
+  // Row size = 121 * 2 bytes = 242 bytes, NOT 4-byte aligned
+  %extracted = tensor.extract_slice %source[%off, 0] [%sz, 121] [1, 1]
+      : tensor<65x121xf16> to tensor<?x121xf16>
+
+  // Pad to static size
+  %cst = arith.constant 0.0 : f16
+  %padded = tensor.pad %extracted low[0, 0] high[%high_m, 3] {
+  ^bb0(%arg0: index, %arg1: index):
+    tensor.yield %cst : f16
+  } : tensor<?x121xf16> to tensor<4x124xf16>
+
+  // Copy from padded tensor
+  %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+      ins(%padded : tensor<4x124xf16>)
+      outs(%init : tensor<4x124xf16>) -> tensor<4x124xf16>
+
+  // Source row size (121 * 2 = 242 bytes) is not DWORD-aligned.
+  // Coalesced DMA bails out to avoid partial OOB in per-DWORD range checking.
+  // The linalg.copy should remain unchanged.
+  // CHECK: tensor.pad
+  // CHECK: linalg.copy
+  // CHECK-NOT: iree_gpu.coalesced_gather_dma
+
+  return %result : tensor<4x124xf16>
+}
