Skip to content

Commit 852c0f4

Browse files
committed
[GPU] Address PR #23365 review comments.
* Use gfx950 target and dma_sizes = [32, 128] in tests.
* Use explicit tensor::PadOp type instead of auto.
* Add trailing periods to comments.
1 parent 753cd7f commit 852c0f4

File tree

4 files changed

+20
-53
lines changed

4 files changed

+20
-53
lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -319,7 +319,7 @@ static LogicalResult createDMAInForall(scf::ForallOp threadForallOp,
319319
// After tiling, the input is typically:
320320
// tensor.extract_slice %padded[...] [...] [1, 1]
321321
// We need to trace through extract_slice to find if source is tensor.pad.
322-
if (auto pad = traceToTensorPad(input)) {
322+
if (tensor::PadOp pad = traceToTensorPad(input)) {
323323
// Verify pad constraints: low padding must be all zeros, pad value must
324324
// be 0.
325325
bool validPad = true;
@@ -892,7 +892,7 @@ struct GPUConvertToCoalescedDMAPass final
892892
// Check if this is a tensor.pad fusion case.
893893
bool isPadFusion = false;
894894
if (auto copyOp = dyn_cast<linalg::CopyOp>(op.getOperation())) {
895-
if (auto pad = traceToTensorPad(copyOp.getInputs()[0])) {
895+
if (tensor::PadOp pad = traceToTensorPad(copyOp.getInputs()[0])) {
896896
// Check if padding exists (non-zero low/high pad).
897897
for (auto [low, high] :
898898
llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {

compiler/src/iree/compiler/Codegen/Common/GPU/test/amdgpu_lower_coalesced_dma_to_gather_lds.mlir

Lines changed: 12 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -1091,25 +1091,22 @@ func.func @lower_coalesced_dma_with_in_bounds(
10911091
// - 64 lanes (one subgroup)
10921092
// - in_bounds = [false, true]: K-dim may OOB (last tile 121 % 4 = 1), N-dim is aligned
10931093
//
1094-
// With 64 lanes and 4x64 dest shape:
1095-
// - Elements per lane = 64 / 64 = 1 (each lane reads 1 f32)
1094+
// With 64 lanes, 4x64 dest shape, and dma_sizes = [32, 128]:
1095+
// - Elements per lane = 256 / 64 = 4 (each lane reads 4xf32 = 128 bits)
10961096
// - Delinearization basis = (4, 64)
1097-
// - 4 transfers per lane (one per row)
1098-
//
1099-
// This verifies correct row access pattern: all 4 rows (0-3) are accessed,
1100-
// not just row 0 repeated 4 times (which was the bug before the fix).
1097+
// - 1 transfer covers all 256 elements
11011098

11021099
#executable_target_rocm_hsaco_fb_unaligned = #hal.executable.target<"rocm",
11031100
"rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
1104-
arch = "gfx942", features = "", wgp = <
1101+
arch = "gfx950", features = "", wgp = <
11051102
compute = fp32, storage = b32, subgroup = shuffle, dot = none, mma = [],
11061103
subgroup_size_choices = [64, 64],
11071104
max_workgroup_sizes = [1024, 1024, 1024],
11081105
max_thread_count_per_workgroup = 1024,
11091106
max_workgroup_memory_bytes = 65536,
11101107
max_workgroup_counts = [2147483647, 2147483647, 2147483647],
11111108
max_load_instruction_bits = 128, simds_per_wgp = 4,
1112-
vgpr_space_bits = 8192, dma_sizes = [32]>>}>
1109+
vgpr_space_bits = 8192, dma_sizes = [32, 128]>>}>
11131110

11141111
#translation_64_unaligned = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
11151112

@@ -1124,12 +1121,12 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
11241121
translation_info = #translation_64_unaligned} {
11251122
// CHECK: scf.forall (%[[LANE_ID:[a-zA-Z0-9]+]]) in (64)
11261123
scf.forall (%arg6) in (64) {
1127-
// Each lane reads 1 element (64 elements / 64 lanes = 1).
1128-
// CHECK: %[[C1:[a-zA-Z0-9_]+]] = arith.constant 1 : index
1129-
// CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C1]]
1124+
// Each lane reads 4 elements (256 elements / 64 lanes = 4).
1125+
// CHECK: %[[C4:[a-zA-Z0-9_]+]] = arith.constant 4 : index
1126+
// CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C4]]
11301127
//
1131-
// 4 transfers with delinearization basis (4, 64):
1132-
// Transfer 1: linearOffset = 0, accesses row 0
1128+
// 1 transfer with delinearization basis (4, 64):
1129+
// Transfer 1: linearOffset = 0
11331130
// CHECK: %[[C0:.+]] = arith.constant 0 : index
11341131
// CHECK: %[[SRC_LIN0:.+]] = arith.addi %[[C0]], %[[LANE_OFFSET]]
11351132
// CHECK: %[[SRC_DELIN0:.+]]:2 = affine.delinearize_index %[[SRC_LIN0]] into (4, 64)
@@ -1138,37 +1135,7 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
11381135
// CHECK: %[[FALSE0:.+]] = arith.constant false
11391136
// CHECK: %[[DIM0:.+]] = memref.dim %[[SRC]], %{{.+}}
11401137
// CHECK: %[[FIXED0:.+]] = arith.select %[[FALSE0]], %[[DIM0]], %[[SRC_DELIN0]]#0
1141-
// CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED0]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<1xf32>
1142-
//
1143-
// Transfer 2: linearOffset = 64, accesses row 1
1144-
// CHECK: %[[C64:.+]] = arith.constant 64 : index
1145-
// CHECK: %[[SRC_LIN64:.+]] = arith.addi %[[C64]], %[[LANE_OFFSET]]
1146-
// CHECK: %[[SRC_DELIN64:.+]]:2 = affine.delinearize_index %[[SRC_LIN64]] into (4, 64)
1147-
// CHECK: %[[DST_DELIN64:.+]]:2 = affine.delinearize_index %[[C64]] into (4, 64)
1148-
// CHECK: %[[FALSE1:.+]] = arith.constant false
1149-
// CHECK: %[[DIM1:.+]] = memref.dim %[[SRC]], %{{.+}}
1150-
// CHECK: %[[FIXED1:.+]] = arith.select %[[FALSE1]], %[[DIM1]], %[[SRC_DELIN64]]#0
1151-
// CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED1]], %[[SRC_DELIN64]]#1], %[[DST]][%[[DST_DELIN64]]#0, %[[DST_DELIN64]]#1] : vector<1xf32>
1152-
//
1153-
// Transfer 3: linearOffset = 128, accesses row 2
1154-
// CHECK: %[[C128:.+]] = arith.constant 128 : index
1155-
// CHECK: %[[SRC_LIN128:.+]] = arith.addi %[[C128]], %[[LANE_OFFSET]]
1156-
// CHECK: %[[SRC_DELIN128:.+]]:2 = affine.delinearize_index %[[SRC_LIN128]] into (4, 64)
1157-
// CHECK: %[[DST_DELIN128:.+]]:2 = affine.delinearize_index %[[C128]] into (4, 64)
1158-
// CHECK: %[[FALSE2:.+]] = arith.constant false
1159-
// CHECK: %[[DIM2:.+]] = memref.dim %[[SRC]], %{{.+}}
1160-
// CHECK: %[[FIXED2:.+]] = arith.select %[[FALSE2]], %[[DIM2]], %[[SRC_DELIN128]]#0
1161-
// CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED2]], %[[SRC_DELIN128]]#1], %[[DST]][%[[DST_DELIN128]]#0, %[[DST_DELIN128]]#1] : vector<1xf32>
1162-
//
1163-
// Transfer 4: linearOffset = 192, accesses row 3
1164-
// CHECK: %[[C192:.+]] = arith.constant 192 : index
1165-
// CHECK: %[[SRC_LIN192:.+]] = arith.addi %[[C192]], %[[LANE_OFFSET]]
1166-
// CHECK: %[[SRC_DELIN192:.+]]:2 = affine.delinearize_index %[[SRC_LIN192]] into (4, 64)
1167-
// CHECK: %[[DST_DELIN192:.+]]:2 = affine.delinearize_index %[[C192]] into (4, 64)
1168-
// CHECK: %[[FALSE3:.+]] = arith.constant false
1169-
// CHECK: %[[DIM3:.+]] = memref.dim %[[SRC]], %{{.+}}
1170-
// CHECK: %[[FIXED3:.+]] = arith.select %[[FALSE3]], %[[DIM3]], %[[SRC_DELIN192]]#0
1171-
// CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED3]], %[[SRC_DELIN192]]#1], %[[DST]][%[[DST_DELIN192]]#0, %[[DST_DELIN192]]#1] : vector<1xf32>
1138+
// CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED0]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<4xf32>
11721139
// CHECK-NOT: amdgpu.gather_to_lds
11731140
// CHECK-NOT: iree_gpu.coalesced_gather_dma
11741141
iree_gpu.coalesced_gather_dma %source into %dest lane(%arg6) in_bounds [false, true] :
@@ -1199,7 +1166,7 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
11991166
max_workgroup_memory_bytes = 65536,
12001167
max_workgroup_counts = [2147483647, 2147483647, 2147483647],
12011168
max_load_instruction_bits = 128, simds_per_wgp = 4,
1202-
vgpr_space_bits = 8192, dma_sizes = [32]>>}>
1169+
vgpr_space_bits = 8192, dma_sizes = [32, 128]>>}>
12031170

12041171
#translation_32_pad = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32>
12051172

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -550,19 +550,19 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
550550
// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4x124xf16>
551551
func.func @copy_with_tensor_pad_unaligned_row(%source: tensor<65x121xf16>, %init: tensor<4x124xf16>, %off: index, %sz: index, %high_m: index) -> tensor<4x124xf16>
552552
attributes {hal.executable.target = #exec_target_pad_unaligned, translation_info = #translation_pad_unaligned} {
553-
// Extract a dynamic slice: tensor<?x121xf16>
554-
// Row size = 121 * 2 bytes = 242 bytes, NOT 4-byte aligned
553+
// Extract a dynamic slice: tensor<?x121xf16>.
554+
// Row size = 121 * 2 bytes = 242 bytes, NOT 4-byte aligned.
555555
%extracted = tensor.extract_slice %source[%off, 0] [%sz, 121] [1, 1]
556556
: tensor<65x121xf16> to tensor<?x121xf16>
557557

558-
// Pad to static size
558+
// Pad to static size.
559559
%cst = arith.constant 0.0 : f16
560560
%padded = tensor.pad %extracted low[0, 0] high[%high_m, 3] {
561561
^bb0(%arg0: index, %arg1: index):
562562
tensor.yield %cst : f16
563563
} : tensor<?x121xf16> to tensor<4x124xf16>
564564

565-
// Copy from padded tensor
565+
// Copy from padded tensor.
566566
%result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
567567
ins(%padded : tensor<4x124xf16>)
568568
outs(%init : tensor<4x124xf16>) -> tensor<4x124xf16>

compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BufferizationInterfaces.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -384,10 +384,10 @@ struct CoalescedGatherDMAOpBufferizationInterface
384384
// terminator (not inside the in_parallel region which will be removed).
385385
auto inParallelOp = gatherOp->getParentOfType<scf::InParallelOp>();
386386
if (inParallelOp) {
387-
// Insert before the in_parallel terminator (in the forall body)
387+
// Insert before the in_parallel terminator (in the forall body).
388388
rewriter.setInsertionPoint(inParallelOp);
389389
} else {
390-
// Not in in_parallel, just insert at current location
390+
// Not in in_parallel, just insert at current location.
391391
rewriter.setInsertionPoint(gatherOp);
392392
}
393393

0 commit comments

Comments (0)