@@ -1091,25 +1091,22 @@ func.func @lower_coalesced_dma_with_in_bounds(
 // - 64 lanes (one subgroup)
 // - in_bounds = [false, true]: K-dim may OOB (last tile 121 % 4 = 1), N-dim is aligned
 //
-// With 64 lanes and 4x64 dest shape:
-// - Elements per lane = 64 / 64 = 1 (each lane reads 1 f32)
+// With 64 lanes, 4x64 dest shape, and dma_sizes = [32, 128]:
+// - Elements per lane = 256 / 64 = 4 (each lane reads 4xf32 = 128 bits)
 // - Delinearization basis = (4, 64)
-// - 4 transfers per lane (one per row)
-//
-// This verifies correct row access pattern: all 4 rows (0-3) are accessed,
-// not just row 0 repeated 4 times (which was the bug before the fix).
+// - 1 transfer covers all 256 elements

 #executable_target_rocm_hsaco_fb_unaligned = #hal.executable.target<"rocm",
   "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
-  arch = "gfx942", features = "", wgp = <
+  arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle, dot = none, mma = [],
   subgroup_size_choices = [64, 64],
   max_workgroup_sizes = [1024, 1024, 1024],
   max_thread_count_per_workgroup = 1024,
   max_workgroup_memory_bytes = 65536,
   max_workgroup_counts = [2147483647, 2147483647, 2147483647],
   max_load_instruction_bits = 128, simds_per_wgp = 4,
-  vgpr_space_bits = 8192, dma_sizes = [32]>>}>
+  vgpr_space_bits = 8192, dma_sizes = [32, 128]>>}>

 #translation_64_unaligned = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>

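The updated comment compresses the key arithmetic. A quick sketch of it (illustrative Python; every number is taken from the test above) shows why adding the 128-bit entry to `dma_sizes` appears to be what lets the lowering use one wide transfer per lane:

```python
# Illustrative check of the per-lane arithmetic in the updated comment.
# All numbers come from the test: a 4x64 f32 destination, 64 lanes, and
# dma_sizes = [32, 128] (supported transfer widths in bits).

DEST_SHAPE = (4, 64)        # rows x cols of the LDS tile
LANES = 64                  # one subgroup
F32_BITS = 32
DMA_SIZES_BITS = [32, 128]  # from the target's wgp attributes

total_elements = DEST_SHAPE[0] * DEST_SHAPE[1]  # 256
elements_per_lane = total_elements // LANES     # 256 / 64 = 4
bits_per_lane = elements_per_lane * F32_BITS    # 4 x f32 = 128 bits

# The per-lane payload fits the widest supported DMA size, so a single
# gather_to_lds of vector<4xf32> per lane covers the whole tile.
assert bits_per_lane == max(DMA_SIZES_BITS)
assert elements_per_lane * LANES == total_elements  # 1 transfer total
```

With the old `dma_sizes = [32]`, each lane was capped at one f32 per transfer, hence the four per-row transfers the rest of this diff deletes.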
@@ -1124,12 +1121,12 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
     translation_info = #translation_64_unaligned} {
   // CHECK: scf.forall (%[[LANE_ID:[a-zA-Z0-9]+]]) in (64)
   scf.forall (%arg6) in (64) {
-    // Each lane reads 1 element (64 elements / 64 lanes = 1).
-    // CHECK: %[[C1:[a-zA-Z0-9_]+]] = arith.constant 1 : index
-    // CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C1]]
+    // Each lane reads 4 elements (256 elements / 64 lanes = 4).
+    // CHECK: %[[C4:[a-zA-Z0-9_]+]] = arith.constant 4 : index
+    // CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C4]]
     //
-    // 4 transfers with delinearization basis (4, 64):
-    // Transfer 1: linearOffset = 0, accesses row 0
+    // 1 transfer with delinearization basis (4, 64):
+    // Transfer 1: linearOffset = 0
     // CHECK: %[[C0:.+]] = arith.constant 0 : index
     // CHECK: %[[SRC_LIN0:.+]] = arith.addi %[[C0]], %[[LANE_OFFSET]]
     // CHECK: %[[SRC_DELIN0:.+]]:2 = affine.delinearize_index %[[SRC_LIN0]] into (4, 64)
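The CHECK lines above pin down the per-lane index computation. A small model of that math (illustrative Python mirroring the `arith.muli`, `arith.addi`, and `affine.delinearize_index` steps) confirms that the single transfer still touches every row of the 4x64 tile, which is the property the removed four-transfer checks used to establish:

```python
# Illustrative model of the lowered index math: each lane's linear offset
# is lane_id * 4 (arith.muli), added to the transfer's base offset 0
# (arith.addi), then split over the basis (4, 64) the same way
# affine.delinearize_index does.

def delinearize(linear, basis=(4, 64)):
    return divmod(linear, basis[1])  # -> (row, col)

# Lanes 0-15 land in row 0, 16-31 in row 1, ..., 48-63 in row 3, so one
# transfer covers all four rows of the tile.
rows_touched = {delinearize(0 + lane * 4)[0] for lane in range(64)}
assert rows_touched == {0, 1, 2, 3}

# A lane's 4-element read never straddles a row boundary: offsets are
# multiples of 4 and the row length 64 is divisible by 4.
assert all((lane * 4) % 64 + 4 <= 64 for lane in range(64))
```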
@@ -1138,37 +1135,7 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
     // CHECK: %[[FALSE0:.+]] = arith.constant false
     // CHECK: %[[DIM0:.+]] = memref.dim %[[SRC]], %{{.+}}
     // CHECK: %[[FIXED0:.+]] = arith.select %[[FALSE0]], %[[DIM0]], %[[SRC_DELIN0]]#0
-    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED0]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<1xf32>
-    //
-    // Transfer 2: linearOffset = 64, accesses row 1
-    // CHECK: %[[C64:.+]] = arith.constant 64 : index
-    // CHECK: %[[SRC_LIN64:.+]] = arith.addi %[[C64]], %[[LANE_OFFSET]]
-    // CHECK: %[[SRC_DELIN64:.+]]:2 = affine.delinearize_index %[[SRC_LIN64]] into (4, 64)
-    // CHECK: %[[DST_DELIN64:.+]]:2 = affine.delinearize_index %[[C64]] into (4, 64)
-    // CHECK: %[[FALSE1:.+]] = arith.constant false
-    // CHECK: %[[DIM1:.+]] = memref.dim %[[SRC]], %{{.+}}
-    // CHECK: %[[FIXED1:.+]] = arith.select %[[FALSE1]], %[[DIM1]], %[[SRC_DELIN64]]#0
-    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED1]], %[[SRC_DELIN64]]#1], %[[DST]][%[[DST_DELIN64]]#0, %[[DST_DELIN64]]#1] : vector<1xf32>
-    //
-    // Transfer 3: linearOffset = 128, accesses row 2
-    // CHECK: %[[C128:.+]] = arith.constant 128 : index
-    // CHECK: %[[SRC_LIN128:.+]] = arith.addi %[[C128]], %[[LANE_OFFSET]]
-    // CHECK: %[[SRC_DELIN128:.+]]:2 = affine.delinearize_index %[[SRC_LIN128]] into (4, 64)
-    // CHECK: %[[DST_DELIN128:.+]]:2 = affine.delinearize_index %[[C128]] into (4, 64)
-    // CHECK: %[[FALSE2:.+]] = arith.constant false
-    // CHECK: %[[DIM2:.+]] = memref.dim %[[SRC]], %{{.+}}
-    // CHECK: %[[FIXED2:.+]] = arith.select %[[FALSE2]], %[[DIM2]], %[[SRC_DELIN128]]#0
-    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED2]], %[[SRC_DELIN128]]#1], %[[DST]][%[[DST_DELIN128]]#0, %[[DST_DELIN128]]#1] : vector<1xf32>
-    //
-    // Transfer 4: linearOffset = 192, accesses row 3
-    // CHECK: %[[C192:.+]] = arith.constant 192 : index
-    // CHECK: %[[SRC_LIN192:.+]] = arith.addi %[[C192]], %[[LANE_OFFSET]]
-    // CHECK: %[[SRC_DELIN192:.+]]:2 = affine.delinearize_index %[[SRC_LIN192]] into (4, 64)
-    // CHECK: %[[DST_DELIN192:.+]]:2 = affine.delinearize_index %[[C192]] into (4, 64)
-    // CHECK: %[[FALSE3:.+]] = arith.constant false
-    // CHECK: %[[DIM3:.+]] = memref.dim %[[SRC]], %{{.+}}
-    // CHECK: %[[FIXED3:.+]] = arith.select %[[FALSE3]], %[[DIM3]], %[[SRC_DELIN192]]#0
-    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED3]], %[[SRC_DELIN192]]#1], %[[DST]][%[[DST_DELIN192]]#0, %[[DST_DELIN192]]#1] : vector<1xf32>
+    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED0]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<4xf32>
     // CHECK-NOT: amdgpu.gather_to_lds
     // CHECK-NOT: iree_gpu.coalesced_gather_dma
     iree_gpu.coalesced_gather_dma %source into %dest lane(%arg6) in_bounds [false, true] :
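The surviving `arith.select` is the in_bounds guard on the K-dim. A minimal sketch of its semantics, assuming the guard is meant to redirect an out-of-bounds row to the `memref.dim` value (an assumption about the lowering's intent, not stated in this diff):

```python
# Minimal sketch of the select in the CHECK lines above. arith.select
# returns its second operand when the condition is true and its third
# when false; here the condition is the constant `false`, so the
# delinearized row index flows through unchanged.

def fixed_row(guard: bool, dim0: int, delin_row: int) -> int:
    # %FIXED0 = arith.select %FALSE0, %DIM0, %SRC_DELIN0#0
    return dim0 if guard else delin_row

assert fixed_row(False, 4, 3) == 3  # folded guard: row used as-is
```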
@@ -1199,7 +1166,7 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
   max_workgroup_memory_bytes = 65536,
   max_workgroup_counts = [2147483647, 2147483647, 2147483647],
   max_load_instruction_bits = 128, simds_per_wgp = 4,
-  vgpr_space_bits = 8192, dma_sizes = [32]>>}>
+  vgpr_space_bits = 8192, dma_sizes = [32, 128]>>}>

 #translation_32_pad = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32>
