
Commit 64c5041

[GPU] Address PR #23365 review comments.

* Change gfx942 → gfx950 in gpu_convert_to_coalesced_dma tests.
* Add in_bounds semantics documentation to CoalescedGatherDMAOp.
* Remove hardware-specific references from op verifier comment.
* Rewrite misleading "ONE level of extract_slice" fallback comment.
* Add inner-dim padding OOB lowering test (64x62xf32 → 64x64xf32).
* Fix missing trailing periods on comments.
1 parent 7720ffa commit 64c5041

5 files changed, +100 -30 lines changed


compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp

Lines changed: 6 additions & 6 deletions
@@ -370,8 +370,8 @@ static LogicalResult createDMAInForall(scf::ForallOp threadForallOp,
     }
   }
 
-  // Fallback: original behavior without tensor.pad fusion.
-  // Only trace through ONE level of extract_slice (the immediate input).
+  // Fallback: no tensor.pad fusion. The input is an extract_slice from
+  // tiling; trace through it to get the actual source.
   if (!source) {
     if (auto extractSlice = input.getDefiningOp<tensor::ExtractSliceOp>()) {
       source = extractSlice.getSource();
@@ -506,19 +506,19 @@ struct ConvertPadFusionCopyToCoalescedDMA
 
   LogicalResult matchAndRewrite(linalg::CopyOp copyOp,
                                 PatternRewriter &rewriter) const override {
-    // Only match copies with use_global_load_dma config
+    // Only match copies with use_global_load_dma config.
     auto config = getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(copyOp);
     if (!config) {
       return failure();
     }
 
-    // Check if this is a tensor.pad fusion case
+    // Check if this is a tensor.pad fusion case.
    auto pad = traceToTensorPad(copyOp.getInputs()[0]);
    if (!pad) {
      return failure(); // Not a pad fusion case
    }
 
-    // Check if padding exists (non-zero low/high pad)
+    // Check if padding exists (non-zero low/high pad).
    bool hasPadding = false;
    for (auto [low, high] :
         llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
@@ -893,7 +893,7 @@ struct GPUConvertToCoalescedDMAPass final
     bool isPadFusion = false;
     if (auto copyOp = dyn_cast<linalg::CopyOp>(op.getOperation())) {
       if (auto pad = traceToTensorPad(copyOp.getInputs()[0])) {
-        // Check if padding exists (non-zero low/high pad)
+        // Check if padding exists (non-zero low/high pad).
         for (auto [low, high] :
              llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
           if (!isConstantIntValue(low, 0) || !isConstantIntValue(high, 0)) {
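
For reference, the hunks above call traceToTensorPad, whose body is not part of this diff. Below is a minimal, hypothetical sketch of that kind of tracing, assuming the standard MLIR tensor dialect APIs; it is an illustration only, not the pass's actual helper.

// Hypothetical sketch only: the real traceToTensorPad in
// GPUConvertToCoalescedDMA.cpp is not shown in this diff.
#include "mlir/Dialect/Tensor/IR/Tensor.h"

static mlir::tensor::PadOp traceToTensorPadSketch(mlir::Value input) {
  // Look through an extract_slice produced by tiling (if any) before
  // checking for a tensor.pad producer.
  if (auto extractSlice = input.getDefiningOp<mlir::tensor::ExtractSliceOp>())
    input = extractSlice.getSource();
  // Return the producing tensor.pad if there is one, or null otherwise.
  return input.getDefiningOp<mlir::tensor::PadOp>();
}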

compiler/src/iree/compiler/Codegen/Common/GPU/test/amdgpu_lower_coalesced_dma_to_gather_lds.mlir

Lines changed: 58 additions & 0 deletions
@@ -1224,3 +1224,61 @@ func.func @gather_dma_non_outermost_oob_check(
   } {mapping = [#gpu.thread<linear_dim_0>]}
   return
 }
+
+// -----
+
+// Test: Inner-dim padding OOB check with <64x62xf32> source padded to <64x64xf32>.
+// Only inner dim (dim 1) has padding: 62 → 64. in_bounds = [true, false].
+// Raw buffer OOB is 1D (linear): reading <4 x f32> at [0, 60] would compute a
+// linear offset within the buffer and wrap to [1, 0], [1, 1] instead of returning 0.
+// Fix: when srcIndices[1] >= 62, replace srcIndices[0] with 64 (past buffer end)
+// so the linearized offset exceeds buffer size → hardware returns 0.
+
+#executable_target_rocm_hsaco_fb_inner_pad = #hal.executable.target<"rocm",
+  "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
+  arch = "gfx950", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [64, 64],
+  max_workgroup_sizes = [1024, 1024, 1024],
+  max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536,
+  max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+  max_load_instruction_bits = 128, simds_per_wgp = 4,
+  vgpr_space_bits = 8192, dma_sizes = [32, 128]>>}>
+
+#translation_64_inner_pad = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
+
+// CHECK-LABEL: func.func @gather_dma_inner_dim_oob_64x62
+// CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]: memref<64x62xf32, #amdgpu.address_space<fat_raw_buffer>>
+// CHECK-SAME: %[[DST:[a-zA-Z0-9]+]]: memref<64x64xf32, #gpu.address_space<workgroup>>
+func.func @gather_dma_inner_dim_oob_64x62(
+    %source: memref<64x62xf32, #amdgpu.address_space<fat_raw_buffer>>,
+    %dest: memref<64x64xf32, #gpu.address_space<workgroup>>)
+  attributes {
+    hal.executable.target = #executable_target_rocm_hsaco_fb_inner_pad,
+    translation_info = #translation_64_inner_pad} {
+  // CHECK: scf.forall (%[[LANE_ID:[a-zA-Z0-9]+]]) in (64)
+  scf.forall (%arg6) in (64) {
+    // Each lane transfers vector<4xf32> (dma_sizes [128] = 128 bits = 4 x f32).
+    // CHECK: %[[C4:[a-zA-Z0-9_]+]] = arith.constant 4
+    // CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C4]]
+    //
+    // Transfer 1: linearOffset = 0
+    // CHECK: %[[C0:.+]] = arith.constant 0 : index
+    // CHECK: %[[SRC_LIN0:.+]] = arith.addi %[[C0]], %[[LANE_OFFSET]]
+    // CHECK: %[[SRC_DELIN0:.+]]:2 = affine.delinearize_index %[[SRC_LIN0]] into (64, 64)
+    // CHECK: %[[DST_DELIN0:.+]]:2 = affine.delinearize_index %[[C0]] into (64, 64)
+    //
+    // Bounds check: compare srcIndices[1] >= 62 (source inner dim size).
+    // CHECK: %[[C62:.+]] = arith.constant 62 : index
+    // CHECK: %[[OOB:.+]] = arith.cmpi uge, %[[SRC_DELIN0]]#1, %[[C62]] : index
+    // Replace outermost index with 64 (source dim 0 size) to force hardware OOB.
+    // CHECK: %[[C64_OOB:.+]] = arith.constant 64 : index
+    // CHECK: %[[FIXED_IDX:.+]] = arith.select %[[OOB]], %[[C64_OOB]], %[[SRC_DELIN0]]#0 : index
+    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED_IDX]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<4xf32>
+    // CHECK-NOT: iree_gpu.coalesced_gather_dma
+    iree_gpu.coalesced_gather_dma %source into %dest lane(%arg6) in_bounds [true, false] :
+      memref<64x62xf32, #amdgpu.address_space<fat_raw_buffer>>,
+      memref<64x64xf32, #gpu.address_space<workgroup>>, index
+  } {mapping = [#gpu.thread<linear_dim_0>]}
+  return
+}
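
To make the comment block in the new test concrete, here is a small standalone sketch (not part of the test or the lowering) of the linearized-offset arithmetic it describes for the 64x62 raw buffer; the numbers match the test, the program itself is illustrative only.

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t rows = 64, cols = 62;      // Source buffer shape (64x62xf32).
  const int64_t bufferElems = rows * cols; // 3968 elements total.

  // Unfixed: a vector<4xf32> read at [0, 60] linearizes to offsets 60..63.
  // The raw-buffer bounds check is 1D, so 60..63 < 3968 counts as in bounds
  // and the read wraps into row 1 instead of returning zero.
  const int64_t wrapped = 0 * cols + 60 + 3; // Last element offset: 63.
  std::printf("unfixed: offset %lld < %lld, wraps into next row\n",
              (long long)wrapped, (long long)bufferElems);

  // Fixed: when the inner index >= 62, the lowering substitutes 64 for the
  // outermost index, so the linearized offset lands past the buffer and the
  // hardware's OOB handling returns zero.
  const int64_t fixed = 64 * cols + 60; // 4028 >= 3968.
  std::printf("fixed:   offset %lld >= %lld, hardware returns zero\n",
              (long long)fixed, (long long)bufferElems);
  return 0;
}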

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir

Lines changed: 19 additions & 19 deletions
@@ -1,6 +1,6 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-convert-to-coalesced-dma,canonicalize))" %s --split-input-file | FileCheck %s
 
-#gpu_target_copy = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_copy = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [32],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -49,7 +49,7 @@ func.func @copy(%source: tensor<64x512xf32>, %init: tensor<64x512xf32>) -> tenso
 
 // -----
 
-#gpu_target_gather = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_gather = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -103,7 +103,7 @@ func.func @gather(%source: tensor<64x512xf32>, %indices: tensor<64xi32>, %init:
 // Test: Skip coalesced DMA when innermost dimension < subgroup size. This is to ensure we do not go down
 // the slow path (which is not implemented yet).
 
-#gpu_target_small_inner = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_small_inner = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -140,7 +140,7 @@ func.func @copy_small_innermost_dim(%source: tensor<64x32xf32>, %init: tensor<64
 // - Instead, we should tile rows to 16 (64/4) and keep columns whole (128)
 // This ensures subviews are contiguous in memory.
 
-#gpu_target_contiguous = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_contiguous = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -200,7 +200,7 @@ func.func @copy_prefer_contiguous_subview(%source: tensor<64x128xf32>, %init: te
 // When output comes from tensor.empty(), we can use total elements instead of
 // innermost dimension for the size check, enabling coalesced DMA.
 
-#gpu_target_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_linearize = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -260,7 +260,7 @@ func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tenso
 // Test: 1D tensor copy distributes warps across the single dimension.
 // This tests the 1D tile size computation logic for flattened copies.
 
-#gpu_target_1d = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_1d = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -322,7 +322,7 @@ func.func @copy_1d_tensor(%source: tensor<2048xf32>) -> tensor<2048xf32>
 // 1. Innermost dim (16) < minElementsPerTransfer (64)
 // 2. Output is a function argument, not tensor.empty, so we can't linearize
 
-#gpu_target_no_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_no_linearize = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -359,7 +359,7 @@ func.func @copy_small_innermost_no_linearize(%source: tensor<128x16xf32>, %dest:
 // The copy should be converted to coalesced DMA when the input comes from an
 // extract_slice with contiguous innermost dimensions.
 
-#gpu_target_extract_input = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_extract_input = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -415,7 +415,7 @@ func.func @copy_with_extract_slice_input(%large_source: tensor<256x128xf32>) ->
 // When linalg.copy reads from tensor.pad, trace through to the original source
 // and set in_bounds attribute based on padding.
 
-#gpu_target_pad = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_pad = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -431,24 +431,24 @@ func.func @copy_with_extract_slice_input(%large_source: tensor<256x128xf32>) ->
 // CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4x64xf32>
 func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tensor<4x64xf32>, %off: index, %sz: index, %high: index) -> tensor<4x64xf32>
   attributes {hal.executable.target = #exec_target_pad, translation_info = #translation_pad} {
-  // Extract a dynamic slice
+  // Extract a dynamic slice.
   %extracted = tensor.extract_slice %source[%off, 0] [%sz, 64] [1, 1]
     : tensor<121x64xf32> to tensor<?x64xf32>
 
-  // Pad to static size (only M dimension has padding)
+  // Pad to static size (only M dimension has padding).
   %cst = arith.constant 0.0 : f32
   %padded = tensor.pad %extracted low[0, 0] high[%high, 0] {
   ^bb0(%arg0: index, %arg1: index):
     tensor.yield %cst : f32
   } : tensor<?x64xf32> to tensor<4x64xf32>
 
-  // Copy from padded tensor
+  // Copy from padded tensor.
   %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
     ins(%padded : tensor<4x64xf32>)
     outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32>
 
-  // Key check: tensor.pad is fused - source is the extract_slice result, not the padded tensor
-  // in_bounds = [false, true] because M dim has dynamic padding, K dim has no padding
+  // Key check: tensor.pad is fused - source is the extract_slice result, not the padded tensor.
+  // in_bounds = [false, true] because M dim has dynamic padding, K dim has no padding.
   // CHECK: %[[EXTRACTED:.+]] = tensor.extract_slice %[[SRC]]
   // CHECK: scf.forall {{.*}} shared_outs(%[[OUTER_INIT:.+]] = %[[INIT]])
   // CHECK: scf.forall (%[[LANE:.+]]) in (64) shared_outs(%[[INNER_INIT:.+]] = %[[OUTER_INIT]])
@@ -468,7 +468,7 @@ func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tenso
 // operates on the full padded buffer shape, not on smaller subviews.
 // This is critical for correct delinearization in the lowering pass.
 
-#gpu_target_pad_multi_warp = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_pad_multi_warp = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
@@ -484,18 +484,18 @@ func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tenso
 // CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: tensor<4x64xf32>
 func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %init: tensor<4x64xf32>, %off: index, %sz: index, %high: index) -> tensor<4x64xf32>
   attributes {hal.executable.target = #exec_target_pad_multi_warp, translation_info = #translation_pad_multi_warp} {
-  // Extract a dynamic slice
+  // Extract a dynamic slice.
   %extracted = tensor.extract_slice %source[%off, 0] [%sz, 64] [1, 1]
     : tensor<121x64xf32> to tensor<?x64xf32>
 
-  // Pad to static size (only M dimension has padding)
+  // Pad to static size (only M dimension has padding).
   %cst = arith.constant 0.0 : f32
   %padded = tensor.pad %extracted low[0, 0] high[%high, 0] {
   ^bb0(%arg0: index, %arg1: index):
     tensor.yield %cst : f32
   } : tensor<?x64xf32> to tensor<4x64xf32>
 
-  // Copy from padded tensor with 4 warps (256/64=4)
+  // Copy from padded tensor with 4 warps (256/64=4).
   %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
     ins(%padded : tensor<4x64xf32>)
     outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32>
@@ -534,7 +534,7 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
 // If a DWORD is partially out-of-bounds, the entire DWORD returns zero,
 // causing incorrect results. We bail out to avoid the slow path.
 
-#gpu_target_pad_unaligned = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+#gpu_target_pad_unaligned = #iree_gpu.target<arch = "gfx950", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
   max_load_instruction_bits = 128, subgroup_size_choices = [64],
   max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp

Lines changed: 4 additions & 5 deletions
@@ -218,7 +218,7 @@ void CoalescedGatherDMAOp::getEffects(
   Value source = getSource();
   Value init = getInit();
 
-  // The operation reads from the source
+  // The operation reads from the source.
   if (isa<MemRefType>(source.getType())) {
     effects.emplace_back(MemoryEffects::Read::get(),
                          &getOperation()->getOpOperand(sourceOperandIdx),
@@ -235,7 +235,7 @@
                          SideEffects::DefaultResource::get());
   } else if (isa<RankedTensorType>(init.getType()) &&
              getOperation()->getNumResults() == 0) {
-    // Tensor combiner case: declare write effect to prevent DCE
+    // Tensor combiner case: declare write effect to prevent DCE.
     effects.emplace_back(MemoryEffects::Write::get(),
                          &getOperation()->getOpOperand(initOperandIdx),
                          SideEffects::DefaultResource::get());
@@ -339,9 +339,8 @@ LogicalResult CoalescedGatherDMAOp::verify() {
   }
 
   // If in_bounds is present and this dimension allows OOB (in_bounds=false),
-  // skip the size matching check. For non-outermost dimensions, the lowering
-  // adds explicit bounds checks since raw buffer OOB only provides 1D
-  // (linear) clamping, not per-dimension clamping.
+  // skip the size matching check. The source may be smaller than init along
+  // this dimension, and reads beyond the source extent return zero.
   if (inBoundsAttr) {
     auto inBoundsArray = *inBoundsAttr;
     if (dim < inBoundsArray.size()) {

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.td

Lines changed: 13 additions & 0 deletions
@@ -300,6 +300,19 @@ def IREEGPU_CoalescedGatherDMAOp : Op<IREEGPU_Dialect, "coalesced_gather_dma", [
     * `lane`: The lane that specifies the coalescing store's offset within the
       workgroup/shared memory.
 
+    ## In-Bounds Attribute
+
+    The optional `in_bounds` attribute is a boolean array with one entry per
+    dimension of `init`. When not present, all dimensions are treated as
+    in-bounds (source and init must have matching sizes for non-indexed dims).
+
+    When present, `in_bounds[i] = false` indicates that the source may be
+    smaller than init along dimension `i`. Reads beyond the source extent
+    return zero (padding semantics). This enables fusion of `tensor.pad`
+    with zero padding into the DMA operation.
+
+    `in_bounds[i] = true` means the source and init sizes match along that
+    dimension, and no padding is needed.
 
     ## Example of a single subgroup using coalesced_gather_dma in copy mode
     for transferring tensor<4x128xf32>, with an intended DMA width of 128 bits
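
As a rough companion to the in_bounds documentation added above, the shape rule it implies can be sketched as a plain C++ check. This is a hypothetical helper, not the op's verifier, and it additionally assumes the source never exceeds init along an out-of-bounds dimension, which the documentation does not state explicitly.

#include <cstdint>
#include <vector>

// Sketch of the documented in_bounds shape rule: in_bounds[i] == true means
// source and init extents match on dimension i; false means the source may be
// smaller, with reads past its extent returning zero (padding semantics).
static bool shapesSatisfyInBounds(const std::vector<int64_t> &sourceShape,
                                  const std::vector<int64_t> &initShape,
                                  const std::vector<bool> &inBounds) {
  if (sourceShape.size() != initShape.size() ||
      inBounds.size() != initShape.size())
    return false;
  for (size_t i = 0; i < initShape.size(); ++i) {
    if (inBounds[i] && sourceShape[i] != initShape[i])
      return false;
    // Assumption: with in_bounds[i] == false the source is at most as large
    // as init along that dimension.
    if (!inBounds[i] && sourceShape[i] > initShape[i])
      return false;
  }
  return true;
}

For the new lowering test in this commit, sourceShape = {64, 62}, initShape = {64, 64}, and in_bounds = [true, false] satisfy this rule.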
