@@ -1164,3 +1164,63 @@ func.func @lower_coalesced_dma_4x64_tensor_pad_fusion(
11641164 } {mapping = [#gpu.thread <linear_dim_0 >]}
11651165 return
11661166}

// -----

// Test: Non-outermost dimension padding with in_bounds = [false, false].
// Source: 4x6, dest: 4x8. Dim 1 has padding (6 -> 8).
// Raw buffer OOB is linear/1D, so for non-outermost dim OOB, we must
// replace the outermost index with sourceShape[0] to force hardware OOB.
//
// Without the fix: reading at [0, 6] computes a byte offset within the
// buffer and wraps to [1, 0] instead of returning 0.
// With the fix: when srcIndices[1] >= 6, srcIndices[0] is replaced with 4
// (source dim 0 size), guaranteeing the linear offset >= buffer size, so the
// hardware returns 0.
// Executable target for gfx950 with 32-bit DMA support (dma_sizes = [32]);
// spurious extraction spaces removed so the string literals ("rocm",
// "rocm-hsaco-fb", "gfx950") and attribute syntax are valid MLIR again.
#executable_target_rocm_hsaco_fb_pad = #hal.executable.target<"rocm",
  "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<
    arch = "gfx950", features = "", wgp = <
      compute = fp32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [32, 32],
      max_workgroup_sizes = [1024, 1024, 1024],
      max_thread_count_per_workgroup = 1024,
      max_workgroup_memory_bytes = 65536,
      max_workgroup_counts = [2147483647, 2147483647, 2147483647],
      max_load_instruction_bits = 128, simds_per_wgp = 4,
      vgpr_space_bits = 8192, dma_sizes = [32]>>}>

// Translation info: TileAndFuse pipeline, one subgroup of 32 lanes.
#translation_32_pad = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 32>
1192+
// CHECK-LABEL: func.func @gather_dma_non_outermost_oob_check
// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]: memref<4x6xf32, #amdgpu.address_space<fat_raw_buffer>>
// CHECK-SAME:    %[[DST:[a-zA-Z0-9]+]]: memref<4x8xf32, #gpu.address_space<workgroup>>
func.func @gather_dma_non_outermost_oob_check(
    %source: memref<4x6xf32, #amdgpu.address_space<fat_raw_buffer>>,
    %dest: memref<4x8xf32, #gpu.address_space<workgroup>>)
  attributes {
    hal.executable.target = #executable_target_rocm_hsaco_fb_pad,
    translation_info = #translation_32_pad} {
  // CHECK: scf.forall (%[[LANE_ID:[a-zA-Z0-9]+]]) in (32)
  scf.forall (%arg6) in (32) {
    // CHECK: %[[C1:[a-zA-Z0-9_]+]] = arith.constant 1
    // CHECK: %[[LANE_OFFSET:[a-zA-Z0-9_]+]] = arith.muli %[[LANE_ID]], %[[C1]]
    //
    // Transfer 1: linearOffset = 0.
    // CHECK: %[[C0:.+]] = arith.constant 0 : index
    // CHECK: %[[SRC_LIN0:.+]] = arith.addi %[[C0]], %[[LANE_OFFSET]]
    // CHECK: %[[SRC_DELIN0:.+]]:2 = affine.delinearize_index %[[SRC_LIN0]] into (4, 8)
    // CHECK: %[[DST_DELIN0:.+]]:2 = affine.delinearize_index %[[C0]] into (4, 8)
    //
    // Bounds check: compare srcIndices[1] >= 6 (source dim 1 size).
    // CHECK: %[[C6:.+]] = arith.constant 6 : index
    // CHECK: %[[OOB:.+]] = arith.cmpi uge, %[[SRC_DELIN0]]#1, %[[C6]] : index
    // Replace the outermost index with 4 (source dim 0 size) to force a
    // hardware out-of-bounds access on the linear raw-buffer offset.
    // CHECK: %[[C4_OOB:.+]] = arith.constant 4 : index
    // CHECK: %[[FIXED_IDX:.+]] = arith.select %[[OOB]], %[[C4_OOB]], %[[SRC_DELIN0]]#0 : index
    // CHECK: amdgpu.gather_to_lds %[[SRC]][%[[FIXED_IDX]], %[[SRC_DELIN0]]#1], %[[DST]][%[[DST_DELIN0]]#0, %[[DST_DELIN0]]#1] : vector<1xf32>
    // CHECK-NOT: iree_gpu.coalesced_gather_dma
    iree_gpu.coalesced_gather_dma %source into %dest lane(%arg6) in_bounds [false, false] :
        memref<4x6xf32, #amdgpu.address_space<fat_raw_buffer>>,
        memref<4x8xf32, #gpu.address_space<workgroup>>, index
  } {mapping = [#gpu.thread<linear_dim_0>]}
  return
}
0 commit comments