@@ -526,3 +526,53 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
 
   return %result : tensor<4x64xf32>
 }
+
+// -----
+
+// Test: tensor.pad fusion bails out when the source row size is not DWORD-aligned.
+// On AMD CDNA, buffer loads perform per-component range checking for each DWORD.
+// If a DWORD is partially out-of-bounds, the entire DWORD returns zero,
+// producing incorrect results, so we bail out of the DMA rewrite instead.
+
+#gpu_target_pad_unaligned = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+    compute = fp32, storage = b32, subgroup = shuffle,
+    max_load_instruction_bits = 128, subgroup_size_choices = [64],
+    max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+    max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+    dma_sizes = [32, 128]
+>>
+
+#exec_target_pad_unaligned = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_pad_unaligned}>
+#translation_pad_unaligned = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_num_stages = 2, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>
+
+// CHECK-LABEL: func.func @copy_with_tensor_pad_unaligned_row
+// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]: tensor<65x121xf16>
+// CHECK-SAME:    %[[INIT:[a-zA-Z0-9]+]]: tensor<4x124xf16>
+func.func @copy_with_tensor_pad_unaligned_row(%source: tensor<65x121xf16>, %init: tensor<4x124xf16>, %off: index, %sz: index, %high_m: index) -> tensor<4x124xf16>
+    attributes {hal.executable.target = #exec_target_pad_unaligned, translation_info = #translation_pad_unaligned} {
+  // Extract a dynamic slice: tensor<?x121xf16>.
+  // Row size = 121 * 2 bytes = 242 bytes, NOT 4-byte aligned.
+  %extracted = tensor.extract_slice %source[%off, 0] [%sz, 121] [1, 1]
+      : tensor<65x121xf16> to tensor<?x121xf16>
+
+  // Pad to a static size.
+  %cst = arith.constant 0.0 : f16
+  %padded = tensor.pad %extracted low[0, 0] high[%high_m, 3] {
+  ^bb0(%arg0: index, %arg1: index):
+    tensor.yield %cst : f16
+  } : tensor<?x121xf16> to tensor<4x124xf16>
+
+  // Copy from the padded tensor.
+  %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+      ins(%padded : tensor<4x124xf16>)
+      outs(%init : tensor<4x124xf16>) -> tensor<4x124xf16>
+
+  // Source row size (121 * 2 = 242 bytes) is not DWORD-aligned.
+  // Coalesced DMA bails out to avoid partial OOB in per-DWORD range checking.
+  // The linalg.copy should remain unchanged.
+  // CHECK:     tensor.pad
+  // CHECK:     linalg.copy
+  // CHECK-NOT: iree_gpu.coalesced_gather_dma
+
+  return %result : tensor<4x124xf16>
+}
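
For reference, the bail-out condition this test exercises reduces to a row-size alignment check: the coalesced-DMA rewrite is only safe when every source row spans whole DWORDs. The standalone C++ sketch below illustrates that arithmetic; it is not the actual IREE pass code, and the helper name and constant are illustrative assumptions.

```cpp
#include <cstdint>
#include <iostream>

// Illustrative sketch, not the IREE pass implementation: CDNA buffer loads
// range-check per DWORD and zero out a DWORD that is only partially in
// bounds, so the DMA rewrite must require each source row to be a whole
// number of DWORDs.
constexpr int64_t kDwordBytes = 4;

bool rowIsDwordAligned(int64_t rowElements, int64_t elementBytes) {
  return (rowElements * elementBytes) % kDwordBytes == 0;
}

int main() {
  // This test: 121 f16 elements -> 242 bytes, not DWORD-aligned -> bail out.
  std::cout << rowIsDwordAligned(121, 2) << "\n";  // prints 0
  // The earlier f32 test: 64 f32 elements -> 256 bytes, aligned -> can fuse.
  std::cout << rowIsDwordAligned(64, 4) << "\n";   // prints 1
  return 0;
}
```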